In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
#plt.style.use('seaborn') #ggplot

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the competition files: labelled training rows, unlabelled test rows,
# and the sample submission file.
training = pd.read_csv("../input/tabular-playground-series-feb-2021/train.csv")
test_data = pd.read_csv("../input/tabular-playground-series-feb-2021/test.csv")
sample_submission = pd.read_csv("../input/tabular-playground-series-feb-2021/sample_submission.csv")
# Last expression of the cell: renders the training frame as the cell output.
training

In [None]:
# Number of missing values in each column.
training.isna().sum()

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
training.info()

In [None]:
# Column labels of the training frame.
training.columns

In [None]:
# dtype of each column.
training.dtypes

In [None]:
# Summary statistics of the training frame.
training.describe()

In [None]:
training.corr()

In [None]:
plt.figure(figsize=(20,15))
sns.heatmap(training.corr(), annot=True)

In [None]:
training.corr().unstack()

In [None]:
feat_corr = training.corr().unstack().sort_values()
feat_corr

In [None]:
feat_corr[(feat_corr>0.7)&(feat_corr<1)]

In [None]:
print(feat_corr[(abs(feat_corr)>0.7) & (abs(feat_corr)<1)].drop_duplicates())

In [None]:
high_corr_df = pd.DataFrame(feat_corr[(abs(feat_corr)>0.7) & (abs(feat_corr)<1)].drop_duplicates())
high_corr_df.index

In [None]:
# Remove the 'id' column so it is not used as a feature.
# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone and drop() would otherwise raise KeyError.
training = training.drop(columns=['id'], errors='ignore')

In [None]:
# Labels of every column whose dtype is not `object`.
is_not_object = training.dtypes != 'object'
num_feature = training.columns[is_not_object]


def my_plot(feature):
    """Show a histogram of `training[feature]`, titled with the column name."""
    values = training[feature]
    plt.hist(values)
    plt.title(feature)
    plt.show()


# One histogram per non-object column.
for column_name in num_feature:
    my_plot(column_name)

In [None]:
# One-hot encode the non-numeric columns. drop_first=True omits the first level
# of each encoded column, so k levels become k-1 indicator columns.
training_clear = pd.get_dummies(training, drop_first=True)

In [None]:
# Feature matrix: every encoded column except the regression target.
x = training_clear.drop(columns=['target'])
# Target kept as a single-column DataFrame (2-D) rather than a Series.
y = training_clear['target'].to_frame()

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Only the last expression of a cell is rendered, so a bare `.head()` in the
# middle of the cell is evaluated and discarded. display() shows the previews.
print(x_train.shape)
display(x_train.head())
print(y_train.shape)
display(y_train.head())
print(x_test.shape, y_test.shape)

# Fit the scalers on the training split only, so no statistics from the
# held-out rows leak into preprocessing.
scaler_x = MinMaxScaler().fit(x_train)
scaler_y = MinMaxScaler().fit(y_train)

In [None]:
# Apply the scalers (fitted on the training split) to both splits.
x_train_sc, x_test_sc = (scaler_x.transform(part) for part in (x_train, x_test))
y_train_sc, y_test_sc = (scaler_y.transform(part) for part in (y_train, y_test))

In [None]:
# Fit ordinary least squares on the scaled training split; fit() returns the
# estimator itself, so construction and fitting chain into one statement.
lr_model = LinearRegression().fit(x_train_sc, y_train_sc)
y_pred_sc = lr_model.predict(x_test_sc)

# Print the scaled ground truth, then the scaled predictions.
for scaled_values in (y_test_sc, y_pred_sc):
    print(scaled_values)

In [None]:
# Hold-out errors measured on the min-max scaled target, not in original units.
mae = mean_absolute_error(y_test_sc, y_pred_sc)
rmse = np.sqrt(mean_squared_error(y_test_sc, y_pred_sc))

# Built-in round() accepts both NumPy scalars and plain Python floats. The
# `.round()` method exists only on NumPy scalars, so `mae.round(4)` fails when
# the metric is returned as a Python float.
print('MAE = ', round(mae, 4))
print('RMSE = ', round(rmse, 4))

In [None]:
# Map the scaled truth and predictions back to the target's original units.
y_test_inv, y_pred_inv = (
    scaler_y.inverse_transform(scaled.reshape(-1, 1))
    for scaled in (y_test_sc, y_pred_sc)
)

# Same two metrics as before, now in original units.
actual_mse = mean_squared_error(y_test_inv, y_pred_inv)
actual_mae = mean_absolute_error(y_test_inv, y_pred_inv)
actual_rmse = np.sqrt(actual_mse)

print('Actual MAE = ', float(actual_mae))
print('Actual RMSE = ', float(actual_rmse))

In [None]:
# Working copy of the test set without the 'id' column; `test_data` itself is
# left untouched so the ids are still available for the submission file.
test_data_clear = test_data.drop('id', axis=1)
test_data_clear

In [None]:
# Sub-frame holding the six continuous columns selected for a log transform.
test_features_to_transform = test_data_clear.loc[:,['cont0','cont4','cont5','cont6','cont7','cont8']]

In [None]:
# NOTE(review): no cell visible in this notebook applies the same log transform
# to `training` before the scalers and model are fitted - confirm that the test
# features are really meant to be transformed differently from the training ones.
# NOTE(review): the cell is not idempotent; running it twice applies the log twice.
def features_log(feature):
    """Overwrite `test_data_clear[feature]` in place with log(value + 1).

    Works on the notebook-level `test_data_clear` frame and returns nothing.
    """
    test_data_clear[feature] = np.log(test_data_clear[feature]+1)

# Transform every column listed in `test_features_to_transform`.
for i in test_features_to_transform.columns:
    features_log(i)

In [None]:
# Square-root transform on three of the columns that were log-transformed above,
# overwriting them in place (re-running the cell applies the root again).
# NOTE(review): no matching transform of `training` is visible in this notebook,
# and any value below 0 at this point would become NaN - confirm both.
test_data_clear['cont4'] = np.sqrt(test_data_clear['cont4'])
test_data_clear['cont6'] = np.sqrt(test_data_clear['cont6'])
test_data_clear['cont7'] = np.sqrt(test_data_clear['cont7'])

In [None]:
# One-hot encode the test set with the same call used for the training frame.
# The indicator columns depend on the levels present in the test data, so they
# are not guaranteed to match the training columns.
test_data_clear = pd.get_dummies(test_data_clear, drop_first=True)

In [None]:
# Give the test matrix exactly the columns (and column order) the model was
# trained on. Indicator columns whose level never occurs in the test data
# (e.g. 'cat6_G') are created and filled with 0. Unlike a hard-coded
# insert(loc=30, ...), this does not depend on a fixed position and can be
# re-run without raising "column already exists".
test_data_clear = test_data_clear.reindex(columns=x.columns, fill_value=0)

In [None]:
# Scale the test features with the scaler that was fitted on x_train.
test_data_scaled =scaler_x.transform(test_data_clear)

In [None]:
# The model was fitted on min-max scaled features, so it must be given the
# scaled test matrix. Predicting from the unscaled frame feeds it inputs on a
# different scale from the one it was trained on.
test = lr_model.predict(test_data_scaled)
test

In [None]:
# The model predicts the scaled target; map the predictions back to the
# original target units. reshape(-1, 1) makes the input a single column.
test_inv = scaler_y.inverse_transform(test.reshape(-1,1))
test_inv

In [None]:
# Build the submission table in one step: the original test ids next to the
# predictions in original units (flattened from one column to 1-D).
output = pd.DataFrame({
    'id': test_data['id'],
    'target': test_inv.ravel(),
})
output.to_csv('sample_submission200.csv', index=False)