### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

### Data Cleansing

In [None]:
# Load the training set; a literal '?' in any cell is read as a missing value (NaN).
training_data = pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv', na_values='?')
# Display the loaded frame as the cell output.
training_data

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
training_data.info()

In [None]:
# Show the dtype of every column.
training_data.dtypes

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
training_data.describe()

### Data Correlation

In [None]:
# Pairwise correlation of the numeric columns.
# The frame still holds the categorical (object) columns at this point; pandas >= 2.0
# raises on them in DataFrame.corr(), while older versions dropped them silently.
# Selecting numeric dtypes first gives the same matrix on every version.
training_data.select_dtypes(include='number').corr()

In [None]:
# Large canvas so the annotated cells stay readable.
plt.figure(figsize=(20, 15))
# Correlate numeric columns only: DataFrame.corr() raises on object columns in
# pandas >= 2.0, whereas older versions ignored them silently.
sns.heatmap(training_data.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Flatten the correlation matrix into a Series indexed by (column, column) pairs.
# Numeric columns are selected first because DataFrame.corr() raises on object
# columns in pandas >= 2.0 (older versions dropped them silently).
training_data.select_dtypes(include='number').corr().unstack()

In [None]:
# Pairwise correlations as a Series indexed by (column, column), sorted ascending.
# Numeric columns are selected first because DataFrame.corr() raises on object
# columns in pandas >= 2.0 (older versions dropped them silently).
feature_corr = training_data.select_dtypes(include='number').corr().unstack().sort_values()
feature_corr

In [None]:
# Pairs whose correlation is strictly between 0.7 and 1 (positive side only);
# the upper bound leaves out entries equal to 1, such as a column with itself.
feature_corr[(feature_corr>0.7)&(feature_corr<1)]

In [None]:
# Show highly correlated feature pairs: 0.7 < |corr| < 1.
corr_magnitude = feature_corr.abs()
strong_pairs = feature_corr[(corr_magnitude > 0.7) & (corr_magnitude < 1)]
# drop_duplicates works on the correlation values, which collapses the mirrored
# (a, b) / (b, a) entries; distinct pairs sharing an identical value collapse too.
print(strong_pairs.drop_duplicates())

### Features Distribution

In [None]:
# Remove the 'id' column from the training frame.
training_data = training_data.drop('id', axis=1)

In [None]:
# Column labels whose dtype is not 'object'.
training_data.columns[training_data.dtypes!='object']

In [None]:
# Columns whose dtype is not 'object' are the ones plotted below.
num_feature = training_data.columns[training_data.dtypes != 'object']


def my_plot(feature):
    """Show a histogram of ``training_data[feature]`` titled with the column name."""
    column_values = training_data[feature]
    plt.hist(column_values)
    plt.title(feature)
    plt.show()


# One histogram per selected column.
for i in num_feature:
    my_plot(i)

In [None]:
# Skewness of each numeric column.
# numeric_only=True is explicit because the frame still contains object columns:
# pandas >= 2.0 raises on them, while older versions skipped them silently.
training_data.skew(numeric_only=True)

In [None]:
# Work on a copy so the transforms applied to training_data2 leave training_data untouched.
training_data2 = training_data.copy()


In [None]:
# Sub-frame holding the columns chosen for the log transform; the loop that
# applies the transform only reads its column names.
features_to_transform = training_data2.loc[:,['cont0','cont4','cont5','cont6','cont7','cont8']]
 


In [None]:

def features_log(feature):
    """Replace ``training_data2[feature]`` in place with ``log(value + 1)``."""
    shifted = training_data2[feature] + 1
    training_data2[feature] = np.log(shifted)


# Transform every column listed in features_to_transform.
for i in features_to_transform.columns:
    features_log(i)

In [None]:
# Columns of the transformed frame whose dtype is not 'object'.
num_feature2 = training_data2.columns[training_data2.dtypes != 'object']


def my_plot2(feature):
    """Show a histogram of ``training_data2[feature]`` titled with the column name."""
    column_values = training_data2[feature]
    plt.hist(column_values)
    plt.title(feature)
    plt.show()


# One histogram per selected column, to compare against the pre-transform plots.
for i in num_feature2:
    my_plot2(i)

In [None]:
# Skewness after the log transform.
# numeric_only=True is explicit because the frame still contains object columns:
# pandas >= 2.0 raises on them, while older versions skipped them silently.
training_data2.skew(numeric_only=True)

In [None]:
# Take the square root of these three columns in one vectorised assignment.
sqrt_columns = ['cont4', 'cont6', 'cont7']
training_data2[sqrt_columns] = np.sqrt(training_data2[sqrt_columns])



In [None]:
# Skewness after the additional square-root transform.
# numeric_only=True is explicit because the frame still contains object columns:
# pandas >= 2.0 raises on them, while older versions skipped them silently.
training_data2.skew(numeric_only=True)

### Categorical Features Transformation

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each, so k categories become k-1 indicator columns.
training_data_clear = pd.get_dummies(training_data2, drop_first=True)

In [None]:
# Preview the first rows of the encoded frame.
training_data_clear.head()

In [None]:
# Column names, dtypes and non-null counts after encoding.
training_data_clear.info()

### Divide Data into Train & Test

In [None]:
# Feature matrix: every column except the label.
x = training_data_clear.drop(columns='target')
# Label kept as a one-column DataFrame (2-D) rather than a Series.
y = training_data_clear['target'].to_frame()

In [None]:
# Preview the first rows of the feature matrix.
x.head()

In [None]:
# Preview the first rows of the label frame.
y.head()

In [None]:
# Hold out 10% of the rows for evaluation; random_state=0 makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)

In [None]:
# (rows, columns) of the training features, followed by a preview of the first rows.
print(x_train.shape)
x_train.head()

In [None]:
# (rows, columns) of the training labels, followed by a preview of the first rows.
print(y_train.shape)
y_train.head()

In [None]:
# Shapes of the held-out features and labels.
x_test.shape, y_test.shape

### Numerical Features Scaling

In [None]:
# Two common rescalings:
#   Normalization (min-max): (X - X.min()) / (X.max() - X.min())
#   Standardization:         (X - X.mean()) / X.std()
# MinMaxScaler is used here; both scalers are fitted on the training split only.

scaler_x = MinMaxScaler().fit(x_train)
scaler_y = MinMaxScaler().fit(y_train)

In [None]:
# Scale both feature splits with the scaler fitted on x_train.
x_train_sc = scaler_x.transform(x_train)
x_test_sc = scaler_x.transform(x_test)

In [None]:
# Scale both label splits with the scaler fitted on y_train.
y_train_sc = scaler_y.transform(y_train)
y_test_sc = scaler_y.transform(y_test)

In [None]:
# Column labels of the training features.
x_train.columns

In [None]:
# Training labels before scaling.
y_train

In [None]:
# Training labels after scaling.
y_train_sc

### Apply Linear Regression

In [None]:
# Fit a linear regression on the scaled training split, then predict the
# scaled targets of the held-out split.
lr_model = LinearRegression().fit(x_train_sc, y_train_sc)
y_pred_sc = lr_model.predict(x_test_sc)

In [None]:
# Scaled ground-truth targets of the held-out split.
y_test_sc

In [None]:
# Scaled predictions for the held-out split.
y_pred_sc

In [None]:
# Error metrics on the scaled targets (so they are in scaled units).
mae = mean_absolute_error(y_test_sc, y_pred_sc)
rmse = np.sqrt(mean_squared_error(y_test_sc, y_pred_sc))

# Built-in round() works for both NumPy scalars and plain Python floats, whereas
# the .round method exists only on NumPy scalars; the metric's return type is
# not guaranteed to be a NumPy scalar across scikit-learn versions.
print('MAE = ', round(mae, 4))
print('RMSE = ', round(rmse, 4))

In [None]:
# Map the scaled targets and predictions back to the original target units;
# reshape(-1, 1) gives the single-column 2-D layout passed to the scaler.
y_test_inv = scaler_y.inverse_transform(y_test_sc.reshape(-1, 1))
y_pred_inv = scaler_y.inverse_transform(y_pred_sc.reshape(-1, 1))

actual_mae = mean_absolute_error(y_test_inv, y_pred_inv)
actual_rmse = np.sqrt(mean_squared_error(y_test_inv, y_pred_inv))

# Report with decimals: int() truncated toward zero, so any error below 1.0
# was printed as 0 and differences between models became invisible.
print('Actual MAE = ', round(actual_mae, 4))
print('Actual RMSE = ', round(actual_rmse, 4))

In [None]:
# Held-out targets after inverse scaling.
y_test_inv

In [None]:
# Held-out predictions after inverse scaling.
y_pred_inv

In [None]:
# Load the test set with the same missing-value marker ('?') as the training set.
testing_data = pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv', na_values='?')
# Display the loaded frame as the cell output.
testing_data

In [None]:
# Feature frame for the test set: everything except 'id'. testing_data itself
# keeps the column, which is read again when the submission is built.
testing_data_clear = testing_data.drop('id', axis=1)
testing_data_clear

In [None]:
# Skewness of each numeric test column before any transform.
# numeric_only=True is explicit because the frame still contains object columns:
# pandas >= 2.0 raises on them, while older versions skipped them silently.
testing_data_clear.skew(numeric_only=True)

In [None]:
# Same column list as used for the training log transform; the loop that
# applies the transform only reads the column names of this sub-frame.
testing_features_to_transform = testing_data_clear.loc[:,['cont0','cont4','cont5','cont6','cont7','cont8']]


In [None]:
def features_log(feature):
    """Replace ``testing_data_clear[feature]`` in place with ``log(value + 1)``.

    NOTE: this rebinds the name ``features_log``; from this cell on it operates
    on the test frame rather than on the training frame.
    """
    testing_data_clear[feature] = np.log(testing_data_clear[feature] + 1)


for i in testing_features_to_transform.columns:
    features_log(i)

# Inspect the frame that was just transformed (not the untouched ``testing_data``,
# whose skew is unaffected by the loop above). numeric_only=True is explicit
# because the frame still holds object columns, which pandas >= 2.0 rejects.
testing_data_clear.skew(numeric_only=True)

In [None]:
# Same square-root step as applied to the training frame.
testing_data_clear['cont4'] = np.sqrt(testing_data_clear['cont4'])
testing_data_clear['cont6'] = np.sqrt(testing_data_clear['cont6'])
testing_data_clear['cont7'] = np.sqrt(testing_data_clear['cont7'])
# numeric_only=True is explicit because the frame still contains object columns:
# pandas >= 2.0 raises on them, while older versions skipped them silently.
testing_data_clear.skew(numeric_only=True)

In [None]:
# One-hot encode the test frame with the same settings as the training frame.
# Indicator columns are created only for levels present in this frame.
testing_data_clear = pd.get_dummies(testing_data_clear, drop_first=True)

In [None]:
# Preview the first rows of the encoded test frame.
testing_data_clear.head()

In [None]:
# Align the test frame with the columns the scaler and model were fitted on.
# reindex adds any indicator column missing from the test set as all zeros
# (previously 'cat6_G' was inserted by hand at position 30), drops columns the
# model never saw, and enforces the training column order. Unlike insert(), it
# does not depend on a hard-coded position and is safe to re-run.
testing_data_clear = testing_data_clear.reindex(columns=x_train.columns, fill_value=0)

In [None]:
# Column labels of the prepared test frame.
testing_data_clear.columns

In [None]:
# Scale the test features with the scaler fitted on the training split.
testing_data_scaled =scaler_x.transform(testing_data_clear)

In [None]:
# Predict scaled targets for the test set with the fitted linear model.
testing_data_pred_sc = lr_model.predict(testing_data_scaled)

In [None]:
# Convert the scaled predictions back to target units; reshape(-1, 1) gives the
# single-column 2-D layout passed to the scaler.
testing_data_pred_inv = scaler_y.inverse_transform(testing_data_pred_sc.reshape(-1,1))


In [None]:
# Test-set predictions in original target units.
testing_data_pred_inv

In [None]:
# Build the submission: one row per test id with its predicted target.
# ravel() flattens the (n, 1) prediction array into the 1-D shape a column expects.
output = pd.DataFrame({
    'id': testing_data['id'],
    'target': testing_data_pred_inv.ravel(),
})
# Write once, without the positional index. The earlier extra to_csv call wrote
# a stray index column and was immediately overwritten by the second write.
output.to_csv('submission.csv', index=False)
