### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns


from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

### Reading data

In [None]:
# Load the competition training set and preview its first rows.
train = pd.read_csv('../input/tabular-playground-series-feb-2021/train.csv')
train.head()

In [None]:
# Test dataset: load it and preview its first rows.
test = pd.read_csv('../input/tabular-playground-series-feb-2021/test.csv')
test.head()

In [None]:
# Sample submission file: load it and preview its first rows.
submit = pd.read_csv('../input/tabular-playground-series-feb-2021/sample_submission.csv')
submit.head()

In [None]:
# Summary of the training frame: column dtypes and non-null counts.
train.info()

In [None]:
# Summary of the test frame: column dtypes and non-null counts.
test.info()

In [None]:
# Summary of the sample submission frame: column dtypes and non-null counts.
submit.info()

In [None]:
# Number of training rows that exactly repeat an earlier row.
train.duplicated().sum()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) of the training set.
train.describe()

### Data Correlation

In [None]:
# Pairwise correlation of the numeric columns only. The object-typed `cat*`
# columns are filtered out explicitly: recent pandas releases no longer drop
# non-numeric columns silently in `DataFrame.corr()` and raise instead.
train.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# Non-numeric columns are removed first so that `corr()` does not fail on the
# object-typed categorical columns under recent pandas versions.
plt.figure(figsize=(20,15))
sns.heatmap(train.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Flatten the numeric correlation matrix into a Series indexed by
# (column, column) pairs and sort it in ascending order. Non-numeric columns
# are excluded up front because recent pandas versions raise when `corr()`
# meets object-typed columns instead of skipping them.
feature_corr = train.select_dtypes(include='number').corr().unstack().sort_values()
feature_corr

In [None]:
# Pairs with a strong positive correlation: 0.6 < r < 1 (r == 1 is excluded).
feature_corr[feature_corr.gt(0.6) & feature_corr.lt(1)]

In [None]:
# Show highly correlated feature pairs: 0.6 < |r| < 1.
# drop_duplicates() keeps one entry per distinct correlation value, which
# collapses the mirrored (a, b) / (b, a) entries of the unstacked matrix.
# NOTE(review): it would also drop unrelated pairs that happen to share the
# exact same value.
print(
    feature_corr[feature_corr.abs().gt(0.6) & feature_corr.abs().lt(1)]
    .drop_duplicates()
)

In [None]:
# Same selection as above (0.6 < |r| < 1, repeated values dropped), kept as a
# DataFrame; its index holds the (column, column) pairs.
high_corr_df = pd.DataFrame(
    feature_corr[feature_corr.abs().gt(0.6) & feature_corr.abs().lt(1)].drop_duplicates()
)
high_corr_df.index

### Features Distribution

In [None]:
# Every column whose dtype is not `object` is treated as a numerical feature.
num_feature = train.columns[train.dtypes != 'object']


def my_plot(feature):
    """Show a histogram of ``train[feature]`` titled with the column name."""
    values = train[feature]
    plt.hist(values)
    plt.title(feature)
    plt.show()


# One histogram per numerical column.
for i in num_feature:
    my_plot(i)

### Categorical Features Transformation

In [None]:
# Object-typed (categorical) columns of the training set.
train_cat = train.select_dtypes(include=['object'])
train_cat 

In [None]:
# One-hot encode the categorical columns only; drop_first=True omits the
# first level of each feature. This is a preview: `train_clear` is reassigned
# from the full training frame in the following cell.
train_clear = pd.get_dummies(train_cat, drop_first=True)
train_clear

In [None]:
# One-hot encode the whole training frame: object columns become dummy
# columns (first level dropped), the remaining columns pass through unchanged.
train_clear = pd.get_dummies(train, drop_first=True)
train_clear

In [None]:
# Column names of the encoded training frame.
train_clear.columns

In [None]:
# Distinct values of `cat6` in the training set.
train['cat6'].unique()

In [None]:
# Summary of the encoded training frame: column dtypes and non-null counts.
train_clear.info()

In [None]:
# Object-typed (categorical) columns of the test set.
test_cat = test.select_dtypes(include=['object'])
test_cat 

In [None]:
# One-hot encode the whole test frame the same way as the training frame
# (object columns become dummy columns, first level dropped).
test_clear = pd.get_dummies(test, drop_first=True)
test_clear

In [None]:
# Summary of the encoded test frame: column dtypes and non-null counts.
test_clear.info()

In [None]:
# Column names of the encoded test frame.
test_clear.columns

In [None]:
# Distinct values of `cat6` in the test set.
test['cat6'].unique()

In [None]:
# Keep the encoded training columns in line with the encoded test set: every
# column that exists only in train is removed, so the model is not trained on
# a feature that cannot be built for the test rows. The original cell
# hard-coded `cat6_G` here. `target` is train-only by design and is kept.
# Deriving the list from the two frames also makes the cell safe to re-run,
# whereas dropping a fixed name raises KeyError the second time.
train_only_cols = [
    col for col in train_clear.columns
    if col not in test_clear.columns and col != 'target'
]
train_clear = train_clear.drop(train_only_cols, axis=1)
train_clear

### Divide Data into Train & Test

In [None]:
# Features: every encoded column except the label.
# NOTE(review): an identifier column, if present in train_clear (the test
# frame has `id`), stays in x as a feature. Confirm that is intended.
x = train_clear.drop(columns='target')
# Label kept as a single-column DataFrame (2-D) rather than a Series.
y = train_clear[['target']]

In [None]:
# Inspect the feature frame.
x

In [None]:
# Inspect the label frame.
y

In [None]:
# Hold out 30% of the rows for validation; random_state=42 fixes the shuffle
# so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# (rows, columns) of the training split: features, then label.
x_train.shape, y_train.shape

In [None]:
# (rows, columns) of the held-out split: features, then label.
x_test.shape, y_test.shape

### Numerical Features Scaling

In [None]:
# Normalization (used here): (X - X.min()) / (X.max() - X.min())
# Standardization (alternative): (X - X.mean()) / X.std()
# Both scalers are fitted on the training split only; the held-out split is
# transformed with these fitted scalers in the following cells.

scaler_x = MinMaxScaler().fit(x_train)
scaler_y = MinMaxScaler().fit(y_train)

In [None]:
# Scale the features of both splits with the scaler fitted on x_train.
x_train_sc = scaler_x.transform(x_train)
x_test_sc = scaler_x.transform(x_test)

In [None]:
# Scale the label of both splits with the scaler fitted on y_train.
y_train_sc = scaler_y.transform(y_train)
y_test_sc = scaler_y.transform(y_test)

In [None]:
# Scale the competition test features with the scaler fitted on x_train.
# Fitting a second MinMaxScaler on the test set would rescale its columns with
# different min/max values than the ones the model was trained on, so the same
# raw value would map to a different scaled value at prediction time.
# `scaler_x_test` is kept as an alias so any later reference still resolves.
scaler_x_test = scaler_x
# Align the test columns with the training feature columns (same names, same
# order); a training column that is absent from the test frame is filled with 0.
x_tested_sc = scaler_x_test.transform(
    test_clear.reindex(columns=x_train.columns, fill_value=0)
)

### Apply Linear Regression

In [None]:
# Fit a linear regression on the scaled training split, then predict the
# scaled target for the held-out split.
lr_model = LinearRegression().fit(x_train_sc, y_train_sc)
y_pred_sc = lr_model.predict(x_test_sc)

In [None]:
# Errors on the held-out split, measured in the min-max scaled target space.
mae = mean_absolute_error(y_test_sc, y_pred_sc)
rmse = np.sqrt(mean_squared_error(y_test_sc, y_pred_sc))

# Built-in round() is used instead of the `.round()` method: the method exists
# only on NumPy scalars, and the metric may be returned as a plain Python
# float, which would raise AttributeError.
print('MAE = ', round(mae, 4))
print('RMSE = ', round(rmse, 4))

In [None]:
# Map the scaled labels and predictions back to the original target units
# before scoring, so the errors are directly interpretable.
y_test_inv = scaler_y.inverse_transform(y_test_sc.reshape(-1,1))
y_pred_inv = scaler_y.inverse_transform(y_pred_sc.reshape(-1,1))

actual_mae = mean_absolute_error(y_test_inv, y_pred_inv)
actual_rmse = np.sqrt(mean_squared_error(y_test_inv, y_pred_inv))

# Round instead of truncating with int(): int() discards the fractional part,
# so any error below 1 would be reported as 0. Four decimals matches the
# precision used for the scaled metrics.
print('Actual MAE = ', round(actual_mae, 4))
print('Actual RMSE = ', round(actual_rmse, 4))

In [None]:
# Predict the scaled target for the competition test set, then convert the
# predictions back to original target units. Note that `y_pred_sc` and
# `y_pred_inv` are reused here and no longer hold the held-out predictions.
y_pred_sc = lr_model.predict(x_tested_sc)
y_pred_inv = scaler_y.inverse_transform(y_pred_sc.reshape(-1,1))
y_pred_inv

### Final output

In [None]:
# Submission frame: one `target` prediction per test row, indexed by the
# test `id` column. ravel() flattens the (n, 1) prediction array to 1-D.
output = pd.DataFrame({'target': y_pred_inv.ravel()}, index=test_clear.id)
output

In [None]:
# Write the predictions to disk; the index is written as the first CSV column.
output.to_csv('./sample_submission_trial1.csv')

In [None]:
# Read the written file back and preview its first rows as a sanity check.
trial = pd.read_csv('./sample_submission_trial1.csv')
trial.head()