#  JAN 2021

In [None]:
# Important libraries

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet
from lightgbm import LGBMRegressor
from xgboost.sklearn import XGBRegressor

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the training data from the competition's input directory.
train = pd.read_csv(filepath_or_buffer="../input/tabular-playground-series-jan-2021/train.csv")

In [None]:
# Preview the first three rows.
train.head(n=3)

In [None]:
# Summarise the frame's size. Two of the columns (the row id and the target)
# are not features, hence the "- 2".
n_rows, n_cols = train.shape
(f' The train file contains {n_rows} data rows and {n_cols - 2} features and one row id and a target column ')

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Number of missing values per column.
train.isnull().sum()

In [None]:
# Summary statistics, transposed so that each column of `train` becomes one row.
train.describe().transpose()

### EDA

In [None]:
# One row of diagnostics per continuous feature: distribution, box plot, Q-Q plot.
cont_cols = train.columns[train.columns.str.startswith('cont')]

# Size the grid from the data instead of hard-coding the number of features;
# squeeze=False keeps `ax` two-dimensional even when there is a single feature.
f, ax = plt.subplots(nrows=len(cont_cols), ncols=3,
                     figsize=(12, 2 * len(cont_cols)), squeeze=False)
for i, var in enumerate(cont_cols):
    # NOTE: distplot is deprecated in seaborn in favour of histplot/displot; it is kept
    # here so the figure looks the same as before.
    sns.distplot(train[var], ax=ax[i, 0])
    # Pass the series as x= explicitly: newer seaborn releases treat the first
    # positional argument as `data`, not `x`.
    sns.boxplot(x=train[var], ax=ax[i, 1])
    stats.probplot(train[var], plot=ax[i, 2])
plt.tight_layout()
plt.show()

The features have multimodal distributions,

and two features, i.e. cont7 and cont9, contain outliers.

In [None]:
# Correlation heatmap of every column except the row id.
f, ax = plt.subplots(figsize=(15,10))
sns.heatmap(train.loc[:, train.columns != 'id'].corr(), annot = True, ax=ax)

ax.set_title("Correlation Matrix", fontsize=16)
# Style the tick labels through tick_params: the per-tick `Tick.label` attribute used
# previously was deprecated and has been removed from recent matplotlib releases.
ax.tick_params(axis='x', labelsize=14, labelrotation=90)
ax.tick_params(axis='y', labelsize=14, labelrotation=0)
plt.show()

No features are strongly correlated with each other,

and in particular none has a significant correlation with the target.

## Treating outliers 

In [None]:
# Check for mild outliers: values more than 1.5 * IQR outside the quartiles.
Q1_train = train.quantile(0.25)
Q3_train = train.quantile(0.75)
IQR_train = Q3_train - Q1_train

is_mild_outlier = (train < Q1_train - 1.5*IQR_train) | (train > Q3_train + 1.5*IQR_train)
# Per column: 'sum' = number of flagged values, 'mean' = share flagged, 'count' = values checked.
# The aggregation is named by string because passing the builtin `sum` to agg is deprecated in pandas.
is_mild_outlier.agg(['sum', 'mean', 'count'])

In [None]:
# Check for extreme outliers: values more than 3 * IQR outside the quartiles.
Q1_train = train.quantile(0.25)
Q3_train = train.quantile(0.75)
IQR_train = Q3_train - Q1_train

is_extreme_outlier = (train < Q1_train - 3*IQR_train) | (train > Q3_train + 3*IQR_train)
# Per column: 'sum' = number of flagged values, 'mean' = share flagged, 'count' = values checked.
# The aggregation is named by string because passing the builtin `sum` to agg is deprecated in pandas.
is_extreme_outlier.agg(['sum', 'mean', 'count'])

The target has some extreme outliers, and 'cont7' and 'cont10' have some mild outliers.

Let's remove the records whose target value is an extreme outlier, and replace the mild outliers in 'cont7' and 'cont10' with values from iterative imputation.



In [None]:
# Drop the rows whose target is an extreme outlier (more than 3 * IQR outside the quartiles).
# The upper fence must be measured from the third quartile; measuring it from Q1 (as this
# cell did before) puts the fence far too low and discards valid rows.
target_lower_fence = (Q1_train - 3*IQR_train)['target']
target_upper_fence = (Q3_train + 3*IQR_train)['target']

is_extreme_target = (train['target'] < target_lower_fence) | (train['target'] > target_upper_fence)
train = train.drop(train[is_extreme_target].index)

In [None]:
# Replacing mild outliers with NaN
def replace_outliers(data, factor=1.5):
    """Return a copy of `data` in which IQR outliers are replaced by NaN.

    For every numeric column, a value counts as an outlier when it lies more than
    `factor` * IQR below the first quartile or above the third quartile of that
    column. Non-numeric columns are left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is not modified; the cleaning is done on a copy.
    factor : float, default 1.5
        Multiplier applied to the inter-quartile range to place the fences.

    Returns
    -------
    pandas.DataFrame
        Copy of `data` with the outlying values set to NaN.
    """
    cleaned = data.copy()
    for col in cleaned.select_dtypes(include="number").columns:
        q1 = cleaned[col].quantile(0.25)
        q3 = cleaned[col].quantile(0.75)
        iqr = q3 - q1

        is_outlier = (cleaned[col] < q1 - factor * iqr) | (cleaned[col] > q3 + factor * iqr)
        # mask() puts NaN where the condition holds and leaves the column as it is
        # when nothing is flagged.
        cleaned[col] = cleaned[col].mask(is_outlier)
    return cleaned

# Apply the outlier replacement to every column except the target
# (the `id` column is part of this selection as well).
non_target = train.drop(columns='target')
train[non_target.columns] = replace_outliers(non_target)

In [None]:
# Count the NaNs per column after the outlier replacement.
train.isnull().sum()

In [None]:
# Impute the NaNs by modelling each column from the other columns.
# NOTE(review): the imputer is fitted on the whole frame, so `id` and `target` are also
# used as inputs when the feature values are estimated — confirm this is intended.
impute = IterativeImputer(missing_values= np.nan,random_state=0)
impute.fit(train)
# `t` holds the imputed values; the next cell turns it back into a DataFrame.
t = impute.transform(train)

In [None]:
# Wrap the imputed values in a DataFrame again, reusing the original column labels.
train1 = pd.DataFrame(t, columns=train.columns)

train = train1

In [None]:
# Check again for mild outliers (values more than 1.5 * IQR outside the quartiles).
Q1_train = train.quantile(0.25)
Q3_train = train.quantile(0.75)
IQR_train = Q3_train - Q1_train

is_mild_outlier = (train < Q1_train - 1.5*IQR_train) | (train > Q3_train + 1.5*IQR_train)
# Per column: 'sum' = number of flagged values, 'mean' = share flagged, 'count' = values checked.
# The aggregation is named by string because passing the builtin `sum` to agg is deprecated in pandas.
is_mild_outlier.agg(['sum', 'mean', 'count'])

### Baseline Model

In [None]:
# reset_index keeps the previous index as a new `index` column; that column is
# removed again when the feature matrix is built.
train = train.reset_index(drop=False)
train.shape

In [None]:
# Features are every column except the target and the two identifier columns.
X = train.drop(columns=['target', 'id', 'index'])
y = train['target']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=40
)

In [None]:
# Baseline: always predict the median of the training targets.
dummy_m = DummyRegressor(strategy="median")
dummy_m.fit(X_train, y_train)
y_dummy = dummy_m.predict(X_test)

dummy_score = mean_squared_error(y_test,y_dummy)

print(f'MSE of dummy or baseline model is {dummy_score:0.3f}')
# The model cells that follow report RMSE, so print the baseline on that scale as well;
# an MSE alone cannot be compared directly with those numbers.
print(f'RMSE of dummy or baseline model is {np.sqrt(dummy_score):0.3f}')

### Linear Models

In [None]:
# Labels and estimators are paired by position through zip(), so both lists
# must list the models in the same order.
models_name = ["Linear", 'Lasso', 'Ridge', 'ElasticNet', 'Decision Tree']

models = [LinearRegression(),
         Lasso(),
         Ridge(),
         ElasticNet(),
         DecisionTreeRegressor(max_depth=5)
]

# Fit every model on the training split and report its RMSE on the held-out split.
for name, model in zip(models_name, models):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test,y_pred)
    print(f'{name}: RMSE: {np.sqrt(mse):0.5f}')

## Ensembling Models 

In [None]:
# Labels and estimators are paired by position through zip(), so the labels must be
# listed in the same order as the estimators below. (They were previously listed in a
# different order, which attached every score to the wrong model name.)
model_names = ["LGBM", "Random Forest", "XGBoost"]

models = [
    LGBMRegressor(),
    RandomForestRegressor(n_estimators = 10, max_depth = 10),
    XGBRegressor()]

# Fit every model on the training split and record its RMSE on the held-out split.
rmse_by_name = {}
for name, model in zip(model_names, models):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = mean_squared_error(y_test, y_pred)
    rmse_by_name[name] = np.sqrt(score)
    print(f'{name}: RMSE: {np.sqrt(score)}')

# The submission cell predicts with `model`. Bind it explicitly to the best-scoring
# estimator instead of relying on whichever one the loop happened to visit last.
best_name = min(rmse_by_name, key=rmse_by_name.get)
model = models[model_names.index(best_name)]
print(f'Best ensemble model: {best_name}')


Voilà! All the ensemble models are doing better, and LightGBM is the best model!

### Submission

In [None]:
# Load the test data and preview the first five rows.
test = pd.read_csv(filepath_or_buffer='../input/tabular-playground-series-jan-2021/test.csv')
test.head(n=5)

In [None]:
# Load the sample submission file and preview the first five rows.
sample = pd.read_csv(filepath_or_buffer="../input/tabular-playground-series-jan-2021/sample_submission.csv")
sample.head(n=5)

In [None]:
# Predict on the test features (everything except `id`, if that column is present).
# `model` is the estimator left bound by the ensemble-comparison cell.
test_features = test.drop(columns='id', errors='ignore')
sample['target'] = model.predict(test_features)

# Write the submission without the DataFrame index, then preview it.
sample.to_csv('submission.csv', index=False)
sample.head()

**The End!**


***Thank you for reading this notebook.***


