# Tabular Playground Series - Feb 2021

This notebook presents a starter EDA and trains a RandomForestRegressor.

In [None]:
import numpy as np
import pandas as pd
import os
import plotly.graph_objects as go
import plotly.express as px
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.figure_factory as ff
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)


# Load Data

In [None]:
# Read the three competition files in one pass; each frame is indexed by `id`.
train, test, sample = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '/kaggle/input/tabular-playground-series-feb-2021/train.csv',
        '/kaggle/input/tabular-playground-series-feb-2021/test.csv',
        '/kaggle/input/tabular-playground-series-feb-2021/sample_submission.csv',
    )
)

In [None]:
# Label/shape pairs, printed space-separated; the embedded "\n" start new lines.
shape_report = (
    "Train shape :", train.shape,
    "\n  Test shape:", test.shape,
    "\nSample shape:", sample.shape,
)
print(*shape_report)

In [None]:
# Remove the cap on displayed columns so no column is elided from the preview.
pd.options.display.max_columns = None
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

# Any missing values? - NO

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train.info()

In [None]:
import missingno as msno 
# missingno bar chart for the training frame; the object it returns is titled below.
train_missing_ax = msno.bar(train)
train_missing_ax.set_title(
    'Training data missing values chart',
    fontdict={'fontsize': 25},
)

In [None]:
# Same missingno bar chart, this time for the test frame.
test_missing_ax = msno.bar(test)
test_missing_ax.set_title(
    'Test data missing values chart',
    fontdict={'fontsize': 25},
)

# Explore the target variable
> We are dealing with a bimodal distribution with outliers!

In [None]:
# Histogram of the target with a marginal box plot drawn above it.
fig = px.histogram(
    train,
    x="target",
    marginal="box",
    color_discrete_sequence=['forestgreen'],
)
fig.update_layout(
    showlegend=True,
    # The bold tag in the title must be closed with </b>, not opened a second time.
    title={
        'text': '<b>Distribution of target variable</b>',
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    title_font_color='black',
    yaxis={
        "title": "Count",
        "zeroline": False,
        "showgrid": False,
        "fixedrange": False,
    },
    plot_bgcolor="#ffffff",
    margin={"r": 20, "l": 30},
)
iplot(fig)


# Explore Categorical Attributes
> There are 10 categorical attributes cat0 - cat9

## Value Counts

In [None]:
# Names of the ten categorical columns: cat0 ... cat9.
cat_cols = ["cat{}".format(i) for i in range(10)]

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One bar chart of value counts per categorical column, laid out on a 2-wide grid.
N_GRID_COLS = 2
n_grid_rows = -(-len(cat_cols) // N_GRID_COLS)  # ceiling division
fig = make_subplots(
    rows=n_grid_rows,
    cols=N_GRID_COLS,
    subplot_titles=cat_cols,
    shared_yaxes=True,
)

for position, cat_name in enumerate(cat_cols):
    # Fill the grid row by row; plotly rows and columns are 1-based.
    grid_row, grid_col = divmod(position, N_GRID_COLS)
    # Take categories and counts straight from the Series that value_counts()
    # returns: the column names produced by reset_index() differ between pandas
    # versions, so looking them up by name ('index', <column name>) is fragile.
    counts = train[cat_name].value_counts()
    fig.add_trace(
        go.Bar(
            x=counts.index,
            y=counts.values,
            text=counts.values,
            textposition='auto',
            name=cat_name,
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(
    showlegend=True,
    # The bold tag in the title must be closed with </b>, not opened a second time.
    title={
        'text': '<b>Categorical column value counts</b>',
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    title_font_color='black',
    yaxis={
        "title": "Count",
        "zeroline": False,
        "showgrid": False,
        "fixedrange": False,
    },
    plot_bgcolor="#ffffff",
    margin={"r": 20, "l": 30},
)
iplot(fig)

* cat0 - cat5 have up to 4 categories
* cat6 - cat9 have up to 8 categories

### Let's zoom in and look at each category

In [None]:
# 5x2 grid of box plots: target distribution within each level of cat0..cat9.
fig, axes = plt.subplots(5, 2, sharey=True, figsize=(8, 15))
sns.set_theme(palette="spring_r", style="ticks")
for idx, panel in enumerate(axes.flat):
    column = "cat{}".format(idx)
    sns.boxplot(x=column, y="target", data=train, ax=panel)
    # Odd positions are the right-hand panels; the y axis is shared, so hide theirs.
    if idx % 2:
        panel.yaxis.set_visible(False)
    panel.set_xlabel(column, fontsize=18)
    panel.set_ylabel("target", fontsize=18)

# Figure-level styling; the order of these calls is kept as-is.
sns.despine(offset=5, trim=True)
sns.despine(left=True)
fig.tight_layout(pad=3.0)
fig.suptitle('Distribution of target per category across categorical attributes', fontsize=16)
fig.subplots_adjust(top=0.95)
plt.show()

# Explore Numerical Attributes

There are 14 numerical attributes cont0 - cont13

In [None]:
# Names of the fourteen continuous columns: cont0 ... cont13.
numeric_cols = ["cont{}".format(i) for i in range(14)]

In [None]:
# One box plot per continuous feature, laid out on a 7-wide grid.
N_BOX_COLS = 7
n_box_rows = -(-len(numeric_cols) // N_BOX_COLS)  # ceiling division
fig = make_subplots(
    rows=n_box_rows,
    cols=N_BOX_COLS,
    subplot_titles=numeric_cols,
    shared_yaxes=True,
)

for position, feature in enumerate(numeric_cols):
    # Fill the grid row by row; plotly rows and columns are 1-based.
    grid_row, grid_col = divmod(position, N_BOX_COLS)
    fig.add_trace(
        go.Box(y=train[feature], name=feature, marker_color='indianred'),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(
    showlegend=True,
    # The bold tag in the title must be closed with </b>, not opened a second time.
    title={
        'text': '<b>Numerical column distribution</b>',
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    title_font_color='black',
    # A box plot's y axis shows the feature values themselves, not a count.
    yaxis={
        "title": "Value",
        "zeroline": False,
        "showgrid": False,
        "fixedrange": False,
    },
    plot_bgcolor="#ffffff",
    margin={"r": 20, "l": 30},
)
iplot(fig)

Columns with Outliers are 
* cont0
* cont2
* cont6
* cont8


In [None]:
# Pairwise correlations between the continuous features and the target.
# The column list is built inline so `numeric_cols` keeps meaning "features only";
# rebinding it to include 'target' would break the feature plots on a re-run.
corr = train[numeric_cols + ['target']].corr()

In [None]:
# 7x2 grid of scatter plots: each continuous feature against the target.
sns.set_theme(palette="mako", style="ticks")
fig, axes = plt.subplots(7, 2, sharey=True, figsize=(8, 15))
for idx, panel in enumerate(axes.flat):
    feature = "cont{}".format(idx)
    sns.scatterplot(data=train, x=feature, y="target", ax=panel)
    # Odd positions are the right-hand panels; the y axis is shared, so hide theirs.
    if idx % 2:
        panel.yaxis.set_visible(False)
    panel.set_xlabel(feature, fontsize=18)
    panel.set_ylabel("target", fontsize=18)

# Figure-level styling; the order of these calls is kept as-is.
sns.despine(offset=5, trim=True)
sns.despine(left=True)
fig.tight_layout(pad=3.0)
fig.suptitle('Scatter plot of numerical attributes vs target', fontsize=16)
fig.subplots_adjust(top=0.95)
plt.show()

In [None]:
# Heatmap of the correlation matrix held in `corr`.
fig = px.imshow(corr)
fig.update_layout(
    showlegend=True,
    # The bold tag in the title must be closed with </b>, not opened a second time.
    title={
        'text': '<b>Correlation Matrix of numerical attributes</b>',
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    title_font_color='black',
)
iplot(fig)

>  No significant linear correlation 

# Pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn import set_config

# Continuous feature names (cont0 ... cont13) used by the preprocessing pipeline.
numeric_cols = ["cont{}".format(i) for i in range(14)]
# Split the training frame into the label to predict and the remaining predictors.
y = train['target']
X = train.drop(columns="target")

def build_model(model):
    """Wrap ``model`` in a pipeline that preprocesses the raw columns first.

    Parameters
    ----------
    model : estimator
        Unfitted regressor placed as the final ``'regression_model'`` step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two-step pipeline: ``'preprocessor'`` (the ColumnTransformer built by
        ``get_pipeline``) followed by ``'regression_model'``.

    Notes
    -----
    Also switches scikit-learn's estimator display to ``'diagram'`` as a
    global side effect.
    """
    # Reuse the shared preprocessor so column handling is defined in one place.
    regr = Pipeline(steps=[('preprocessor', get_pipeline()),
                           ('regression_model', model)])
    set_config(display='diagram')
    return regr

def get_pipeline():
    """Build the preprocessing ColumnTransformer for the raw feature frame.

    Returns
    -------
    sklearn.compose.ColumnTransformer
        Unfitted transformer that applies ``StandardScaler`` to the columns in
        ``numeric_cols`` and ``OneHotEncoder(handle_unknown='ignore')`` to the
        columns in ``cat_cols``.
    """
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    # handle_unknown='ignore' is set so categories absent at fit time do not raise
    # when the fitted encoder is later applied to other data.
    categorical_transformer = Pipeline(
        steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, cat_cols)])
    return preprocessor

def calculate_train_rmse(name, model):
    """Print the RMSE of ``model`` on the training data.

    Predicts on the module-level frame ``X`` and scores the predictions against
    the module-level labels ``y``; ``name`` only labels the printed line.
    Nothing is returned.
    """
    train_predictions = model.predict(X)
    rmse = np.sqrt(mean_squared_error(y, train_predictions))
    print("Training RMSE of {} : {}".format(name, rmse))

def sample_prediction(name, model, num_records):
    """Print floored predictions next to the true labels for a few training rows.

    Parameters
    ----------
    name : str
        Label for the model, used only in the printed header.
    model : fitted estimator
        Must expose ``predict``; called on the first ``num_records`` rows of the
        module-level frame ``X``.
    num_records : int
        Number of leading training rows to show.
    """
    # Imported here because the notebook never imports `math` at the top level;
    # without it the math.floor call below raises NameError.
    import math

    some_data = X.iloc[:num_records]
    some_labels = y.iloc[:num_records]
    # Floor each prediction so the printed values are whole numbers.
    preds = [math.floor(value) for value in model.predict(some_data)]

    print("Predictions on training data using :", name)
    print("Predictions    :", preds)
    print("Actual labels  :", list(some_labels))


# RandomForestRegressor
Let's train RandomForestRegressor base version with standard scaler and one-hot encoding. Nothing fancy here. Just an initial version

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Baseline random forest behind the shared preprocessing pipeline.
# n_jobs=-1 lets scikit-learn build the trees on all available cores instead of
# one; random_state stays fixed at 42 as before.
forest_reg = build_model(RandomForestRegressor(random_state=42, n_jobs=-1))
forest_reg.fit(X, y)

In [None]:
# Note: this is the error on the same rows the model was fitted on, not a held-out score.
calculate_train_rmse("RandomForestRegressor",forest_reg)

## Prediction

In [None]:
# Predict the target for every test row and store it in the submission frame.
# NOTE(review): assumes `sample` rows are in the same order as `test` — confirm.
sample['target'] = forest_reg.predict(test)


In [None]:
# Display the submission frame as the cell output.
sample

In [None]:
# Write the submission frame to a CSV file in the working directory.
sample.to_csv('random_forest_v1.csv')

In [None]:
import joblib as jbl
# Persist the fitted pipeline to disk with joblib.
jbl.dump(forest_reg, "forest_reg.pkl")

### To be continued. Thanks for going through. Long way to go. Please upvote if you find it useful!