In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)
%matplotlib inline
import os
# Print the full path of every file found under the Kaggle input mount.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))


In [None]:
# Competition data directory; each CSV is read with its `id` column as the index.
path = Path('/kaggle/input/tabular-playground-series-jan-2021/')
train, test, sample = (
    pd.read_csv(path / csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Report (rows, columns) of the train and test frames.
print(f"Train Shape {train.shape} \n Test Shape {test.shape}")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summary of the training frame's columns (dtypes and non-null counts).
train.info()

In [None]:
#p = sns.pairplot(train)

In [None]:
# Pairwise correlation between all columns of `train`.
corr_matrix = train.corr()
# Cell output: correlation of every column with the target.
corr_matrix['target']

In [None]:
# Heatmap of the correlation matrix.
# Correlations are signed values in [-1, 1], so use a diverging colormap and
# pin the colour scale symmetrically around zero. A qualitative palette such
# as 'Set2' has no ordering and hides both sign and magnitude.
plt.figure(figsize=(7,7))
sns.heatmap(corr_matrix,
            vmin=-1,
            vmax=1,
            center=0,
            cmap='coolwarm');

In [None]:
# Container for the scatter-matrix dimensions. The plotly Splom pair plot
# below is currently disabled (commented out), so `dims` stays empty.
dims = []

#for col in list(train.columns):
#    d = dict(label = col, values=train[col])
#    dims.append(d)

#fig = go.Figure(data=go.Splom(dimensions=dims,
#                             showupperhalf=False,
#                             diagonal_visible=False))

#fig.update_layout(
#    title='Pairplot',
#    xaxis=dict(tickangle = 45),
#    yaxis=dict(tickangle = 45),    
#    width=800,
#    height=600,
#)

#iplot(fig)

In [None]:
import plotly.express as px
# Interactive histogram of the raw target column.
fig = px.histogram(train, x="target")
iplot(fig)

In [None]:
# Add a natural-log copy of the target to `train` and plot its distribution.
# The extra column is dropped again when the feature matrix is built.
# NOTE(review): np.log yields -inf for a target of exactly 0 — confirm the
# target is strictly positive, otherwise consider np.log1p.
train['log_target'] = np.log(train['target'])
fig = px.histogram(train, x="log_target")
iplot(fig)

In [None]:
# Interactive histogram of the `cont1` feature.
fig = px.histogram(train, x="cont1")
iplot(fig)

In [None]:
# Largest target value; the double brackets select a one-column frame, so
# the result is a one-element Series.
train[['target']].max()

## Model Pipeline

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn import set_config
import math
# Feature matrix: every column except the raw target and its log copy.
X = train.drop(columns=["target", "log_target"])
# Label to predict.
y = train['target']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Every feature column is passed to the numeric preprocessing branch.
numeric_features = X_train.columns.tolist()

In [None]:
def build_model(model):
    """Wrap ``model`` in a pipeline that standard-scales the numeric features.

    Parameters
    ----------
    model : estimator
        Regressor used as the final ``'regression_model'`` step.

    Returns
    -------
    Pipeline
        Unfitted pipeline with the steps ``'preprocessor'`` and
        ``'regression_model'``.
    """
    # Reuse the shared preprocessing definition rather than rebuilding an
    # identical ColumnTransformer (and an unused scaler pipeline) here.
    regr = Pipeline(steps=[('preprocessor', get_pipeline()),
                           ('regression_model', model)])
    # Global scikit-learn display option: show estimators as diagrams.
    set_config(display='diagram')
    return regr

def get_pipeline():
    """Build the preprocessing step applied ahead of every regressor.

    Returns
    -------
    ColumnTransformer
        Unfitted transformer that applies a ``StandardScaler`` (branch
        ``'num'``) to the columns listed in the global ``numeric_features``.
    """
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features)])
    return preprocessor

def calculate_train_rmse(name, model):
    """Print the RMSE of ``model`` on the training split.

    ``name`` is only used as the label in the printed line; the data comes
    from the global ``X_train`` / ``y_train``.
    """
    train_mse = mean_squared_error(y_train, model.predict(X_train))
    print(f"Training RMSE of {name} : {np.sqrt(train_mse)}")

def sample_prediction(name, model, num_records):
    """Print predictions next to the true labels for the first rows of ``X``.

    Parameters
    ----------
    name : str
        Label for the model, used only in the printed header.
    model : fitted estimator
        Object exposing ``predict``.
    num_records : int
        Number of leading rows of the global ``X`` / ``y`` to show.
    """
    some_data = X.iloc[:num_records]
    some_labels = y.iloc[:num_records]
    # The target is continuous, so round for display instead of flooring:
    # truncating to integers discards the fractional part and makes the
    # side-by-side comparison with the float labels misleading.
    preds = [round(float(pred), 2) for pred in model.predict(some_data)]

    print("Predictions on training data using :", name)
    print("Predictions    :", preds)
    print("Actual labels  :", list(some_labels))

In [None]:
# Baseline: scaled features followed by ordinary least-squares regression.
linear_reg = build_model(LinearRegression())
linear_reg.fit(X_train,y_train)

In [None]:
# Training-split RMSE of the linear baseline.
calculate_train_rmse("LinearRegression",linear_reg)

In [None]:
# Compare the first 10 predictions with their true labels.
sample_prediction("LinearRegression",linear_reg, 10)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Decision tree on the scaled features. The seed is fixed (same value as
# the random forest below) so the fitted tree, and every metric derived
# from it, is reproducible across runs.
tree_reg = build_model(DecisionTreeRegressor(random_state=42))
tree_reg.fit(X_train,y_train)

In [None]:
# Training-split RMSE and a 10-row prediction sample for the decision tree.
calculate_train_rmse("DecisionTreeRegressor",tree_reg)
sample_prediction("DecisionTreeRegressor", tree_reg, 10)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with default hyperparameters and a fixed seed.
forest_reg = build_model(RandomForestRegressor(random_state = 42))
forest_reg.fit(X_train,y_train)


In [None]:
# Training-split RMSE and a 10-row prediction sample for the random forest.
calculate_train_rmse("RandomForestRegressor",forest_reg)
sample_prediction("RandomForestRegressor", forest_reg, 10)

In [None]:
# Evaluate the default random forest on the held-out 20% split.
final_predictions = forest_reg.predict(X_test)
final_mse = mean_squared_error(y_test,final_predictions)
# RMSE is the square root of the mean squared error.
final_rmse = np.sqrt(final_mse)
print("Test RMSE for RandomForestRegressor : ", final_rmse)

In [None]:
from sklearn.model_selection import GridSearchCV

# Two search spaces over the forest inside the pipeline (the
# `regression_model__` prefix routes each parameter to that step):
# one with default bootstrapping, one with bootstrapping disabled.
param_grid = [
    {'regression_model__n_estimators': [3, 10, 30, 50], 'regression_model__max_features' : [2, 4, 6, 8, 10, 12]},
    {'regression_model__bootstrap': [False], 'regression_model__n_estimators' : [3, 10], 'regression_model__max_features' : [2, 3, 4] }
]

# 5-fold CV scored by negative MSE (higher is better).
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, 
                          scoring = 'neg_mean_squared_error',
                          return_train_score=True)
# Tune on the training split only. Fitting on the full X / y would put the
# held-out rows into model selection and the final refit, making any later
# evaluation on X_test optimistic.
grid_search.fit(X_train, y_train)


In [None]:
# Hyperparameter combination with the best mean cross-validation score.
grid_search.best_params_

In [None]:
# Show the winning pipeline as a diagram.
set_config(display='diagram')
grid_search.best_estimator_

In [None]:


cvres = grid_search.cv_results_

# Scores are negative MSE, so flip the sign and take the root to get one
# RMSE per candidate, then print it beside the candidate's parameters.
candidate_rmse = np.sqrt(-cvres["mean_test_score"])
for rmse, candidate in zip(candidate_rmse, cvres["params"]):
    print(rmse, candidate)



In [None]:
# Best pipeline found by the grid search.
final_model = grid_search.best_estimator_
# NOTE(review): `preprocessor` is not used by any cell visible here —
# confirm whether it is still needed.
preprocessor = get_pipeline()

# Score the tuned pipeline on the held-out split.
final_predictions = final_model.predict(X_test)
final_mse = mean_squared_error(y_test,final_predictions)
final_rmse = np.sqrt(final_mse)
print("Test RMSE for RandomForestRegressor : ", final_rmse)

In [None]:
# Fill the submission template with predictions for the competition test
# set and write it to disk.
# NOTE(review): this uses `forest_reg` (the forest with default
# hyperparameters), not the grid-search winner `final_model` — confirm that
# is intended.
sample['target'] = forest_reg.predict(test)
sample.to_csv('random_forest.csv')

In [None]:
#np.isfinite(X).all()

In [None]:
import joblib as jbl
# Persist the fitted `forest_reg` pipeline to disk.
jbl.dump(forest_reg, "forest_reg.pkl")