In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from statsmodels.api import OLS
from sklearn.tree import plot_tree

import seaborn as sns

from statsmodels.stats.outliers_influence import variance_inflation_factor


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the full path of every file below the Kaggle input mount, then print
# them one per line in the order os.walk yields them.
input_file_paths = [
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
]
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's training split from the Kaggle input mount and render it.
train_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jan-2021/train.csv'
)
display(train_df)

In [None]:
# Read the competition's test split from the Kaggle input mount and render it.
test_df = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jan-2021/test.csv'
)
display(test_df)

In [None]:
# Summary statistics for the numeric columns of both splits.
display(train_df.describe())
display(test_df.describe())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appended a stray "None" to the output.
train_df.info()

In [None]:
# Pairwise correlation between all columns of the training frame (target
# included), rendered as an annotated heatmap on a large canvas.
train_df_corr = train_df.corr()

heatmap_figsize = (30, 20)
sns.set(rc={'figure.figsize': heatmap_figsize})
sns.heatmap(
    data=train_df_corr,
    annot=True,
    annot_kws={'size': 12},
)

In [None]:
# Correlation of every column with the target, strongest positive first.
train_df_corr['target'].sort_values(ascending=False)

## Distribution Plot

In [None]:
def plot_distribution(y_size=10, x_size=2):
    """Draw one distribution plot per column of the global ``train_df``.

    Panels are filled row by row on a ``y_size`` x ``x_size`` grid of axes.
    Panels left over after the last column are hidden.

    Parameters
    ----------
    y_size : int
        Number of rows in the subplot grid.
    x_size : int
        Number of columns in the subplot grid.

    Raises
    ------
    ValueError
        If the grid has fewer panels than ``train_df`` has columns.
    """
    columns = list(train_df.columns)
    if len(columns) > y_size * x_size:
        raise ValueError(
            f'{len(columns)} columns do not fit on a {y_size}x{x_size} grid'
        )

    sns.set(rc={'figure.figsize':(10,40)})
    # squeeze=False keeps `axs` two-dimensional even for a single row/column,
    # so the [row, col] indexing below always works.
    fig, axs = plt.subplots(y_size, x_size, squeeze=False)

    for i, c in enumerate(columns):
        # Derive the panel position from x_size instead of a hard-coded 2.
        row, col = divmod(i, x_size)
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases
        # (histplot/displot replace it); kept so the rendered plots are unchanged.
        sns.distplot(train_df[c], ax=axs[row, col])

    # axs.flat iterates row-major, matching the fill order above.
    for unused_ax in axs.flat[len(columns):]:
        unused_ax.set_visible(False)

In [None]:
def plot_scatter(target, y_size=10, x_size=2):
    """Scatter every column of the global ``train_df`` against ``target``.

    Panels are filled row by row on a ``y_size`` x ``x_size`` grid of axes.
    Panels left over after the last column are hidden.

    Parameters
    ----------
    target
        Values plotted on the y axis of every panel; passed straight to
        ``sns.scatterplot`` as ``y``.
    y_size : int
        Number of rows in the subplot grid.
    x_size : int
        Number of columns in the subplot grid.

    Raises
    ------
    ValueError
        If the grid has fewer panels than ``train_df`` has columns.
    """
    columns = list(train_df.columns)
    if len(columns) > y_size * x_size:
        raise ValueError(
            f'{len(columns)} columns do not fit on a {y_size}x{x_size} grid'
        )

    sns.set(rc={'figure.figsize':(10,40)})
    # squeeze=False keeps `axs` two-dimensional even for a single row/column,
    # so the [row, col] indexing below always works.
    fig, axs = plt.subplots(y_size, x_size, squeeze=False)

    for i, c in enumerate(columns):
        # Derive the panel position from x_size instead of a hard-coded 2.
        row, col = divmod(i, x_size)
        sns.scatterplot(data=train_df, x=c, y=target, ax=axs[row, col])

    # axs.flat iterates row-major, matching the fill order above.
    for unused_ax in axs.flat[len(columns):]:
        unused_ax.set_visible(False)

In [None]:
# Split the label column off the feature frame. pop() removes 'target' from
# train_df in place, so the guard keeps this cell re-runnable: a second run
# leaves the previously extracted `y` untouched instead of raising KeyError.
if 'target' in train_df.columns:
    y = train_df.pop('target')
y

In [None]:
# Draw the distribution of every column still in train_df (the target was popped above).
plot_distribution()

In [None]:
# Scatter every column still in train_df against the popped target series.
plot_scatter(target=y)

In [None]:
##train_df['cont_6_12_mean'] = train_df.loc[:, ['cont6','cont12']].mean(axis=1)
#train_df['cont_5_7_13_mean'] = train_df.loc[:, ['cont5','cont7','cont13']].mean(axis=1)
#train_df['cont_3_4_14_mean'] = train_df.loc[:, ['cont3','cont4','cont14']].mean(axis=1)
#train_df['cont_mean'] = train_df.loc[:, ['cont10','cont1','cont11','cont9','cont2','cont8','cont6','cont12']].mean(axis=1)
                                          
##test_df['cont_6_12_mean'] = test_df.loc[:, ['cont6','cont12']].mean(axis=1)
#test_df['cont_5_7_13_mean'] = test_df.loc[:, ['cont5','cont7','cont13']].mean(axis=1)
#test_df['cont_3_4_14_mean'] = test_df.loc[:, ['cont3','cont4','cont14']].mean(axis=1)
#test_df['cont_mean'] = test_df.loc[:, ['cont10','cont1','cont11','cont9','cont2','cont8','cont6','cont12']].mean(axis=1)

In [None]:
# Shift every value by 0.01 and square it, applied identically to both splits.
train_df = (train_df + .01) ** 2
test_df = (test_df + .01) ** 2

In [None]:
from scipy.stats import boxcox

# boxcox() returns the transformed values (and the fitted lambda) rather than
# modifying its input, so the result has to be written back to the frame;
# calling it and dropping the return value leaves the data untouched.
for c in train_df.columns:
    transformed, fitted_lambda = boxcox(train_df[c].to_numpy())
    train_df[c] = transformed
    # Reuse the lambda estimated on the training column so that both splits
    # go through exactly the same mapping.
    if c in test_df.columns:
        test_df[c] = boxcox(test_df[c].to_numpy(), lmbda=fitted_lambda)

# Columns that exist only in the test split have no training lambda to reuse;
# they are fitted on their own.
for c in test_df.columns.difference(train_df.columns):
    test_df[c] = boxcox(test_df[c].to_numpy())[0]

# NOTE: boxcox() requires strictly positive input, and its output can be
# negative, so this cell is not safe to run twice on the same frames.

In [None]:
#drop_features = ['id','cont1','cont3','cont6','cont7','cont9','cont10','cont11','cont12','cont13','cont14']
#drop_features = ['id','cont6','cont12','cont10','cont1','cont11','cont13','cont9','cont7','cont2','cont8','cont5','cont3','cont4','cont14']
# Columns removed from both splits before modelling (the row id plus a
# hand-picked subset of the continuous features).
drop_features = [
    'id',
    'cont6', 'cont1', 'cont7', 'cont9',
    'cont10', 'cont11', 'cont12', 'cont13',
]
train_df = train_df.drop(columns=drop_features)
test_df = test_df.drop(columns=drop_features)


In [None]:
# Re-draw the distributions for the columns that remain after the feature drop.
plot_distribution()

In [None]:
# Re-draw the scatter plots against the target for the columns that remain after the feature drop.
plot_scatter(target=y)

## Check the correlation between variables

## Multicollinearity

In [None]:
# Variance inflation factor of every column left in train_df, collected into
# a two-column table (feature name, VIF).
feature_matrix = train_df.values
vif_data = pd.DataFrame({
    "feature": train_df.columns,
    "VIF": [
        variance_inflation_factor(feature_matrix, col_idx)
        for col_idx in range(feature_matrix.shape[1])
    ],
})

display(vif_data)

In [None]:
#train_id = train_df.pop('id')
#test_id = test_df.pop('id')

#train_df['id'] = train_id
#train_df['means'] = train_df.mean(axis=1) 

#test_df['id'] = test_id
#test_df['means'] = test_df.mean(axis=1) 

In [None]:
# Load the sample submission, indexed by row id, and preview its first 100 rows.
submission = pd.read_csv(
    '/kaggle/input/tabular-playground-series-jan-2021/sample_submission.csv',
    index_col='id',
)
display(submission.head(100))

In [None]:
# Hold out part of the labelled data for local evaluation. The fixed
# random_state makes the split (and every score derived from it) reproducible
# across kernel restarts.
# NOTE(review): train_size=0.3 fits the models on 30% of the rows and
# evaluates on the remaining 70%; confirm this is intended rather than
# test_size=0.3.
X_train, X_test, y_train, y_test = train_test_split(
    train_df, y, train_size=0.3, random_state=42
)

In [None]:
def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
    """Scatter predictions against true values and show the RMSE in the title.

    Parameters
    ----------
    name : str
        Label placed in front of the score in the plot title.
    y : array-like
        True target values (one-dimensional).
    yhat : array-like
        Predicted target values, aligned position-by-position with ``y``.
    num_to_plot : int
        Only the first ``num_to_plot`` points are drawn; the score is still
        computed on all points.
    lims : tuple
        Shared (min, max) limits for both axes; also the end points of the
        diagonal reference line.
    figsize : tuple
        Size of the matplotlib figure.
    """
    # Work on plain arrays so the slices below are positional regardless of
    # whatever index a pandas input carries.
    y_true = np.asarray(y)
    y_pred = np.asarray(yhat)

    # RMSE computed as sqrt(MSE): newer scikit-learn releases deprecate and
    # then drop the `squared=False` keyword, while this form works everywhere.
    score = np.sqrt(mean_squared_error(y_true, y_pred))

    plt.figure(figsize=figsize)
    plt.scatter(y_true[:num_to_plot], y_pred[:num_to_plot])
    # Diagonal = perfect prediction.
    plt.plot(lims, lims)
    plt.ylim(lims)
    plt.xlim(lims)
    plt.title(f'{name}: {score:0.5f}', fontsize=18)
    plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# fit_transform()/transform() return new arrays and leave their inputs
# untouched, so the results are kept here under their own names (re-wrapped as
# DataFrames to preserve column labels and row index) instead of being thrown
# away. The scaler is fitted on the training part only.
ss = StandardScaler()
X_train_scaled = pd.DataFrame(
    ss.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    ss.transform(X_test), columns=X_test.columns, index=X_test.index
)

# NOTE(review): the modelling cells still read the unscaled X_train / X_test /
# test_df. Switching them to the scaled frames also requires transforming
# test_df with `ss`, and the intercept-free models (Lasso(fit_intercept=False),
# OLS without a constant) would need an intercept once features are centred.
X_test_scaled

In [None]:
import re
import sklearn

# scikit-learn 1.0 renamed the tree criterion 'mae' to 'absolute_error' and
# later removed the old spelling; pick whichever the installed version accepts.
sklearn_major = int(re.match(r"\d+", sklearn.__version__).group())
mae_criterion = 'absolute_error' if sklearn_major >= 1 else 'mae'

# Each display name is paired with its estimator in one place so the two can
# never drift out of alignment. Stochastic estimators are seeded so the
# comparison is reproducible.
candidates = [
    ("Decision Tree Regressor",
     DecisionTreeRegressor(max_depth=8, criterion=mae_criterion, random_state=42)),
    ("Dummy Median", DummyRegressor(strategy='median')),
    ("Linear", LinearRegression()),
    ("Lasso", Lasso(fit_intercept=False)),
    ("Random Forest",
     RandomForestRegressor(n_estimators=128, n_jobs=-1, random_state=42)),
]
# Kept as separate lists for any later cell that still refers to them.
model_names = [name for name, _ in candidates]
models = [model for _, model in candidates]

# Fit every candidate on the training part and plot its hold-out predictions.
for name, model in candidates:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    plot_results(name, y_test, y_pred)

In [None]:
# Fit a random forest on the training part and write its test-set predictions
# as a submission file. The fixed random_state makes the written file
# reproducible across runs.
model = RandomForestRegressor(n_estimators=128, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)
submission['target'] = model.predict(test_df)
submission.to_csv('random_forest.csv')
display(submission.tail())

In [None]:
# Ordinary linear regression fitted on the training part; its predictions for
# the test split overwrite the 'target' column and are saved as a submission.
model = LinearRegression().fit(X_train, y_train)
linear_predictions = model.predict(test_df)
submission['target'] = linear_predictions
submission.to_csv('linear.csv')
display(submission.tail())

In [None]:
# Cross-validated search over the tree depth. 126 depths x 10 folds is 1,260
# fits, so the folds are spread over all cores (n_jobs=-1); the seeded
# estimator keeps the selected depth and the predictions reproducible.
dtr = DecisionTreeRegressor(random_state=42)
params = {'max_depth': list(range(2, 128))}
grid = GridSearchCV(dtr, param_grid=params, cv=10, n_jobs=-1)
grid.fit(X_train, y_train)

# GridSearchCV refits the best configuration on all of X_train, so predict()
# uses that refitted tree.
submission['target'] = grid.predict(test_df)
submission.to_csv('decision_tree_regressor.csv')
display(submission.tail())

#display(grid.score(X_test, y_test))
#predictions = grid.predict(X_test)
#mean_squared_error(y_test, predictions)

In [None]:
#dtr.fit(X_train, y_train)
#plt.figure(figsize=(5,5), dpi=1000)
#plot_tree(dtr, feature_names=X_train.columns)

In [None]:
from statsmodels.api import add_constant

# statsmodels' OLS fits exactly the columns it is given and does not add an
# intercept on its own; without the constant column the regression is forced
# through the origin. has_constant='add' guarantees the same column layout for
# both frames.
X_train_const = add_constant(X_train, has_constant='add')
X_test_const = add_constant(X_test, has_constant='add')

model = OLS(y_train, X_train_const).fit()
predictions = model.predict(X_test_const)

#submission['target'] = np.round(model.predict(add_constant(test_df, has_constant='add')), 2)

display(model.summary())
# `submission` is not modified in this cell; this shows the most recently written predictions.
display(submission.tail())
#submission.to_csv('ols.csv')
