Starting this notebook to learn a lot and try new things....

Import libraries

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))
        
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
        
# Directory holding the competition CSVs; the read_csv cells below build their
# file paths from it.
input_path = Path('/kaggle/input/tabular-playground-series-jan-2021/')

# Read in the data files

In [None]:
# Load the training table (it carries the `target` column), indexed by `id`,
# and preview the first rows.
train = pd.read_csv(input_path.joinpath('train.csv'), index_col='id')
display(train.head())

In [None]:
# Load the rows we will predict on for the submission, indexed by `id`,
# and preview the first rows.
test = pd.read_csv(input_path.joinpath('test.csv'), index_col='id')
display(test.head())

In [None]:
# Load the sample submission, indexed by `id`; its `target` column is
# overwritten with model predictions at the end of the notebook.
submission = pd.read_csv(input_path.joinpath('sample_submission.csv'), index_col='id')
display(submission.head())

Shape of data

In [None]:
# Report (rows, columns) of the training table.
print(f'Shape of train data is {train.shape}')

# Exploratory Analysis

Finding correlation between the variables

In [None]:
sns.set(style='whitegrid')

# Correlate the real training columns. The previous version built the matrix
# from np.random.normal noise, so the plot said nothing about the data.
# NOTE(review): re-check the written conclusion under this plot against the
# new figure.
corr = train.select_dtypes(include='number').corr()

# The matrix is symmetric, so hide the upper triangle (and the diagonal)
# to avoid showing every pair twice.
mask = np.triu(np.ones_like(corr, dtype=bool))

# One square-ish figure for the heatmap
f, ax = plt.subplots(figsize=(11, 9))

# Custom diverging colormap: opposite hues for negative / positive correlation
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Pin the colour scale to the full [-1, 1] range of a correlation coefficient
# so strong correlations are not clipped to the same colour as moderate ones.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=1, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title('Correlation between training columns')
plt.show()

The correlation plot above shows little correlation between the variables. Weakly correlated features mean low multicollinearity, which keeps the coefficient estimates of linear models stable.

## Pull out the target, and make a validation split

Split the training data into training and validation sets in a 60:40 ratio

In [None]:
# Separate the label from the features. `pop` removes the column from `train`
# in place (later cells rely on `train` holding features only), so guard it:
# re-running this cell must not raise KeyError once the column is gone.
if 'target' in train.columns:
    target = train.pop('target')

# Keep 60% of the rows for fitting and hold out 40% for validation. A fixed
# seed makes the split, and every score computed from it, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, train_size=0.60, random_state=42)

# How well can we do with a completely naive model?

We'll want any of our models to do (hopefully much!) better than this.

In [None]:
# Let's get a benchmark score
model_dummy = DummyRegressor(strategy='median')
model_dummy.fit(X_train, y_train)
y_dummy = model_dummy.predict(X_test)
score_dummy = mean_squared_error(y_test, y_dummy)
print(f'{score_dummy:0.5f}') # 0.54118

# Simple Linear Regression

A simple linear regression doesn't do better than our dummy regressor!

In [None]:
# Simple Linear Regression
model_simple_linear = LinearRegression(fit_intercept=False) # data is not centered, don't fit intercept
model_simple_linear.fit(X_train, y_train)
y_simple_linear = model_simple_linear.predict(X_test)
score_simple_linear = mean_squared_error(y_test, y_simple_linear)
print(f'{score_simple_linear:0.5f}')

# This seems slow and repetitive. Can we automate it a bit?

In [None]:
def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
    """Scatter predictions against true values, titled with the model's MSE.

    Parameters
    ----------
    name : str
        Label shown in the plot title.
    y : array-like
        True target values.
    yhat : array-like
        Predicted values, in the same order as ``y``.
    num_to_plot : int, default 10000
        Only the first ``num_to_plot`` points are drawn; the score in the
        title is still computed on all of ``y`` / ``yhat``.
    lims : tuple, default (0, 12)
        Shared limits for both axes and end points of the reference line.
    figsize : tuple, default (6, 6)
        Figure size passed to matplotlib.
    """
    score = mean_squared_error(y, yhat)

    # Convert to arrays so the slice is positional whatever container
    # (Series, list, ndarray) and index the caller passes in.
    y_shown = np.asarray(y)[:num_to_plot]
    yhat_shown = np.asarray(yhat)[:num_to_plot]

    fig, ax = plt.subplots(figsize=figsize)
    ax.scatter(y_shown, yhat_shown)
    ax.plot(lims, lims)  # y = x reference: perfect predictions fall on this line
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    # Label the axes so the figure can be read on its own
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    ax.set_title(f'{name}: {score:0.5f}', fontsize=18)
    plt.show()

# Display names and estimators, kept in matching order.
model_names = ["Dummy Median", "Linear",  "Lasso", "Random Forest"]

models = [
    DummyRegressor(strategy='median'),
    # The linear models fit an intercept: scikit-learn only expects
    # fit_intercept=False when the data is already centered, and an earlier
    # cell notes that this data is not.
    LinearRegression(fit_intercept=True),
    Lasso(fit_intercept=True),
    # Fixed seed so the forest, and its plotted score, is reproducible.
    RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)]

# Fit each model on the training split and plot its validation predictions.
for name, model in zip(model_names, models):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    plot_results(name, y_test, y_pred)

# RandomForest 

In [None]:
# Refit on every labelled row (`train` holds features only once `target` has
# been popped). The fixed seed makes the submission file reproducible.
model = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
model.fit(train, target)

# The predictions are assigned to the `target` column by position.
# NOTE(review): assumes sample_submission.csv lists ids in the same order as
# test.csv; confirm.
submission['target'] = model.predict(test)
submission.to_csv('random_forest.csv')