In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from pathlib import Path

import os
# Print the full path of every file found under the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, fname) for fname in filenames):
        print(full_path)
        
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
        
# Directory the competition CSV files are read from in the cells below
input_path = Path('/kaggle/input/tabular-playground-series-jan-2021/')

# Read in the data files

In [None]:
# Load the training set with the `id` column as the row index, then preview it
train = pd.read_csv(input_path.joinpath('train.csv'), index_col='id')
display(train.head())

In [None]:
# Load the test set with the `id` column as the row index, then preview it
test = pd.read_csv(input_path.joinpath('test.csv'), index_col='id')
display(test.head())

In [None]:
# Load the sample submission (indexed by `id`); its `target` column is overwritten later
submission = pd.read_csv(input_path.joinpath('sample_submission.csv'), index_col='id')
display(submission.head())

In [None]:
# Preview the first rows of the training frame
train.head()

In [None]:
# Shape of the training frame as (n_rows, n_columns)
train.shape

Train data contains 300000 rows, with 14 feature columns plus the `target` column.

In [None]:
# Count missing (null) values in each column of the training data
train.isnull().sum()

The dataset has no null values to deal with. 

In [None]:
# Count training rows whose column values exactly repeat an earlier row
# (the `id` index is not part of the comparison)
train.duplicated().sum()

No duplicates found in the dataset to deal with.

In [None]:
# Count missing (null) values in each column of the test data
test.isnull().sum()

The test data contains no missing values

In [None]:
# Shape of the test frame as (n_rows, n_columns)
test.shape

In [None]:
# Count test rows whose column values exactly repeat an earlier row
test.duplicated().sum()

Test data contains 200000 rows and 14 columns.

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame
train.info()

Train data contains no categorical variables and only real numbers.

In [None]:
# Summary statistics of the training data, transposed so that each row
# of the result describes one column of `train`
train.describe().T

# Glance at the distribution of variables in train and test set.

In [None]:
# Overlay the train and test distribution of every feature, one panel per feature.
# `errors='ignore'` keeps the cell runnable even after `target` has been popped from `train`.
feature_cols = train.drop(columns=['target'], errors='ignore').columns

# Size the grid from the number of features instead of hard-coding 7x2.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(feature_cols) // n_grid_cols))  # ceiling division
figure, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(15, 30), squeeze=False)

for panel, col in zip(ax.flat, feature_cols):
    # `sns.distplot` is deprecated; a density histogram with a KDE overlay is its
    # documented replacement. `bins=50` mirrors distplot's cap on the bin count.
    sns.histplot(x=train[col], color='blue', label='train',
                 stat='density', kde=True, bins=50, alpha=0.4, ax=panel)
    sns.histplot(x=test[col], color='red', label='test',
                 stat='density', kde=True, bins=50, alpha=0.4, ax=panel)
    panel.set_xlabel(col, fontsize=9)
    panel.legend()

# Hide any panel left over when the feature count does not fill the grid.
for panel in ax.flat[len(feature_cols):]:
    panel.set_visible(False)
plt.show()

##### The features follow multimodal distributions. A multimodal distribution indicates that the population is not normally distributed and that the sample contains several distinct patterns (sub-groups) rather than a single central tendency.

In [None]:
# visualize the relationship between variables
# A PairGrid draws one scatter panel per pair of columns, so plotting every row of
# the training frame is extremely slow. A fixed-seed random sample is plotted instead.
pair_sample = train.sample(n=min(len(train), 5000), random_state=42)
g = sns.PairGrid(pair_sample)
g.map(sns.scatterplot);


###### The data shows no linear relationship between variables. This leads us to check whether there is any correlation between the variables at all.

In [None]:
# Visualize the pairwise correlation between variables as an annotated heatmap
corr_matrix = train.corr()
plt.figure(figsize=(25, 25))
sns.heatmap(corr_matrix, annot=True, cmap='mako')

##### A few of the variables, shown in dark blue, are correlated. Most of the independent variables do not show any correlation with the target variable. This leads us to check whether there are outliers in the data.

In [None]:
# Box plots of every feature side by side, to spot outliers and compare spread
plt.figure(figsize=(20, 10))
features_only = train.drop(columns=['target'])
box_ax = sns.boxplot(data=features_only)
box_ax.set_title('The boxplot to study outliers')
box_ax.set_xlabel('Variables that predict the Target')
box_ax.set_ylabel('Values')

Except in cont7, there are no outliers in the data. In some of the features, such as cont2, cont4, cont13 and cont14, the data is more dispersed. In cont3, cont7 and cont9 the data is less dispersed. Both right-skewed and left-skewed distributions are seen.

## Pull out the target, and make a validation split

In [None]:
# Separate the label from the features. `pop` removes the column from `train` in
# place, so the guard lets this cell be re-run without raising KeyError once the
# column is already gone (the previously extracted `target` is then reused).
if 'target' in train.columns:
    target = train.pop('target')

# Hold out 40% of the rows for validation; the fixed seed makes the split,
# and every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, train_size=0.60, random_state=42)

# How well can we do with a completely naive model?

We'll want any of our models to do (hopefully much!) better than this.

In [None]:
# Let's get a benchmark score: always predict the median of the training target
model_dummy = DummyRegressor(strategy='median')
model_dummy.fit(X_train, y_train)
y_dummy = model_dummy.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in later releases.
score_dummy = np.sqrt(mean_squared_error(y_test, y_dummy))
print(f'{score_dummy:0.5f}') # 0.54118 (recorded from an earlier run; depends on the split)

# Simple Linear Regression

A simple linear regression doesn't do better than our dummy regressor!

In [None]:
# Simple Linear Regression
model_simple_linear = LinearRegression(fit_intercept=True) # data is not centered, fit intercept
model_simple_linear.fit(X_train, y_train)
y_simple_linear = model_simple_linear.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in later releases.
score_simple_linear = np.sqrt(mean_squared_error(y_test, y_simple_linear))
print(f'{score_simple_linear:0.5f}')

# Automate the process

In [None]:
def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
    """Scatter predicted against actual values and report the RMSE in the title.

    Parameters
    ----------
    name : str
        Model label shown in the plot title.
    y : array-like
        True target values.
    yhat : array-like
        Predicted target values, in the same order as ``y``.
    num_to_plot : int, default 10000
        Only the first ``num_to_plot`` points are drawn; the score uses all points.
    lims : tuple of (float, float), default (0, 12)
        Shared limits for both axes and the end points of the diagonal reference line.
    figsize : tuple of (float, float), default (6, 6)
        Figure size passed to ``plt.figure``.
    """
    # Convert to plain arrays so the slicing below is positional whatever the
    # input type is (e.g. a pandas Series carrying a shuffled integer index).
    y_true = np.asarray(y)
    y_hat = np.asarray(yhat)

    # RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
    # deprecated in scikit-learn 1.4 and removed in later releases.
    score = np.sqrt(mean_squared_error(y_true, y_hat))

    plt.figure(figsize=figsize)
    plt.scatter(y_true[:num_to_plot], y_hat[:num_to_plot])
    plt.plot(lims, lims)  # diagonal on which prediction equals actual
    plt.ylim(lims)
    plt.xlim(lims)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(f'{name}: {score:0.5f}', fontsize=18)
    plt.show()

## Implementing different ML models


`DummyRegressor` is a regressor that makes predictions using simple rules (here, always predicting the median of the training target). It is useful as a simple baseline to compare with other (real) regressors. We do not use it for real problems.
Linear regression is the most basic form, where the model is not penalized for its choice of weights at all. Lasso is a modification of linear regression in which the model is penalized for the sum of the absolute values of the weights.

In [None]:
# Candidate models, from the naive baseline up to a tree ensemble.
model_names = ["Dummy Median", "Linear",  "Lasso", "Random Forest"]

models = [
    DummyRegressor(strategy='median'),
    LinearRegression(fit_intercept=True),
    Lasso(fit_intercept=True),
    # Fixed seed so the forest, and therefore its validation score, is reproducible.
    RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)]


# Fit each model on the training split and plot its predictions on the held-out split.
for name, model in zip(model_names, models):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    plot_results(name, y_test, y_pred)

# It looks like RandomForest did the best. Let's train it on all the data and make a submission!

In [None]:
# Refit the random forest on every labelled row (`train` holds only feature columns
# here, since `target` was popped from it earlier). The fixed seed makes the
# submission reproducible.
model = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
model.fit(train, target)
# Select the test columns in the training column order so features line up by name.
submission['target'] = model.predict(test[train.columns])
submission.to_csv('random_forest.csv')