In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# All competition files live in the same read-only input folder.
DATA_DIR = '/kaggle/input/tabular-playground-series-jan-2021'

# Load `id` as the index in all three files rather than as a regular column.
# It is a row identifier: kept as a column it would be passed to the models as a
# feature and would also take part in the per-column outlier filter.
# NOTE(review): assumes train.csv and test.csv carry the same `id` column as
# sample_submission.csv — confirm against the data files.
train = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='id')
test = pd.read_csv(f'{DATA_DIR}/test.csv', index_col='id')
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv', index_col='id')

In [None]:
# Preview the first five rows of the raw training frame.
train.head(n=5)

In [None]:
# (rows, columns) of the training frame, before any rows are filtered out.
train.shape

In [None]:
# Preview the first five rows of the test frame.
test.head(n=5)

In [None]:
# Column names, dtypes and non-null counts of the test frame.
# info must be *called*: the bare attribute only echoes the bound method's repr
# instead of printing the summary.
test.info()

In [None]:
# Preview the first five rows of the sample submission (indexed by id).
sub.head(n=5)

In [None]:
# Remove outlier rows using per-column 1.5 * IQR fences.
# `train` still contains the `target` column at this point, so rows whose
# target value is outlying are dropped as well.
q1 = train.quantile(0.25)
q3 = train.quantile(0.75)
iqr = q3 - q1

# Selection: a row survives only if every column lies inside
# [q1 - 1.5*iqr, q3 + 1.5*iqr]. DataFrame.all(axis=1) is the vectorised
# equivalent of `mask.apply(all, axis=1)`, which ran Python's all() once per row.
mask = (train >= (q1 - 1.5*iqr)) & (train <= (q3 + 1.5*iqr))
train = train[mask.all(axis=1)]

In [None]:
# Summary of the filtered training frame (row count, dtypes, non-null counts).
# info must be *called*: the bare attribute only echoes the bound method's repr
# instead of printing the summary.
train.info()

In [None]:
# Separate the label from the features: keep the column as `target`, then
# delete it from `train` in place so that only feature columns remain.
# Gotcha: re-running this cell fails with KeyError because the column is gone.
target = train['target']
del train['target']

In [None]:
# First five label values.
target.head(n=5)

In [None]:
# First five rows of the feature frame, now without the `target` column.
train.head(n=5)

In [None]:
# Look at the sample submission again before filling it in.
sub.head(n=5)

In [None]:
# The placeholder `target` column of the sample submission.
sub.loc[:, 'target']

In [None]:
# Hold out 40% of the rows for validation.
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts; without it each run scores a different split.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, train_size=0.60, random_state=42
)

In [None]:
# Summary of the training split (row count, dtypes, non-null counts).
# info must be *called*: the bare attribute only echoes the bound method's repr
# instead of printing the summary.
X_train.info()

In [None]:
# Summary of the validation split (row count, dtypes, non-null counts).
# info must be *called*: the bare attribute only echoes the bound method's repr
# instead of printing the summary.
X_test.info()

In [None]:
# Labels of the training split produced by train_test_split.
y_train

In [None]:
# Labels of the held-out validation split produced by train_test_split.
y_test

In [None]:
# Plot predictions against the true values to see how a model behaves.

def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
    """Scatter predicted against actual targets and put the RMSE in the title.

    Parameters
    ----------
    name : str
        Model label, used as the prefix of the plot title.
    y : array-like
        True target values.
    yhat : array-like
        Predicted values, in the same order as ``y``.
    num_to_plot : int, default 10000
        Only the first ``num_to_plot`` points are drawn; the score is still
        computed on all of ``y`` / ``yhat``.
    lims : tuple, default (0, 12)
        (low, high) limits shared by both axes and by the diagonal reference line.
    figsize : tuple, default (6, 6)
        Figure size passed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    # RMSE as the square root of the MSE. The `squared=False` shortcut was
    # deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
    # raises TypeError; this form works on every version.
    score = mean_squared_error(y, yhat) ** 0.5
    plt.scatter(y[:num_to_plot], yhat[:num_to_plot])
    plt.plot(lims, lims)  # reference diagonal: perfect predictions fall on this line
    plt.ylim(lims)
    plt.xlim(lims)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(f'{name}: {score:0.5f}', fontsize=18)
    plt.show()

In [None]:
# Dummy Regressor: baseline that always predicts the median of the training labels.
model_dummy = DummyRegressor(strategy='median')
model_dummy.fit(X_train, y_train)
y_dummy = model_dummy.predict(X_test)
# RMSE as sqrt(MSE): `squared=False` was removed from mean_squared_error in
# scikit-learn 1.6 and raises TypeError there.
score_dummy = mean_squared_error(y_test, y_dummy) ** 0.5
# 0.54118 was the value noted here originally; it depends on the random split.
print(f'{score_dummy:0.5f}')

In [None]:
  # Predicted-vs-actual scatter for the median baseline.
  plot_results(name="Dummy", y=y_test, yhat=y_dummy)

In [None]:
# Linear Regression (ordinary least squares).
# The data is not centered, so the intercept must be fitted (the default).
# `fit_intercept=False` is meant for data that is already centered; using it
# here forced the fit through the origin.
model_simple_linear = LinearRegression()
model_simple_linear.fit(X_train, y_train)
y_simple_linear = model_simple_linear.predict(X_test)
# RMSE as sqrt(MSE): `squared=False` was removed from mean_squared_error in
# scikit-learn 1.6 and raises TypeError there.
score_simple_linear = mean_squared_error(y_test, y_simple_linear) ** 0.5
print(f'{score_simple_linear:0.5f}')

In [None]:
 # Predicted-vs-actual scatter for the linear regression.
 plot_results(name="Linear", y=y_test, yhat=y_simple_linear)

In [None]:
# Lasso (L1-regularised linear regression).
# The intercept is fitted (the default) because the data is not centered;
# `fit_intercept=False` is only appropriate for already-centered data.
# NOTE(review): alpha is left at its default; it may be too strong a penalty
# for these features — consider tuning it.
model_simple_lasso = Lasso()
model_simple_lasso.fit(X_train, y_train)
y_simple_lasso = model_simple_lasso.predict(X_test)
# RMSE as sqrt(MSE): `squared=False` was removed from mean_squared_error in
# scikit-learn 1.6 and raises TypeError there.
score_simple_lasso = mean_squared_error(y_test, y_simple_lasso) ** 0.5
print(f'{score_simple_lasso:0.5f}')

In [None]:
 # Predicted-vs-actual scatter for the Lasso model.
 plot_results(name="Lasso", y=y_test, yhat=y_simple_lasso)

In [None]:
# Random forest with 50 trees, trained on all available cores.
# random_state is fixed so the bootstrap samples and feature choices, and
# therefore the score, are the same on every run.
model_simple_randforest = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
model_simple_randforest.fit(X_train, y_train)
y_simple_randforest = model_simple_randforest.predict(X_test)
# RMSE as sqrt(MSE): `squared=False` was removed from mean_squared_error in
# scikit-learn 1.6 and raises TypeError there.
score_simple_randforest = mean_squared_error(y_test, y_simple_randforest) ** 0.5
print(f'{score_simple_randforest:0.5f}')

In [None]:
# Predicted-vs-actual scatter for the random forest.
plot_results(name="Rainforest", y=y_test, yhat=y_simple_randforest)

In [None]:
# Final model: refit the random forest on all remaining training rows (no
# hold-out) and write its test-set predictions to the submission file.

# random_state is fixed so the submitted predictions can be regenerated exactly.
model = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
model.fit(train, target)
# predict() returns a plain array, so values are assigned to `sub` by position.
sub['target'] = model.predict(test)
sub.to_csv('random_forest.csv')

In [None]:
# Check the first five rows of the submission.
sub.head(n=5)