# Tabular Playground Series - Jan 2021

**Regression** helps determine relationships among variables and predict outcomes from a new set of predictors.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, entries in os.walk('/kaggle/input'):
    for entry in entries:
        full_path = os.path.join(folder, entry)
        print(full_path)
        
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
        
# Directory from which the competition CSV files are read below.
input_path = Path('/kaggle/input/tabular-playground-series-jan-2021/')

In [None]:
import seaborn as sns
from sklearn.linear_model import ElasticNet

# Read the data   
There are 300,000 rows and 15 columns.   

In [None]:
# Load the sample submission file and preview its first three rows.
submission = pd.read_csv(input_path.joinpath('sample_submission.csv'))
display(submission.head(n=3))

In [None]:
# Load the training data with the 'id' column as the row index, then show it.
train = pd.read_csv(input_path.joinpath('train.csv'), index_col='id')
display(train)

In [None]:
# Visualize possible correlations between the columns of the training frame.
corr1 = train.corr()
sns.heatmap(data=corr1, annot=True, cmap="cubehelix")

In [None]:
# Load the test data with the 'id' column as the row index and preview three rows.
test = pd.read_csv(input_path.joinpath('test.csv'), index_col='id')
display(test.head(n=3))

In [None]:
# Visualize possible correlations between the columns of the test frame.
# The matrix is computed from `test` (loaded just above); computing it from
# `train` again would only duplicate the training heatmap.
corr2 = test.corr()
sns.heatmap(corr2, cmap="magma", annot=True)

In [None]:
# Show the column labels of the training frame.
train.columns

In [None]:
# Plot the test correlation frame: 'cont3' coefficients against 'cont13'
# coefficients, with a fitted regression line.
# x and y are passed by keyword because seaborn >= 0.12 rejects them as
# positional arguments.
sns.lmplot(x='cont13', y='cont3', data=corr2, logistic=False, markers=["*"])

In [None]:
# Plot the density of every column of the train correlation matrix,
# one stacked subplot per column on a shared x axis.
corr1.plot.density(subplots=True, layout=(15, 1), sharex=True, figsize=(10, 10))
plt.show()

# Starter Notebook naive model: Slightly modified   
## 1. Pull out the target, and make a validation split   

In [None]:
# Remove the 'target' column from the feature frame and keep it as the label.
target = train.pop('target')

# Hold out half of the rows for validation. No random_state is passed, so the
# split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    train_size=0.50,
)

In [None]:
# Get benchmark score: a baseline that always predicts the median of y_train.
model_dummy = DummyRegressor(strategy='median')
model_dummy.fit(X_train, y_train)
y_dummy = model_dummy.predict(X_test)
# RMSE is taken as sqrt(MSE); the `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6.
score_dummy = np.sqrt(mean_squared_error(y_test, y_dummy))
print(f'{score_dummy:0.5f}')

# Decreasing train_size from .60 to .50 increased the scores slightly.

## 2. Simple Linear Regression

In [None]:
# Simple Linear Regression fitted without an intercept term.
# NOTE(review): the original rationale was "data is not centered, don't fit
# intercept"; uncentered data normally calls for an intercept, so confirm that
# fit_intercept=False is intentional.
model_simple_linear = LinearRegression(fit_intercept=False)
model_simple_linear.fit(X_train, y_train)
y_simple_linear = model_simple_linear.predict(X_test)
# RMSE is taken as sqrt(MSE); the `squared=False` shortcut is deprecated in
# scikit-learn 1.4 and removed in 1.6.
score_simple_linear = np.sqrt(mean_squared_error(y_test, y_simple_linear))
print(f'{score_simple_linear:0.5f}')

In [None]:
#automated model, visualization, regression
def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
    """Scatter-plot predictions against true values, titled with the RMSE.

    Args:
        name: Model label shown in the plot title.
        y: True target values.
        yhat: Predicted values, in the same order as ``y``.
        num_to_plot: Number of leading points drawn in the scatter plot.
        lims: ``(low, high)`` limits applied to both axes and used as the
            end points of the diagonal reference line.
        figsize: Figure size passed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    # The score uses all points, not just the plotted ones. RMSE is taken as
    # sqrt(MSE) because `squared=False` is deprecated in scikit-learn 1.4 and
    # removed in 1.6.
    score = np.sqrt(mean_squared_error(y, yhat))
    plt.scatter(y[:num_to_plot], yhat[:num_to_plot], color="dimgray")
    # Diagonal on which a perfect prediction would fall.
    plt.plot(lims, lims)
    plt.ylim(lims)
    plt.xlim(lims)
    plt.title(f'{name}: {score:0.5f}', fontsize=14)
    plt.show()

# Candidate regressors keyed by the label used in each plot title.
candidates = {
    "Dummy Median": DummyRegressor(strategy='median'),
    "Linear": LinearRegression(fit_intercept=False),
    "Lasso": Lasso(fit_intercept=False),
    "Random Forest": RandomForestRegressor(n_estimators=50, n_jobs=-1),
}
model_names = list(candidates)
models = list(candidates.values())

# Fit each candidate on the training split and plot its hold-out predictions.
for name, model in candidates.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    plot_results(name, y_test, y_pred)

## 3. Random Forest model trained on all the data, with the edited Starter Notebook submission

In [None]:
# Refit a random forest on the full training frame (no hold-out) and predict
# the test set.
model = RandomForestRegressor(n_estimators=50, n_jobs=-1)
model.fit(train, target)
submission['target'] = model.predict(test)
# index=False: the submission frame was read without an index column, so the
# default to_csv would write its RangeIndex as an extra unnamed column.
submission.to_csv('random_forest.csv', index=False)

# Other models: Ridge and ElasticNet

In [None]:
#automated models: other models
def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
    """Scatter-plot predictions against true values, titled with the RMSE.

    Args:
        name: Model label shown in the plot title.
        y: True target values.
        yhat: Predicted values, in the same order as ``y``.
        num_to_plot: Number of leading points drawn in the scatter plot.
        lims: ``(low, high)`` limits applied to both axes and used as the
            end points of the diagonal reference line.
        figsize: Figure size passed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    # The score uses all points, not just the plotted ones. RMSE is taken as
    # sqrt(MSE) because `squared=False` is deprecated in scikit-learn 1.4 and
    # removed in 1.6.
    score = np.sqrt(mean_squared_error(y, yhat))
    plt.scatter(y[:num_to_plot], yhat[:num_to_plot], color="goldenrod")
    # Diagonal on which a perfect prediction would fall.
    plt.plot(lims, lims)
    plt.ylim(lims)
    plt.xlim(lims)
    plt.title(f'{name}: {score:0.5f}', fontsize=16)
    plt.show()

# Two regularized linear models paired with the labels used in the plot titles.
labelled_models = [
    ("Ridge Regression", Ridge(alpha = 0.0001)),
    ("ElasticNet", ElasticNet(alpha=1)),
]
model_names = [label for label, _ in labelled_models]
models = [estimator for _, estimator in labelled_models]

# Fit each model on the training split and plot its hold-out predictions.
for name, model in labelled_models:
    y_pred = model.fit(X_train, y_train).predict(X_test)
    plot_results(name, y_test, y_pred)

## A new model

In [None]:
from sklearn.linear_model import ElasticNetCV
# Re-split the data with a larger training share (0.8 instead of 0.50).
X_train, X_test, y_train, y_test = train_test_split(
    train,
    target,
    train_size=0.8,
)

# Increasing train_size from .50 to .80 increased the score slightly.

In [None]:
#automated and visualized
def plot_results(name, y, yhat, num_to_plot=10000, lims=(0,12), figsize=(6,6)):
    """Scatter-plot predictions against true values, titled with the RMSE.

    Args:
        name: Model label shown in the plot title.
        y: True target values.
        yhat: Predicted values, in the same order as ``y``.
        num_to_plot: Number of leading points drawn in the scatter plot.
        lims: ``(low, high)`` limits applied to both axes and used as the
            end points of the diagonal reference line.
        figsize: Figure size passed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    # The score uses all points, not just the plotted ones. RMSE is taken as
    # sqrt(MSE) because `squared=False` is deprecated in scikit-learn 1.4 and
    # removed in 1.6.
    score = np.sqrt(mean_squared_error(y, yhat))
    plt.scatter(y[:num_to_plot], yhat[:num_to_plot], color="darkgreen")
    # Diagonal on which a perfect prediction would fall.
    plt.plot(lims, lims)
    plt.ylim(lims)
    plt.xlim(lims)
    plt.title(f'{name}: {score:0.5f}', fontsize=14)
    plt.show()

model_name = ["ElasticNet"]

# `normalize` is not passed: False was its default, and the argument was
# removed from ElasticNet in scikit-learn 1.2 (passing it raises TypeError).
# The list has its own name so the loop variable `model` no longer shadows it.
models = [ElasticNet(alpha=1)]

# Fit on the current training split and plot the hold-out predictions.
# `model` and `y_pred` stay bound after the loop for the cells that follow.
for name, model in zip(model_name, models):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    plot_results(name, y_test, y_pred)

In [None]:
# Refit ElasticNet on the full training frame (no hold-out) and predict the test set.
# `normalize` is not passed: False was its default, and the argument was
# removed from ElasticNet in scikit-learn 1.2.
model = ElasticNet(alpha=1)
model.fit(train, target)

# Write the predictions into the 'target' column of a copy of the submission
# frame. A separate 'target1' column would leave the earlier random-forest
# values in 'target' in the output file.
elasticnet_submission = submission.copy()
elasticnet_submission['target'] = model.predict(test)
# index=False: the submission frame was read without an index column, so the
# default to_csv would write its RangeIndex as an extra unnamed column.
elasticnet_submission.to_csv('ElasticNet.csv', index=False)

In [None]:
# Table of the hold-out targets next to the most recent `y_pred` values.
df1 = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
df1

In [None]:
# Prepare for final submission:
# turn the index into a regular column so it can be selected by name.
df1 = df1.reset_index()

In [None]:
# Build the final submission from predictions on the test set.
# df1 holds hold-out rows taken from the training data, so its ids are
# training ids and cannot be submitted. `model` is the estimator most recently
# fitted on the full training frame.
df2 = pd.DataFrame({'id': test.index, 'target': model.predict(test)})
# index=False keeps the file to the two columns 'id' and 'target'.
df2.to_csv('MySubmission.csv', index=False)