# Capstone Project - Google Predictive Analytics

## Part_7: Modeling_Comparison

### Comparing models listed below and their perfomances
- LinearRegression
- DecisionTreeRegressor
- ExtraTreesRegressor
- RandomForestRegressor
- AdaBoostRegressor
- GradientBoostingRegressor
- Simple Vector Machine 

In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')
%matplotlib inline

In [3]:
df = pd.read_csv('../capstone_data/kaggle_data/df_dummies_tables.csv')

In [4]:
df.shape

(903653, 506)

#### Train_test_split and model with various Regression

In [5]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [7]:
ss = StandardScaler()
ss.fit(df)

X_s = ss.transform(df)

X_s

array([[-0.01152441, -0.1359686 , -0.4336323 , ...,  0.5341481 ,
        -0.40559985, -0.03229096],
       [-0.01152441, -0.1359686 , -0.4336323 , ...,  0.5341481 ,
        -0.40559985, -0.03229096],
       [-0.01152441, -0.1359686 , -0.4336323 , ...,  0.5341481 ,
        -0.40559985, -0.03229096],
       ...,
       [-0.01152441, -0.1359686 , -0.4336323 , ...,  0.5341481 ,
         2.44137601, -0.03229096],
       [-0.01152441, -0.1359686 , -0.4336323 , ...,  0.5341481 ,
         2.58372481, -0.03229096],
       [-0.01152441, -0.1359686 , -0.4336323 , ...,  0.5341481 ,
         3.86486395, -0.03229096]])

In [8]:
y = df['transactionRevenue']
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: transactionRevenue, dtype: float64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_s, y, random_state=42)

In [10]:
lr = LinearRegression()
dtr = DecisionTreeRegressor()
etr = ExtraTreesRegressor()
rfr = RandomForestRegressor()
abr = AdaBoostRegressor()
gbr = GradientBoostingRegressor()
svr = SVR()

list_of_models = [lr, dtr, etr, rfr, abr, gbr, svr]

In [None]:
for each in list_of_models:
    print(str(each).split('(')[0])
    each.fit(X_train, y_train)
    print('Training score:')
    print(each.score(X_train, y_train))
    print('Testing score:')
    print(each.score(X_test, y_test))
    print()

LinearRegression
Training score:
1.0
Testing score:
-220.0415012211023

DecisionTreeRegressor
Training score:
0.9999999999991275
Testing score:
0.9504939651705513

ExtraTreesRegressor
Training score:
0.999999999998394
Testing score:
0.9991338361395896

RandomForestRegressor
Training score:
0.9770270964310994
Testing score:
0.9732557033553639

AdaBoostRegressor
Training score:
0.9494835426723149
Testing score:
0.8933452358517872

GradientBoostingRegressor
Training score:
0.9999393721053543
Testing score:
0.9759920054035719

SVR


In [None]:
gbr.fit(X_train, y_train)

In [None]:
gbr.score(X_train, y_train), gbr.score(X_test, y_test)

In [None]:
from sklearn.metrics import mean_squared_error


def str_from_model(model):
    return str(model).split('(')[0]

for each in list_of_models:
    print('>', str_from_model(each))
    print('Training RMSE:')
    print(mean_squared_error(y_train, each.predict(X_train)) ** 0.5)
    print (mean_squared_error(y_test, each.predict(X_test)) ** 0.5)
    print('\n**********\n')

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
print(mean_squared_error(y_train, gbr.predict(X_train)) ** 0.5)
print (mean_squared_error(y_test, gbr.predict(X_test)) ** 0.5)  