# Model selection

In this notebook we'll try different model and choose the best on cross-validation

List of models:
- Linear regression
- Random Forest
- XGBoost
- CatBoost

For this task we will use RMSE metric

In [173]:
import pandas as pd
import numpy as np
import sklearn.model_selection as model_selection
import pickle
import scipy.sparse

import sklearn.linear_model as linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [69]:
df_train = scipy.sparse.load_npz("../data/train_sparse.npz")
df_test = scipy.sparse.load_npz("../data/test_sparse.npz")
y_train = pd.read_csv("../data/y_train.csv", index_col=0)
y_test = pd.read_csv("../data/y_test.csv", index_col=0)
with open("../artefacts/dv.pkl", "rb") as file_pkl:
    dv = pickle.load(file_pkl)

# Let's try simple LinearRegression

In [71]:
model = linear_model.LinearRegression()

In [72]:
model.fit(df_train, y_train)

In [73]:
print(f"train error {mean_squared_error(model.predict(df_train), y_train)**0.5}")
print(f"test error {mean_squared_error(model.predict(df_test), y_test)**0.5}")


train error 1.3638562419387175
test error 1.6411370816974096


Very good score, I even think, that's too good result, let's look carefully

### First row

In [95]:
print(f"prediction {model.predict(df_train[0,:])}")
print(f"y {y_train.iloc[0]}")

prediction [[53658.08802239]]
y car_purchase_amount    53655.53859
Name: 131, dtype: float64


In [162]:
model.intercept_

array([44385.16724741])

In [103]:
pd.Series(model.coef_[0], index=dv.feature_names_)

age                                           6705.239000
agriculture                                      0.149742
annual_salary                                 6554.524683
arable_                                         -0.016466
area_sq_mi                                       0.092878
birthrate                                       -0.310494
climate                                          0.087227
coastline_coastarea_ratio                        0.026812
credit_card_debt                                 0.004555
crops_                                           0.076205
customer_email_suffix=.ca                       -0.021120
customer_email_suffix=.co.uk                     0.152051
customer_email_suffix=.com                      -0.026218
customer_email_suffix=.edu                      -0.085972
customer_email_suffix=.net                      -0.001229
customer_email_suffix=.org                      -0.017512
deathrate                                       -0.044697
gdp__per_capit

So, there is three very correlated features, that gives us very good perfomance.  

I think our date is syntatic, so that's why we have such results.

# Try to use simple linear combindation of three best featues

In [140]:
print(dv.feature_names_.index("age"))
print(dv.feature_names_.index("annual_salary"))
print(dv.feature_names_.index("net_worth"))

0
2
23


In [184]:
def linear_predict(x):
    y = 44385 + x[0] * 6705.239000 + x[2] * 6554.524683 + x[23] * 4972.698404
    return (y)

In [185]:
first_row = df_train.toarray()[0,:]

In [186]:
model.predict([first_row])

array([[53658.08802239]])

In [187]:
linear_predict(first_row)

53657.35394271994

In [188]:
y_train.iloc[0]

car_purchase_amount    53655.53859
Name: 131, dtype: float64

In [189]:
linear_pred_train = np.apply_along_axis(linear_predict, 1, df_train.toarray())
linear_pred_test = np.apply_along_axis(linear_predict, 1, df_test.toarray())

In [190]:
print(f"train error {mean_squared_error(linear_pred_train, y_train)**0.5}")
print(f"test error {mean_squared_error(linear_pred_test, y_test)**0.5}")

train error 1.5028203917565794
test error 1.7293862266939801


# Try random forest

In [128]:
forest = RandomForestRegressor(n_estimators=100)

In [135]:
forest.fit(df_train, y_train.values[:,0])

In [137]:
print(f"train error {mean_squared_error(forest.predict(df_train), y_train)**0.5}")
print(f"test error {mean_squared_error(forest.predict(df_test), y_test)**0.5}")


train error 1284.188323055143
test error 4624.180238987961


In [136]:
print(f"prediction {forest.predict(df_train[0,:])}")
print(f"y {y_train.iloc[0]}")

prediction [54268.574009]
y car_purchase_amount    53655.53859
Name: 131, dtype: float64


In [139]:
pd.Series(forest.feature_importances_, index=dv.feature_names_)

age                                           0.451002
agriculture                                   0.003406
annual_salary                                 0.301226
arable_                                       0.003279
area_sq_mi                                    0.003130
birthrate                                     0.002504
climate                                       0.001255
coastline_coastarea_ratio                     0.004129
credit_card_debt                              0.004436
crops_                                        0.004274
customer_email_suffix=.ca                     0.000379
customer_email_suffix=.co.uk                  0.000886
customer_email_suffix=.com                    0.000706
customer_email_suffix=.edu                    0.000763
customer_email_suffix=.net                    0.000429
customer_email_suffix=.org                    0.000286
deathrate                                     0.005816
gdp__per_capita                               0.002040
gender    