# 4 Pre-Processing and Training Data<a id='4_Pre-Processing_and_Training_Data'></a>

## 4.1 Contents

## 4.2 Introduction

## 4.3 Imports

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression


## 4.4 Load Data

In [3]:
# Load the two pickled training tables (`ms` and `us` section variants).
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
ms_sec = pd.read_pickle('../data/train_ms_section.pkl')
us_sec = pd.read_pickle('../data/train_us_section.pkl')

# Show the first five rows of each table, transposed so every feature is a row.
print(ms_sec.head().T, us_sec.head().T, sep='\n')

                 0          1          2          3          4
ttf_ms      1.3915     1.3925     1.3936     1.3947     1.3957
mean       4.58207    5.05908    4.91309    4.75464    4.66553
median           5          5          5          5          5
std        2.67334    3.18266    3.48529    6.27187    2.75112
iqr              3          4          4          7          4
...            ...        ...        ...        ...        ...
f_sec6    0.143069   0.260637   0.440955    0.53825   0.226379
f_sec7    0.119162   0.133923   0.171251   0.125679   0.147104
f_sec8   0.0647039  0.0645654  0.0446885  0.0287065   0.081306
f_sec9    0.110163   0.056461  0.0386167  0.0381988  0.0727754
f_sec10   0.084086  0.0514141  0.0432133  0.0164302  0.0890251

[69 rows x 5 columns]
                 0          1         2          3          4
ttf_us      1.3915     1.3915    1.3925     1.3925     1.3925
mean       4.80978    4.29705   5.52632    4.75028    4.89439
median           5          4      

## 4.6 Train/Test Split

In [17]:
# Split the predictor columns (everything after the first column) by naming convention:
#   * names starting with "f_"                                  -> f_features
#   * every other name, except the "freq" and "psd" columns     -> t_features
t_features = [
    name
    for name in us_sec.columns[1:]
    if not name.startswith("f_") and name not in ("freq", "psd")
]
f_features = [name for name in us_sec.columns[1:] if name.startswith("f_")]

# Target is the `ttf_us` column; only the "f_" features are used as predictors here.
y = us_sec["ttf_us"]
X = us_sec.loc[:, f_features]
X.head().T


Unnamed: 0,0,1,2,3,4
f_dom,49407110.0,70088640.0,47846890.0,65006500.0,56005600.0
f_sec1,0.06360166,0.0721552,4.7365740000000006e-33,0.03024468,0.01810273
f_sec2,0.04750125,0.04901716,0.0,0.03369577,0.03136409
f_sec3,0.06200338,0.09898938,0.0,0.09525854,0.09126262
f_sec4,0.06135241,0.1019314,0.0,0.1361151,0.151014
f_sec5,0.2546645,0.1444582,0.8855972,0.08801925,0.1097193
f_sec6,0.1408196,0.135233,0.0,0.1391692,0.3809139
f_sec7,0.1106803,0.04213827,0.0,0.2324192,0.07594299
f_sec8,0.05348384,0.1532,0.0,0.09430336,0.05037014
f_sec9,0.1230435,0.1149851,0.0,0.0420201,0.05258808


In [18]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=47,
)

In [19]:
# (rows, columns) of the training and test feature matrices.
X_train.shape, X_test.shape

((352, 11), (88, 11))

In [20]:
# Lengths of the training and test targets; they should match the row counts of X_train / X_test.
y_train.shape, y_test.shape

((352,), (88,))

## 4.7 Initial Not-Even-A-Model<a id='4.7_Initial_Not-Even-A-Model'></a>

A good place to start is to see how good the mean is as a predictor. In other words, what if you simply say your best guess is the average value of the target (`ttf_us`) in the training set?

In [26]:
# Mean of the training target; used below as the constant "prediction" of the baseline.
train_mean = y_train.mean()
train_mean

1.429944153409091

`sklearn`'s `DummyRegressor` easily does this:

In [27]:
# Fit a DummyRegressor with the 'mean' strategy: it ignores the features and
# always predicts the mean of the training target.
dumb_reg = DummyRegressor(strategy='mean')
dumb_reg.fit(X_train, y_train)

# Sanity check: the constant learned by the dummy model must equal the training mean.
assert np.allclose(dumb_reg.constant_, y_train.mean()), (
    "DummyRegressor constant does not match the mean of y_train"
)
dumb_reg.constant_

array([[1.42994415]])

In [35]:
# Baseline predictions: the training mean repeated once per row of each split.
y_tr_pred = np.full(len(y_train), train_mean)
y_te_pred = np.full(len(y_test), train_mean)

### 4.7.1 R-squared<a id='4.7.2.0.1_R-squared'></a>

In [36]:
# R^2 of the constant-mean baseline, reported as (train, test).
(
    r2_score(y_train, y_tr_pred),
    r2_score(y_test, y_te_pred),
)

(0.0, -0.02331881097419397)

##### 4.7.2.0.2 Mean absolute error<a id='4.7.2.0.2_Mean_absolute_error'></a>

In [37]:
# Mean absolute error of the constant-mean baseline, reported as (train, test).
(
    mean_absolute_error(y_train, y_tr_pred),
    mean_absolute_error(y_test, y_te_pred),
)

(0.019063018788739672, 0.021307122417355353)

##### 4.7.2.0.3 Mean squared error<a id='4.7.2.0.3_Mean_squared_error'></a>

In [38]:
# Mean squared error of the constant-mean baseline, reported as (train, test).
(
    mean_squared_error(y_train, y_tr_pred),
    mean_squared_error(y_test, y_te_pred),
)

(0.0004964496395276021, 0.0005533168989370142)

How good is this? How closely do these predictions match, or explain, the actual values? There are many ways of assessing how well one set of values agrees with another; the metrics computed above ($R^2$, mean absolute error and mean squared error) are three of them.

## 4.8 Initial Models

### 4.8.1 Random Forest Regression

#### 4.8.1.1 Define the pipeline<a id='4.10.1_Define_the_pipeline'></a>

In [21]:
# Two-step pipeline: standardise the features, then fit a random forest
# (seeded so repeated runs build the same forest).
RF_pipe = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=47))

#### 4.8.1.2 Fit and assess performance using cross-validation<a id='4.10.2_Fit_and_assess_performance_using_cross-validation'></a>

In [22]:
# 5-fold cross-validation of the random forest pipeline on the training split only.
# No `scoring` is passed, so the pipeline's own default score is used.
rf_default_cv_results = cross_validate(
    estimator=RF_pipe,
    X=X_train,
    y=y_train,
    cv=5,
)

In [23]:
# `cross_validate` returns a dict; 'test_score' holds one held-out score per fold
# (computed with the estimator's default scorer, since no `scoring` was passed).
rf_cv_scores = rf_default_cv_results['test_score']
rf_cv_scores

array([-0.17551192, -0.24740566, -0.17814721, -0.23774978, -0.2477249 ])

In [24]:
# Mean and spread of the per-fold scores.
# A negative R^2 means the model does worse than a constant mean prediction on the held-out folds.
rf_cv_scores.mean(), rf_cv_scores.std()

(-0.21730789651467836, 0.03325478191175324)

### 4.8.2 Linear Regression

In [41]:
# Two-step pipeline: standardise the features, then fit ordinary least squares.
LR_pipe = make_pipeline(StandardScaler(), LinearRegression())

In [42]:
# Fit the scaler and the linear model on the training split only.
LR_pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])

#### 4.8.2.3 Make predictions on the train and test sets<a id='4.8.2.3_Make_predictions_on_the_train_and_test_sets'></a>

In [43]:
# Linear-regression predictions for both splits.
# NOTE: these names overwrite the baseline predictions created in section 4.7.
y_tr_pred, y_te_pred = LR_pipe.predict(X_train), LR_pipe.predict(X_test)

#### 4.8.2.4 Assess performance<a id='4.8.2.4_Assess_performance'></a>

In [44]:
# R^2 of the linear regression pipeline, reported as (train, test).
(
    r2_score(y_train, y_tr_pred),
    r2_score(y_test, y_te_pred),
)

(0.033585889002502034, -0.020005953760202422)

In [46]:
# Mean absolute error of the linear regression pipeline, reported as (train, test).
(
    mean_absolute_error(y_train, y_tr_pred),
    mean_absolute_error(y_test, y_te_pred),
)

(0.01894592085395954, 0.021209013971501243)