In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn import linear_model, preprocessing, model_selection
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

![img](https://algotrading101.com/learn/wp-content/uploads/2020/06/training-validation-test-data-set.png)



In Machine Learning, the primary way we select the best model is through the use of test sets and cross-validation.

The idea of a test set works as follows:

- Take all of the available data and split it into two parts - a training set and a test set.
- Using only the training portion, you will estimate the parameters of several competing models.
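- Using only the test portion, you then evaluate how well each fitted model predicts data it has not seen, and select the model that performs best.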



# train_test_split

This function takes a list of arrays and splits each array into two arrays (a training set and a test set) by randomly selecting rows or values.

In [None]:
# X: predictor matrix with two columns (0..9 and 10..19)
# y: numeric output, for regression methods
# z: categorical output, for classification methods
X = np.arange(20).reshape(2, -1).transpose()
y = np.arange(10)
z = np.repeat([0, 1], 5)
print(X, y, z, sep="\n")

We can use train_test_split on each array individually.

It returns a tuple that can be unpacked into train and test arrays.

In [None]:
# Hold out a quarter of the rows of X; random_state pins the shuffle so the split is repeatable.
X_train, X_test = train_test_split(X, test_size=0.25, random_state=1)
print(X_train, X_test, sep="\n")

In [None]:
# Same split settings applied to y on its own.
y_train, y_test = train_test_split(y, test_size=0.25, random_state=1)
print(y_train, y_test, sep="\n")

We can also apply it to multiple arrays simultaneously.

In [None]:
# Splitting X and y in one call returns a train/test pair for each array.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)
print(X_train, X_test, y_train, y_test, sep="\n")

In [None]:
# With a categorical variable, the stratify argument ensures that each category
# is represented in appropriate numbers in both parts of the resulting split.
X_train, X_test, z_train, z_test = train_test_split(
    X, z, test_size=0.25, random_state=1, stratify=z
)
print(X_train, X_test, z_train, z_test, sep="\n")

# Iron Slag

The magnetic test is cheaper; the chemical test is more accurate.

Can we use the magnetic test to predict the chemical test result?

- X = magnetic test result (the predictor)
- y = chemical test result (the response)


In [None]:
# Load the iron slag data from a CSV file (path is relative to the working directory)
# and preview the first rows.
iron = pd.read_csv('ironslag.csv')
iron.head()

In [None]:
# (number of rows, number of columns) of the full dataset
iron.shape

## Plot of the full dataset

In [None]:
# Scatter of the full dataset; axes are labelled so the figure can be read on its own,
# and plt.show() keeps the artist repr out of the cell output.
plt.scatter(iron.magnetic, iron.chemical)
plt.xlabel("magnetic")
plt.ylabel("chemical")
plt.title("Iron slag: full dataset")
plt.show()

## Create a hold-out set using train-test split

In [None]:
# Hold out one fifth of the rows as the test set; random_state makes the split repeatable.
train, test = train_test_split(iron, test_size=0.2, random_state=1)

In [None]:
# Display the training portion of the hold-out split.
train

In [None]:
# Display the held-out test portion; it is not used while trying out models below.
test

## Plot of the training data

In [None]:
# Scatter of the training rows only; axes are labelled so the figure can be read on its own,
# and plt.show() keeps the artist repr out of the cell output.
plt.scatter(train.magnetic, train.chemical)
plt.xlabel("magnetic")
plt.ylabel("chemical")
plt.title("Iron slag: training data")
plt.show()

## We will use only the training data to try out possible models

In [None]:
# sklearn requires the predictor variables in a two dimensional array.
# reshape(-1, 1) asks for exactly one column; the -1 tells numpy to work out
# how many rows are needed.
X = train["magnetic"].to_numpy().reshape(-1, 1)
X.shape

In [None]:
# The response stays one dimensional: one chemical value per training row.
y = train["chemical"].to_numpy()
y.shape

In [None]:
# Correlation matrix of the two training columns; the off-diagonal entries are
# the correlation between magnetic and chemical.
np.corrcoef(train["magnetic"].to_numpy(), train["chemical"].to_numpy())

In [None]:
# r-squared: the square of the correlation between magnetic and chemical
np.square(
    np.corrcoef(train["magnetic"].to_numpy(), train["chemical"].to_numpy())[0, 1]
)

# Fit a linear model between x and y

In [None]:
# An unfitted ordinary least squares regression model.
linear = linear_model.LinearRegression()

In [None]:
# Fit on the training data only (X and y were both taken from `train`).
linear.fit(X, y)

In [None]:
# linear.score is the R^2 value:
# the proportion by which squared error is reduced when going from no model
# (just the variance of y) to the regression model.
# Note that it is computed here on the same data the model was fit to.
linear.score(X, y)

In [None]:
# Grid of predictor values 10, 11, ..., 39 as a single column, used to draw fitted curves.
x_predict = np.arange(10, 40)[:, np.newaxis]

In [None]:
# Predictions of the fitted linear model over the plotting grid.
lin_y_hat = linear.predict(x_predict)

In [None]:
# Training data with the fitted straight line in red; axes are labelled so the
# figure can be read on its own, and plt.show() keeps the artist repr out of the output.
plt.scatter(X, y)
plt.plot(x_predict, lin_y_hat, c = 'red')
plt.xlabel("magnetic")
plt.ylabel("chemical")
plt.title("Linear fit on the training data")
plt.show()

### Cross Validation

In [None]:
# ShuffleSplit draws n_splits independent random splits of the data: here 5 splits,
# each holding out 30% of the rows for validation (test_size = 0.3).
# The splits are drawn separately, so this is NOT a partition into 5 equal parts.
# cross_val_score then produces one score per split; their mean is printed last.
cv = model_selection.ShuffleSplit(n_splits = 5, test_size = 0.3, random_state=0)
cv_linear = model_selection.cross_val_score(linear, X, y, cv = cv)
print(cv_linear)
print(np.mean(cv_linear))

Technically, the above is all you need to do. But I went ahead and wrote this loop which fits the model on the training data, and makes predictions for the validation data. 

In each plot, the light blue dots are the training data.

The green dots are the validation data.

The flat green line is the mean of the validation data. That would be the prediction if no model was fit.

The red line is the linear model that was trained on the training data. We hope that the red line does a better job of predicting the green points than the green line. In some cases, it does not, and we actually get a negative cross-validation score.

In [None]:
# For every shuffle split: fit a straight line on the training cases, draw it against
# the training and the validation cases, and recompute the cross-validation score by hand.
for train_index, test_index in cv.split(X):
    # training and validation subsets for this split
    tX, ty = X[train_index, :], y[train_index]
    testX, testy = X[test_index, :], y[test_index]

    # initialize a new linear regression model and fit it only on the training cases
    clin = linear_model.LinearRegression()
    clin.fit(tX, ty)

    # the fitted line over the plotting grid is drawn in both figures, so predict it once
    line_y_hat = clin.predict(x_predict)

    # plot the training data in blue and the fitted line in red
    plt.scatter(tX, ty, c = 'blue', alpha = 0.5)
    plt.plot(x_predict, line_y_hat, c = 'red')
    plt.title("training cases")
    plt.show()

    # plot the validation cases in green against the fitted line;
    # the flat light-green line is their mean, i.e. the prediction with no model
    plt.scatter(testX, testy, c = 'green')
    plt.plot(x_predict, line_y_hat, c = 'red')
    plt.plot(x_predict, np.repeat(np.mean(testy), len(x_predict)), c = 'lightgreen')
    plt.title("validation cases")
    plt.show()

    # the MS of having no model = variance of the validation data
    mse = np.var(testy)
    print("MS Error: " + str(mse))

    # the MS regression = mean squared prediction error on the validation data
    msr = np.mean((testy - clin.predict(testX)) ** 2)
    print("MS Regre: " + str(msr))

    # the score is the proportion of reduction by having regression;
    # it is negative when the line predicts worse than the validation mean
    red = (mse - msr)/mse
    print("score: " + str(red))

# Polynomial fit - quadratic

In [None]:
# preprocessing.PolynomialFeatures builds polynomial terms of its input;
# the argument 2 is the degree, so this transformer is for a quadratic in X
poly2 = preprocessing.PolynomialFeatures(2)

In [None]:
# Fit the transformer on the training predictors and expand them into quadratic features.
polyX = poly2.fit_transform(X)

In [None]:
# Regression without a separate intercept term.
# NOTE(review): presumably because PolynomialFeatures already adds a constant column - confirm.
poly2reg = linear_model.LinearRegression(fit_intercept = False)

In [None]:
# Fit the quadratic model on the expanded training features.
poly2reg.fit(polyX, y)

In [None]:
# Score of the quadratic fit on the same data it was trained on.
poly2reg.score(polyX, y)

In [None]:
# poly2 was already fitted on the training predictors, so the plotting grid is only
# transformed here; re-fitting a transformer on data used for prediction is the wrong habit.
poly2_X_new = poly2.transform(x_predict)
poly2_y_hat = poly2reg.predict(poly2_X_new)

In [None]:
# Training data with the fitted quadratic curve in red; axes are labelled so the
# figure can be read on its own, and plt.show() keeps the artist repr out of the output.
plt.scatter(X, y)
plt.plot(x_predict, poly2_y_hat, c = 'red')
plt.xlabel("magnetic")
plt.ylabel("chemical")
plt.title("Quadratic fit on the training data")
plt.show()

In [None]:
# Cross-validation scores of the quadratic model on the same splits, followed by their mean.
cv_quad = model_selection.cross_val_score(poly2reg, polyX, y, cv=cv)
print(cv_quad, np.mean(cv_quad), sep="\n")

In [None]:
def polycv(degree, X, y, train_index, test_index):
    """Fit a polynomial regression on one cross-validation split and report on it.

    Shows two figures (training cases with the fitted curve, then validation cases
    with the fitted curve and their mean) and prints the validation variance, the
    validation mean squared error of the model, and the resulting score.

    Parameters
    ----------
    degree : degree passed to preprocessing.PolynomialFeatures.
    X : two dimensional predictor array, indexed by row.
    y : response array, indexed like the rows of X.
    train_index : indices of the rows used to fit the model.
    test_index : indices of the rows used to evaluate the model.

    Returns None. The fitted curve is drawn over the notebook-level `x_predict` grid,
    which must be defined before this function is called.
    """
    # subsets of the data for the training and the validation cases of this split
    tX, ty = X[train_index, :], y[train_index]
    testX, testy = X[test_index, :], y[test_index]

    # the transformer is fitted on the training cases only ...
    poly = preprocessing.PolynomialFeatures(degree)
    polytX = poly.fit_transform(tX)

    # initialize a new linear regression model and fit it only on the training cases
    clin = linear_model.LinearRegression()
    clin.fit(polytX, ty)

    # ... and merely applied (transform, not fit_transform) to anything else.
    # The curve appears in both figures, so it is predicted once.
    curve_y_hat = clin.predict(poly.transform(x_predict))

    # plot the training data in blue and the prediction line in red
    plt.scatter(tX, ty, c = 'blue', alpha = 0.5)
    plt.plot(x_predict, curve_y_hat, c = 'red')
    plt.title("degree " + str(degree) + ": training cases")
    plt.show()

    # plot the prediction line in red against the validation cases in green
    plt.scatter(testX, testy, c = 'green')
    plt.plot(x_predict, curve_y_hat, c = 'red')

    # plot the mean of the validation cases to show what having no model looks like
    plt.plot(x_predict, np.repeat(np.mean(testy), len(x_predict)), c = 'lightgreen')
    plt.title("degree " + str(degree) + ": validation cases")
    plt.show()

    # the MS of having no model = variance of the validation data
    mse = np.var(testy)
    print("MS Error: " + str(mse))

    # the MS regression = mean squared prediction error on the validation data
    msr = np.mean((testy - clin.predict(poly.transform(testX))) ** 2)
    print("MS Regre: " + str(msr))

    # the score is the proportion of reduction by having regression
    red = (mse - msr)/mse
    print("Score: " + str(red))

# Run the quadratic fit-and-report on every shuffle split.
for fold_train, fold_test in cv.split(X):
    polycv(2, X, y, train_index=fold_train, test_index=fold_test)

# cubic fit

In [None]:
# Polynomial feature transformer of degree 3, for the cubic fit.
poly3 = preprocessing.PolynomialFeatures(3)

In [None]:
# Expand the training predictors to cubic features, fit the regression on them,
# and print its score on that same training data.
poly3X = poly3.fit_transform(X)
poly3reg = linear_model.LinearRegression(fit_intercept=False)
poly3reg.fit(poly3X, y)
training_score = poly3reg.score(poly3X, y)
print(training_score)

The R^2 value of the cubic fit is better, but we will see with cross validation that it is not a better model. It is overfitting our data.

In [None]:
# poly3 was already fitted on the training predictors, so the plotting grid is only
# transformed here; re-fitting a transformer on data used for prediction is the wrong habit.
poly3_X_new = poly3.transform(x_predict)
poly3_y_hat = poly3reg.predict(poly3_X_new)

In [None]:
# Training data with the fitted cubic curve in red; axes are labelled so the
# figure can be read on its own, and plt.show() keeps the artist repr out of the output.
plt.scatter(X,y)
plt.plot(x_predict, poly3_y_hat, c = 'red')
plt.xlabel("magnetic")
plt.ylabel("chemical")
plt.title("Cubic fit on the training data")
plt.show()

In [None]:
# Cross-validation scores of the cubic model on the same splits, followed by their mean.
cv_cube = model_selection.cross_val_score(poly3reg, poly3X, y, cv=cv)
print(cv_cube, np.mean(cv_cube), sep="\n")

In [None]:
# Run the cubic fit-and-report on every shuffle split.
for fold_train, fold_test in cv.split(X):
    polycv(3, X, y, train_index=fold_train, test_index=fold_test)

## higher order polynomials overfit the data: degree 4

In [None]:
# degree 4: build the features and the model, then cross-validate on the same splits
poly4 = preprocessing.PolynomialFeatures(degree=4)
poly4reg = linear_model.LinearRegression(fit_intercept=False)
poly4X = poly4.fit_transform(X)

cv_4th = model_selection.cross_val_score(poly4reg, poly4X, y, cv=cv)
print(cv_4th, np.mean(cv_4th), sep="\n")

In [None]:
# Run the degree-4 fit-and-report on every shuffle split.
for fold_train, fold_test in cv.split(X):
    polycv(4, X, y, train_index=fold_train, test_index=fold_test)

## degree 5

In [None]:
# degree 5: build the features and the model, then cross-validate on the same splits
poly5 = preprocessing.PolynomialFeatures(degree=5)
poly5reg = linear_model.LinearRegression(fit_intercept=False)
poly5X = poly5.fit_transform(X)

cv_5th = model_selection.cross_val_score(poly5reg, poly5X, y, cv=cv)
print(cv_5th, np.mean(cv_5th), sep="\n")

In [None]:
# Run the degree-5 fit-and-report on every shuffle split.
for fold_train, fold_test in cv.split(X):
    polycv(5, X, y, train_index=fold_train, test_index=fold_test)

## Assessing models without all the graphs:



In [None]:
# Recreate and fit the straight-line model so this summary section starts from a fresh estimator.
linear = linear_model.LinearRegression()
linear.fit(X, y)

In [None]:
# ShuffleSplit draws n_splits independent random splits of the data: here 5 splits,
# each holding out 30% of the rows for validation (test_size = 0.3).
# The splits are drawn separately, so this is NOT a partition into 5 equal parts.
# cross_val_score then produces one score per split; their mean is printed last.
cv = model_selection.ShuffleSplit(n_splits = 5, test_size = 0.3, random_state=0)
cv_linear = model_selection.cross_val_score(linear, X, y, cv = cv)
print(cv_linear)
print(np.mean(cv_linear))

In [None]:
# Quadratic model: expand the training predictors to degree-2 features and fit on them.
quad = preprocessing.PolynomialFeatures(degree=2)
quad_model = linear_model.LinearRegression()
quadX = quad.fit_transform(X)
quad_model.fit(quadX, y)

In [None]:
# Cross-validation scores of the quadratic model, followed by their mean.
cv_quad = model_selection.cross_val_score(quad_model, quadX, y, cv=cv)
print(cv_quad, np.mean(cv_quad), sep="\n")

In [None]:
# Cubic model: expand the training predictors to degree-3 features and fit on them.
cube = preprocessing.PolynomialFeatures(degree=3)
cube_model = linear_model.LinearRegression()
cubeX = cube.fit_transform(X)
cube_model.fit(cubeX, y)

In [None]:
# Cross-validation scores of the cubic model, followed by their mean.
cv_cube = model_selection.cross_val_score(cube_model, cubeX, y, cv=cv)
print(cv_cube, np.mean(cv_cube), sep="\n")

## Y ~ logX model

In [None]:
# Y ~ log(X): take the natural log of the predictor, then fit a straight line to it.
log_transform = preprocessing.FunctionTransformer(func=np.log)
logX_model = linear_model.LinearRegression()
logX = log_transform.fit_transform(X)
logX_model.fit(logX, y)

In [None]:
# Cross-validation scores of the log-predictor model, followed by their mean.
cv_logX = model_selection.cross_val_score(logX_model, logX, y, cv=cv)
print(cv_logX, np.mean(cv_logX), sep="\n")