<a href="https://colab.research.google.com/github/ralbu85/STML/blob/main/Ch5_Cross_Validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import dataset using pandas

In [None]:
import pandas as pd
# Read the Auto dataset; the path is relative, so Auto.csv must sit in the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Auto.csv')
df  # bare last expression -> rendered as the cell's output

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
392,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
393,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
394,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
395,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [None]:
import numpy as np
# '?' is used as a missing-value placeholder in the raw file: turn every '?'
# into NaN, then drop each row that contains at least one NaN.
df = df.replace('?', np.nan).dropna()

# A column that contained '?' was parsed as strings (object dtype) and stays
# string-typed even after the incomplete rows are gone. Presumably that column
# is `horsepower` in this dataset (TODO confirm against the CSV). Convert it to
# a real numeric dtype so the model and any pandas statistics see numbers, not
# strings. This is a no-op if the column is already numeric.
df = df.assign(horsepower=pd.to_numeric(df['horsepower']))

## Preprocess the dataset

In [None]:
# One-hot encode the categorical `origin` column into integer 0/1 indicator
# columns (origin_<value>); every other column is carried over unchanged.
df2 = pd.get_dummies(data=df, columns=['origin'], dtype=int)
df2

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,name,origin_1,origin_2,origin_3
0,18.0,8,307.0,130,3504,12.0,70,chevrolet chevelle malibu,1,0,0
1,15.0,8,350.0,165,3693,11.5,70,buick skylark 320,1,0,0
2,18.0,8,318.0,150,3436,11.0,70,plymouth satellite,1,0,0
3,16.0,8,304.0,150,3433,12.0,70,amc rebel sst,1,0,0
4,17.0,8,302.0,140,3449,10.5,70,ford torino,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
392,27.0,4,140.0,86,2790,15.6,82,ford mustang gl,1,0,0
393,44.0,4,97.0,52,2130,24.6,82,vw pickup,0,1,0
394,32.0,4,135.0,84,2295,11.6,82,dodge rampage,1,0,0
395,28.0,4,120.0,79,2625,18.6,82,ford ranger,1,0,0


## Define X, y

In [None]:
# Feature matrix: everything except the target (`mpg`) and the free-text
# `name` column.
X = df2.drop(['mpg', 'name'], axis=1)
# Target vector: the `mpg` column.
y = df2['mpg']

## Validation-set Approach

In [None]:
from sklearn.model_selection import train_test_split

# Validation-set approach: hold out 30% of the rows for testing.
# The split is shuffled, so a fixed seed is required for the reported scores
# to be reproducible across "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y,             # dataset to be split
    test_size=0.3,    # fraction of rows reserved for the test set
    shuffle=True,     # randomly shuffle the rows before splitting
    random_state=42,  # fixed seed -> same split on every run
)

## Model Learning using training dataset

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split only; the test split stays
# untouched until evaluation.
reg = LinearRegression()
reg.fit(X=X_train, y=y_train)

## Evaluation using Testing Dataset

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the held-out test split, then score those predictions against
# the true test targets.
y_pred = reg.predict(X_test)
mse, r2 = (
    mean_squared_error(y_test, y_pred),  # mean squared error on the test split
    r2_score(y_test, y_pred),            # R^2 on the test split
)

mse, r2

(8.019315827866517, 0.8383476029218481)

## K-fold Cross Validation

In [None]:
from sklearn.model_selection import KFold

# Data-splitting object for 3-fold cross-validation. Rows are shuffled before
# being assigned to folds, so the seed is fixed to make the fold membership
# (and the per-fold scores) reproducible from run to run.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

In [None]:
# Manual K-fold cross-validation: for every fold, train a fresh linear model
# on the training partition and score it on the held-out partition, keeping
# one MSE and one R^2 value per fold.
mse_result, r2_result = [], []
for train_idx, test_idx in kfold.split(X, y):
    # kfold.split yields positional row indices, hence .iloc
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # New estimator per fold, fitted on this fold's training partition only
    reg = LinearRegression().fit(X_train, y_train)

    # Score the fold on its held-out partition
    y_pred = reg.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Record this fold's metrics
    mse_result += [mse]
    r2_result += [r2]

In [None]:
# Show the per-fold metrics: (list of MSE values, list of R^2 values)
(mse_result, r2_result)

([7.251298804218264, 14.150515318309456, 12.668886268696571],
 [0.8676829385010586, 0.7984214843973536, 0.7780466274807164])

## K-fold cross-validation using scikit-learn

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
model = LinearRegression()  # model object
# Data-splitting object: 5 shuffled folds. The seed is fixed so that fold
# membership, and therefore every score below, is reproducible across runs.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# Metrics to compute for each fold. Note that 'neg_mean_squared_error' is the
# NEGATED MSE (scikit-learn scorers follow a "higher is better" convention).
metrics = ['neg_mean_squared_error', 'r2']

result = cross_validate(
    estimator=model,          # model to use
    X=X, y=y,                 # dataset to cross-validate on
    cv=kfold,                 # data-splitting object
    scoring=metrics,          # metrics evaluated on each fold
    return_train_score=True,  # also report the scores on the training folds
)

In [None]:
# Wrap the cross_validate output in a DataFrame so each metric can be selected
# by column name in the following cell.
result = pd.DataFrame(data=result)

In [None]:
# Average each test metric over the cross-validation folds.
test_r2 = result['test_r2'].mean()
# The 'neg_mean_squared_error' scorer stores the NEGATED MSE (so that larger is
# always better). Flip the sign so `test_mse` is the actual, non-negative mean
# squared error rather than a negative number.
test_mse = -result['test_neg_mean_squared_error'].mean()

test_r2, test_mse

(0.8043738380857605, -11.583831767816363)