Selçuk Eşkil

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the homework data set from the CSV file next to the notebook and
# display it as the cell output.
dataset = pd.read_csv(filepath_or_buffer='HW1DataSet.csv')
dataset

Unnamed: 0,x,y
0,-4.809264,-92.911272
1,-4.722680,-98.506658
2,-3.852326,-63.212666
3,-3.687668,-51.036538
4,-3.602674,-54.784580
...,...,...
95,6.890595,39.282364
96,7.118811,49.384561
97,7.178848,51.564875
98,7.942389,85.676290


In [2]:
# Target as a 1-D array; the single feature is reshaped into one column
# (n_samples, 1) because the estimators below are fitted on a 2-D X.
y = dataset['y'].values
x = dataset['x'].values.reshape(-1, 1)

from sklearn.model_selection import train_test_split #splitting dataset to train and test
# Hold out 20% of the rows as the test set. The split is seeded (same seed as
# the KFold splitter used later) so that Restart & Run All reproduces the
# same partition and therefore the same errors.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

## Leave One Out

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

def create_polynomial_regression_model(degree):
  """Fit a polynomial regression of the given degree and score one CV split.

  The data of the split is not passed in as arguments: the function reads
  the notebook-level globals ``X_trainv`` / ``Y_trainv`` (fitting part) and
  ``X_valid`` / ``Y_valid`` (held-out part). The caller must assign all four
  before every call.

  Parameters
  ----------
  degree : int
      Highest power of the input generated by ``PolynomialFeatures``.

  Returns
  -------
  tuple
      ``(mse_train, mse_valid, degree)`` - mean squared error on the fitting
      part, mean squared error on the held-out part, and the degree that
      was passed in.
  """

  poly_features = PolynomialFeatures(degree=degree, include_bias=False)

  # Expand the fitting features to x, x^2, ..., x^degree.
  X_train_poly = poly_features.fit_transform(X_trainv)

  # Ordinary least squares on the expanded features.
  poly_model = LinearRegression()
  poly_model.fit(X_train_poly, Y_trainv)

  # Predictions on the data the model was fitted on.
  y_train_predicted = poly_model.predict(X_train_poly)

  # Held-out data is only transformed with the already-fitted expander;
  # the transformer is never re-fitted on validation data.
  y_valid_predict = poly_model.predict(poly_features.transform(X_valid))

  mse_train = mean_squared_error(Y_trainv, y_train_predicted)
  mse_valid = mean_squared_error(Y_valid, y_valid_predict)

  return (mse_train, mse_valid, degree)

In [4]:
from sklearn.model_selection import LeaveOneOut
# Leave-one-out cross-validation on the training split: for every polynomial
# degree 1..7, average the validation MSE over all single-sample hold-outs.
loo = LeaveOneOut()
LeaveOneOutCV = []
for j in range(1, 8):
    LeaveOneOutErrors = []
    for train_index, validation_index in loo.split(X_train):
        # create_polynomial_regression_model reads these four names as globals,
        # so they have to be (re)assigned before each call.
        X_trainv, X_valid = X_train[train_index], X_train[validation_index]
        Y_trainv, Y_valid = Y_train[train_index], Y_train[validation_index]
        LeaveOneOutErrors.append(create_polynomial_regression_model(j))
    # Build the per-degree frame once, after all hold-outs are collected,
    # instead of rebuilding it from the growing list on every iteration.
    df = pd.DataFrame(LeaveOneOutErrors, columns=['Training Error', 'Validation Error', 'Degree'])
    LeaveOneOutCV.append([df['Validation Error'].mean(), j])
LeaveOneOutCV_df = pd.DataFrame(LeaveOneOutCV, columns=['Validation Error', 'Degree'])
LeaveOneOutCV_df

Unnamed: 0,Validation Error,Degree
0,519.870758,1
1,614.15885,2
2,10.598922,3
3,12.860215,4
4,23.111653,5
5,40.380581,6
6,32.905439,7


The 3rd-degree polynomial regression has the smallest LOOCV mean squared error (about 10.60), so degree 3 is selected.

In [5]:
#Refitting model on the whole training set with the 3rd degree of polynomial

poly_features = PolynomialFeatures(degree=3, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train)
poly_model = LinearRegression()
poly_model.fit(X_train_poly, Y_train)
# The test features are only transformed with the expander fitted on the
# training data; the transformer is not re-fitted on held-out data.
X_test_poly = poly_features.transform(X_test)
Y_test_predict = poly_model.predict(X_test_poly) #predicting Y values from X test values
mse_test = mean_squared_error(Y_test, Y_test_predict) #calculating MSE on the test set
print(mse_test)

7.094673500014041


## K-Folds

In [6]:
from sklearn.model_selection import KFold

#trying different degrees of polynomial functions with 5-fold cross-validation

cv = KFold(n_splits=5, random_state=42, shuffle=True)
CVErrors = []
for train_index, validation_index in cv.split(X_train):
    # create_polynomial_regression_model reads these four names as globals,
    # so they are assigned once per fold and reused for every degree.
    X_trainv, X_valid = X_train[train_index], X_train[validation_index]
    Y_trainv, Y_valid = Y_train[train_index], Y_train[validation_index]
    for j in range(1, 8):
        CVErrors.append(create_polynomial_regression_model(j))

# One row per (fold, degree). Built once after the loops instead of being
# rebuilt from the growing list on every iteration.
df = pd.DataFrame(CVErrors, columns=['Training Error', 'Validation Error', 'Degree'])

In [7]:
# Average the per-fold errors for each polynomial degree, then turn the
# 'Degree' group key back into an ordinary column for display.
kfoldCV_by_degree = df.groupby("Degree").mean().reset_index()
kfoldCV_by_degree[['Degree', 'Validation Error']]

Unnamed: 0,Degree,Validation Error
0,1,531.757578
1,2,743.103141
2,3,10.109716
3,4,11.492268
4,5,20.802261
5,6,51.951595
6,7,65.79165


The 3rd-degree polynomial regression has the smallest 5-fold cross-validation mean squared error (about 10.11), so degree 3 is selected.

In [8]:
#Refitting model on the whole training set with the 3rd degree of polynomial

poly_features = PolynomialFeatures(degree=3, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train)
poly_model = LinearRegression()
poly_model.fit(X_train_poly, Y_train)
# The test features are only transformed with the expander fitted on the
# training data; the transformer is not re-fitted on held-out data.
X_test_poly = poly_features.transform(X_test)
Y_test_predict = poly_model.predict(X_test_poly) #predicting Y values from X test values
mse_test = mean_squared_error(Y_test, Y_test_predict) #calculating MSE on the test set
print(mse_test)

7.094673500014041


In [9]:
# Coefficient of determination (R^2) of the refitted model on the test set.
r2 = r2_score(y_true=Y_test, y_pred=Y_test_predict)
r2

0.9271219482334372

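LOOCV and 5-fold cross-validation select the same polynomial degree: both choose degree 3.
Because the same degree is refitted on the same training set, the final model and its test MSE (about 7.09) are identical in both cases.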