In [1]:
import random

import matplotlib.pyplot as plt
import numpy as np  # np.average is used in the cross-validation cell; it was not imported explicitly before
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.utils import shuffle

from model import *

## Read data and preprocess

In [2]:
# Load the tab-separated airfoil file with explicit column names, then shuffle
# the rows with a fixed seed and rebuild a clean 0..n-1 index.
column_names = [
    'frequency',
    'angle_of_attack',
    'chord_length',
    'velocity',
    'thickness',
    'pressure_level',
]
data = shuffle(
    pd.read_csv('airfoil_self_noise.dat', sep='\t', names=column_names),
    random_state=100,
).reset_index(drop=True)
data.head()

Unnamed: 0,frequency,angle_of_attack,chord_length,velocity,thickness,pressure_level
0,400,9.9,0.1524,31.7,0.025278,128.939
1,1600,2.7,0.1524,71.3,0.002439,130.644
2,800,15.6,0.1016,71.3,0.043726,124.188
3,4000,9.9,0.1524,71.3,0.0193,115.079
4,20000,0.0,0.2286,71.3,0.002143,114.474


In [3]:
# Scale the input columns and split into train/test by an 80/20 ratio.
# With its default arguments, preprocessing.normalize rescales each row
# (sample) to unit L2 norm; it does not standardize individual columns.
# NOTE(review): confirm per-sample scaling is what is wanted here rather than
# per-feature scaling.
feature_matrix = data.iloc[:, :-1].values  # every column except the last one (the target)
X = preprocessing.normalize(feature_matrix)
y = data['pressure_level'].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

# K-fold cross validation to test performance
- We will choose among the following models: linear regression, or polynomial regression of degree 2, 4, or 8.
- We will run k-fold cross-validation on the training set and compare the errors to select the model, then retrain the selected model on the entire training set and predict on the held-out test set.

In [4]:
# Perform KFold cross validation to choose the model.
# For every candidate degree: fit on four folds of the training set, score on
# the remaining fold, and average the per-fold errors. The held-out test set
# is never touched in this cell.
# LinearRegressionExplicit and regression_error are not defined in this
# notebook; presumably they come from `from model import *`, and the labels
# printed below treat regression_error as an MSE -- TODO confirm.
kf = KFold(n_splits=5)

# Models (degree 1 = normal linear regression)
degrees = [1,2,4,8]
print("Performing 5-fold cross validation across 4 different models:")
average_test_errors = []
average_train_errors = []
for degree in degrees:
    train_errors = []
    test_errors = []
    # Start KFold for current degree of polynomial
    for train_index, val_index in kf.split(X_train):
        # Split to train and validation for this fold
        X_train_kfold, X_val = X_train[train_index], X_train[val_index]
        y_train_kfold, y_val = y_train[train_index], y_train[val_index]

        # Create polynomial features
        if degree != 1:
            poly_reg = PolynomialFeatures(degree=degree)
            X_train_poly = poly_reg.fit_transform(X_train_kfold)
            # Reuse the transformer fitted on the training fold; held-out data
            # must only be transformed, never used to fit a preprocessing step.
            X_val_poly = poly_reg.transform(X_val)
        else:
            # Degree 1 feeds the features through unchanged (no expansion step).
            X_train_poly = X_train_kfold
            X_val_poly = X_val

        # Train LR
        lr = LinearRegressionExplicit()
        lr.fit(X_train_poly,y_train_kfold)

        # Predict and error
        train_pred = lr.predict(X_train_poly)
        val_pred = lr.predict(X_val_poly)

        # Save the error of this fold
        train_errors.append(regression_error(train_pred,y_train_kfold))
        test_errors.append(regression_error(val_pred,y_val))

    # Average the error across the 5 folds for this degree
    average_test_errors.append(np.average(test_errors))
    average_train_errors.append(np.average(train_errors))

# Report one summary per candidate model, in the same order as `degrees`.
for degree, average_train_error, average_test_error in zip(degrees, average_train_errors, average_test_errors):
    if degree == 1:
        print("Linear Regression Model:")
    else:
        print(f"Polynomial Model with Degree {degree}:")
    print(f"\tAverage training MSE: {average_train_error}")
    print(f"\tAverage testing MSE: {average_test_error}")

Performing 5-fold cross validation across 4 different models:
Linear Regression Model:
	Average training MSE: 36.570072184033386
	Average testing MSE: 37.09543505449684
Polynomial Model with Degree 2:
	Average training MSE: 27.41044943694407
	Average testing MSE: 28.67781185993119
Polynomial Model with Degree 4:
	Average training MSE: 24.529072930700657
	Average testing MSE: 28.144484707152678
Polynomial Model with Degree 8:
	Average training MSE: 22.519528963256285
	Average testing MSE: 32.710681513385396


In [5]:
# Train on whole training set and predict on the held out test set.
# Degree 4 is used because it had the lowest average validation error in the
# 5-fold cross validation output recorded above.
best_degree = 4
poly_reg = PolynomialFeatures(degree=best_degree)
X_train_poly = poly_reg.fit_transform(X_train)
# Only transform the test set with the transformer fitted on the training
# data; the test set must not be used to fit any preprocessing step.
X_test_poly = poly_reg.transform(X_test)

# Train LR on the polynomial features
lr = LinearRegressionExplicit()
lr.fit(X_train_poly,y_train)

pred = lr.predict(X_test_poly)
print(f"MSE on the held out test set: {regression_error(pred,y_test)}")

MSE on the held out test set: 30.78141511898778
