In [16]:
import pandas as pd
import numpy as np
#lin reg b = (X'X)^(-1)X'Y
#split data into training and testing set
#normalize training data only (z-score(mu,std) or minmax)
#train it
#normalize test data by using parameters (e.g. mu and std) from step 2

#First split into train and test THEN fill NA by mean!!!
#or we can simply drop the rows with missing values...

filename = 'auto-mpg.data'
col_names =  ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 
                                            'model year', 'origin', 'car name']
X = pd.read_fwf(filename, names = col_names)
X = X.drop('car name', axis = 'columns') #dropping the features car name
X = X.sample(frac=1) #randomize the dataset

In [4]:
#loads data with the given column names and drops the last column
#it also converts dataframe to np arrays
#First split into train and test THEN fill NA by mean!!!
def impute_data(X):
    
    X = X.replace('?', np.NaN) #replaces '?' in the dataset with NaN
    X = X.astype(float) #converts the whole dataset to float
    
    cols_with_missing = [col for col in X if X[col].isna().any() ]
    X[cols_with_missing] = X[cols_with_missing].fillna(X[cols_with_missing].mean()) #fills NaN with mean of the feature horsepower
    
    #X = X.sample(frac=1)
    X = np.array(X) #converts the X dataframe to np.array for easier computation
    return X

In [5]:
#Writing splits as function, input N fold and experiment number
def N_Fold_CV(data, index):
    #data = full_data
    #select samples for training and testing 
    X_test = data.iloc[40*index: 40*(index+1),:] #398 data points, roughly 40 rows for testing
    rows_to_drop = list(X_test.index.values)
    X_train = data.drop(rows_to_drop)
    Y_test = X_test['mpg'] #mpg column
#    X_train = np.delete(data, slice(40*index, 40*(index+1)), axis = 0)
    Y_train = X_train['mpg'] #mpg column
    return X_train, Y_train, X_test, Y_test

In [6]:
#this function normalizes each column and saves the parameter for normalizing test
def normalize(data):
    param = []
    for i in range(data.shape[1]): #exclude the column with ones
        x = data[:,i].mean() #gets the mean of each column
        y = data[:,i].std() #gets the standard deviation of each column
        param.append((x,y)) 
        data[:,i] = (data[:,i] - x)/y
    return param, data

In [7]:
def normalize_test(data, train_param):
    for i in range(data.shape[1]):
        x = train_param[i][0] #gets the mean of each column
        y = train_param[i][1] #gets the standard deviation of each column
        data[:,i] = (data[:,i] - x)/y
    return data

In [8]:
#lin reg b = (X'X)^(-1)X'Y
def lin_reg(X,Y):
    a = np.matmul(X.transpose(),X) #X'X
    b = np.linalg.inv(a) #(X'X)^(-1)
    c = np.matmul(b,X.transpose()) #(X'X)^(-1)X'
    d = np.matmul(c,Y) #(X'X)^(-1)X'Y
    return d

In [9]:
def RMSE(X_test, Y_test, b):
    square = np.power(Y_test - np.dot(X_test, b),2)
    s = sum(square)
    return np.power(s, 1/2)

In [10]:
def compute_R2(test):
    test_mean = test.mean()
    SSTo = sum(np.power(test - test_mean,2))
    return np.power(SSTo,1/2)

In [19]:
for N in range(10):
    X_train, Y_train, X_test, Y_test = N_Fold_CV(X,N)
    X_train, X_test = impute_data(X_train), impute_data(X_test)
    Y_train, Y_test = np.array(Y_train), np.array(Y_test)
    train_param, X_train_norm = normalize(X_train)
    Y_train_norm = X_train_norm[:,0]
    X_train_norm = X_train_norm[:,1:]
    X_mat = np.insert(X_train_norm,0,1,axis=1)
    b = lin_reg(X_mat, Y_train_norm)
    X_test_norm = normalize_test(X_test, train_param)
    Y_test_norm = X_test_norm[:,0]
    X_test_norm = X_test_norm[:,1:]
    X_test_mat = np.insert(X_test_norm,0,1,axis=1)
    print(f'Fold ${N+1}$ & ${round(b[1],4)}$ & ${round(b[2],4)}$ & ${round(b[3],4)}$ & ${round(b[4],4)}$ & ${round(b[5],4)}$ & ${round(b[6],4)}$ & ${round(b[7],4)}$ & ${round(RMSE(X_test_mat, Y_test_norm, b),4)}$ \\\\\hline')
    #    print(f'Fold ${N+1}$ & $R^2 = {1 - np.power(RMSE(X_test_mat, Y_test_norm, b),2)/np.power(compute_R2(Y_test_norm),2)}$\\\\\hline')

Fold $1$ & $-0.0673$ & $0.2077$ & $-0.0498$ & $-0.7083$ & $0.0344$ & $0.3618$ & $0.1524$ & $2.6198$ \\\hline
Fold $2$ & $-0.0908$ & $0.294$ & $-0.0733$ & $-0.735$ & $0.0474$ & $0.3652$ & $0.1417$ & $2.418$ \\\hline
Fold $3$ & $-0.1362$ & $0.2816$ & $-0.0432$ & $-0.7501$ & $0.054$ & $0.3476$ & $0.112$ & $3.1042$ \\\hline
Fold $4$ & $-0.0732$ & $0.2216$ & $-0.0404$ & $-0.7319$ & $0.0362$ & $0.3616$ & $0.1426$ & $2.8051$ \\\hline
Fold $5$ & $-0.1091$ & $0.3113$ & $-0.0685$ & $-0.7433$ & $0.0293$ & $0.3672$ & $0.1506$ & $2.0094$ \\\hline
Fold $6$ & $-0.0496$ & $0.1368$ & $-0.0642$ & $-0.6664$ & $0.0086$ & $0.3583$ & $0.1486$ & $3.1907$ \\\hline
Fold $7$ & $-0.1101$ & $0.263$ & $-0.0266$ & $-0.7481$ & $0.038$ & $0.3577$ & $0.1547$ & $3.3534$ \\\hline
Fold $8$ & $-0.0727$ & $0.2286$ & $-0.0512$ & $-0.7238$ & $0.0393$ & $0.3619$ & $0.1427$ & $2.1895$ \\\hline
Fold $9$ & $-0.1141$ & $0.2971$ & $-0.0872$ & $-0.7292$ & $0.0303$ & $0.3483$ & $0.1554$ & $3.1671$ \\\hline
Fold $10$ & $-0.0901$ & $0