In [1]:
import numpy as np
import pandas as pd

In [2]:
def CV(df, CV_index, n_folds=10):
    '''
    Split df into the train and test parts of one cross-validation fold.

    The rows of df are partitioned, in their current order, into n_folds
    contiguous folds whose sizes differ by at most one: the first
    len(df) % n_folds folds receive one extra row.  For a 79-row frame and
    the default 10 folds this yields nine folds of 8 rows followed by one
    fold of 7 rows.  If len(df) < n_folds the trailing folds are empty.

    input: df: dataframe,
           CV_index: int in [0, n_folds - 1], the fold used as test set
           n_folds: int >= 1, number of folds (default 10)
    return train_dataframe, test_dataframe
    raises ValueError if n_folds < 1 or CV_index is outside [0, n_folds - 1]
    '''
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1, got %r" % (n_folds,))
    if not 0 <= CV_index < n_folds:
        raise ValueError(
            "CV_index must be in [0, %d], got %r" % (n_folds - 1, CV_index)
        )

    # Derive the fold boundaries from the frame length instead of assuming
    # a fixed fold size, so every row lands in exactly one test fold.
    base_size, n_larger = divmod(len(df), n_folds)
    start = CV_index * base_size + min(CV_index, n_larger)
    stop = start + base_size + (1 if CV_index < n_larger else 0)

    train = pd.concat([df.iloc[:start], df.iloc[stop:]])
    test = df.iloc[start:stop]
    return train, test

def normalEquation(X, y):
    """
    Fit ordinary least squares with an intercept term.

    A column of ones is prepended to X and the weights w minimising
    ||[1, X] w - y||^2 are returned.  This is the solution of the normal
    equation (X^T X) w = X^T y, but it is obtained with a least-squares
    solver rather than by explicitly inverting X^T X, which is numerically
    fragile and breaks down when the columns of X are collinear.  In the
    rank-deficient case the minimum-norm solution is returned.

    input: X: array-like of shape (n_samples, n_features)
           y: array-like of shape (n_samples,) or (n_samples, k)
    return: weights, intercept first, of shape (n_features + 1,) or
            (n_features + 1, k)
    """
    n_samples = X.shape[0]
    # Prepend the all-ones intercept column on the left of X.
    design = np.column_stack((np.ones([n_samples, 1]), X))
    weights, _residuals, _rank, _singular = np.linalg.lstsq(design, y, rcond=None)
    return weights

def model1_prediction(x_train, y_train, x_test):
    """
    Model 1: linear regression with intercept on all columns of x_train.

    The weights are fitted with normalEquation on (x_train, y_train) and
    then applied to x_test.
    return: predictions for the rows of x_test
    """
    weights = normalEquation(x_train, y_train)

    # The fitted weights start with the intercept, so the test matrix needs
    # the same leading column of ones.
    intercept = np.ones([x_test.shape[0], 1])
    design_test = np.column_stack((intercept, x_test))
    return design_test.dot(weights)

def model2_prediction(x_train, y_train, x_test):
    """
    Model 2: linear regression with intercept on the first two columns only.

    x_train and x_test are sliced positionally with .iloc, so both must be
    pandas DataFrames.  The weights are fitted with normalEquation and then
    applied to the same two columns of x_test.
    return: predictions for the rows of x_test
    """
    first_two = slice(0, 2)
    weights = normalEquation(x_train.iloc[:, first_two], y_train)

    # The fitted weights start with the intercept, so the test matrix needs
    # the same leading column of ones.
    n_test = x_test.shape[0]
    design_test = np.column_stack(
        (np.ones([n_test, 1]), x_test.iloc[:, first_two])
    )
    return design_test.dot(weights)

def SSE(y_pre, y_test):
    """
    Sum of squared errors between predictions and true targets.

    Computed as r^T r with r = y_test - y_pre.
    """
    residual = y_test - y_pre
    return np.dot(residual.T, residual)

def CV_par(train, test):
    """
    Separate features and target for a train/test pair.

    The column 'y' is the target; all remaining columns, in their original
    order, are the features.  The input frames are left untouched: the
    feature frames are new objects built with drop(), rather than the
    inputs with 'y' popped out of them.

    input: train, test: dataframes that contain a 'y' column
    return: x_train, y_train, x_test, y_test
    raises KeyError if 'y' is missing from either frame
    """
    y_train = train['y']
    x_train = train.drop(columns='y')
    y_test = test['y']
    x_test = test.drop(columns='y')
    return x_train, y_train, x_test, y_test

def SSE_CV_two_models(df):
    """
    10-fold cross-validated SSE of model 1 and model 2 on df.

    For each of the 10 folds produced by CV, both models are fitted on the
    training part and scored with SSE on the held-out part; the per-fold
    SSEs are then summed for each model.
    return: (total SSE of model 1, total SSE of model 2)
    """
    sse_model1, sse_model2 = [], []
    for fold in range(10):
        x_train, y_train, x_test, y_test = CV_par(*CV(df, fold))
        sse_model1.append(
            SSE(model1_prediction(x_train, y_train, x_test), y_test)
        )
        sse_model2.append(
            SSE(model2_prediction(x_train, y_train, x_test), y_test)
        )
    return sum(sse_model1), sum(sse_model2)

In [3]:
from sklearn.utils import shuffle

# Load the lab data; the frame must contain the target column 'y' that
# CV_par splits off.
df = pd.read_csv('./Lab2.csv')

# Cross-validated SSE of each model, one entry per reshuffle of the rows.
result = {}
result['model1'] = []
result['model2'] = []
for random_state in range(20):
    # NOTE: df is reassigned here, so each pass reshuffles the frame left by
    # the previous pass rather than the frame as loaded from disk.
    df = shuffle(df,random_state = random_state)
    model1_SSE_CV, model2_SSE_CV = SSE_CV_two_models(df)
    result['model1'].append(model1_SSE_CV)
    result['model2'].append(model2_SSE_CV)
    
# One row per shuffle, one column per model.
result_df = pd.DataFrame(result)

In [4]:
# Show the cross-validated SSE of both models for every shuffle.
result_df

Unnamed: 0,model1,model2
0,30258.761731,25564.57479
1,30622.26269,25499.294032
2,32913.307699,24872.283182
3,30430.251719,24847.655641
4,33384.185866,26367.432769
5,30500.246577,25618.971509
6,30949.884682,25856.402196
7,30631.122279,25566.046807
8,31173.865081,25506.907274
9,30654.278733,25563.191235


In [5]:
import matplotlib.pyplot as plt

In [6]:
# Average cross-validated SSE of each model over all shuffles (lower is better).
result_df.mean()

model1    30923.227719
model2    25685.081310
dtype: float64