In [39]:
import pandas as pd
import numpy as np
import random
import math
import matplotlib.pyplot as plt

# Cost function

In [40]:
#Ridge Regularization
def cost_function(X, Y, W ,lamda):
    #Cost function J=(1/2m)*[(sum(pow(hypothesis(xi) - (yi)),2)) + (lamda*sum(pow(wi,2))) ]#hypothesis=X.T*W as X=data.T
    m = len(Y)
    reg_penalty=(lamda)*np.sum(np.square(W))
    J=0
    for i in range(0,m):
        x=X[i]
        y=Y[i]
        hypothesis=x.dot(W)
        J+=(pow(hypothesis-y,2))
    J+=reg_penalty
    J/=(2*m)
    return J

# Gradient descent

In [41]:
def gradient_descent(X,Y,B,lamda,alpha,prev_cost):
    #formula: Bj = Bj - alpha*(1/m)*((hypothesis(x)-y)*xj) - lamda/m*Bj
    m=len(Y)
    for j in range(0,len(B)):
        sum=0
        gradient=1
        for i in range(0,m):
            x=X[i]
            y=Y[i]
            hypothesis=x.dot(B)
            sum+=(hypothesis-y)*x[j]
        reg_penalty=lamda/m*B[j]
        gradient=sum/m
        #update Bj
        if( j == 0):
            B[j]-=(alpha*gradient)
        else:
            B[j]-=(alpha*gradient) + reg_penalty
    cost=cost_function(X,Y,B,lamda)
    #print(prev_cost,cost)
    if(abs(prev_cost - cost) <  0.000001):
        return B,cost
    else:
        B,cost=gradient_descent(X,Y,B,lamda,alpha,cost)
    return B,cost

# Prediction

In [42]:
def predict(x,B):
    ans=x.dot(B)
    return ans

In [43]:
def cal_error(X,Y,B):
    actual_list=[]
    predicted_list=[]
    m=len(Y)
    MSE=0
    MAE=0
    MPE=0
    for i in range(0,len(X)):
        actual=Y[i]
        predicted=predict(X[i],B)
        MSE+=pow((actual-predicted),2)
        MAE+=abs(actual-predicted)
        MPE+=MAE/actual*1    
    MSE/=m
    MAE/=m
    MPE/=m
    return MSE,MAE,MPE

In [44]:
def cal_MSE(X,Y,B):
    m=len(Y)
    MSE=0
    for i in range(0,len(X)):
        actual=Y[i]
        predicted=predict(X[i],B)
        MSE+=pow((actual-predicted),2)    
    MSE/=m
    return MSE

In [45]:
def cal_MSE_loocv(x,y,B):
    MSE=0
    actual=y
    predicted=predict(x,B)
    MSE+=pow((actual-predicted),2)    
    return MSE

# Linear Regression Function

In [46]:
def linear_reg_error(train_X,train_Y,B,lamda,alpha,valid_X,valid_Y):
    
    init_J=cost_function(train_X,train_Y,B,lamda)
    
    B,final_J=gradient_descent(train_X,train_Y,B,lamda,alpha,init_J)
    #print("Final_cost- ",final_J)
    #print(B)
    
    MSE=cal_MSE(valid_X,valid_Y,B)
    return MSE

#  Cross Validation

In [47]:
def kfold_cross_val_split(df,k):
    indices=df.index.tolist()
    rows=df.shape[0]
    if( rows % k != 0):
        print(k," doesnot divide data correctly Data size-",rows)
        
    fold_size= int(df.shape[0]/k)
    folds_list=[]

    for i in range(k-1):
        indices=df.index.tolist()
        random.seed(0)
        random_indices=random.sample(population=indices,k=fold_size)
        folds_list.append(df.loc[random_indices])
        df=df.drop(random_indices)
    folds_list.append(df)
    
    return folds_list

In [48]:
def run_cross_val(folds_list,label,alpha,lamda):
    k=len(folds_list)
    no_coefficients=8
    train_df=pd.DataFrame()
    valid_df=pd.DataFrame()
    error_list={}    #{error:B}
    for i in range(k):
        train_df=pd.DataFrame()
        valid_df=pd.DataFrame()
        for j in range(k):
            if( j == i):
                valid_df=folds_list[j]
            else:
                train_df=pd.concat([train_df, folds_list[j]])
        train_X,train_Y=to_arrays(train_df,label)
        valid_X,valid_Y=to_arrays(valid_df,label)
        B = np.zeros(no_coefficients)
        error=linear_reg_error(train_X,train_Y,B,lamda,alpha,valid_X,valid_Y)
        error_list[error]=B.tolist()
    return error_list

In [49]:
def run_loocv(df,label,alpha,lamda):
    k=len(df)
    no_coefficients=8
    train_df=pd.DataFrame()
    valid_df=pd.DataFrame()
    error_list={}    #{error:B}
    for i in range(k):
        train_df=pd.DataFrame()
        valid_df=pd.DataFrame()
        for j in range(k):
            if( j == i):
                valid_df=df.loc[i]
                train_df=df.drop(i)
        train_X,train_Y=to_arrays_loocv(train_df,label)
        valid_X=valid_df[:8]
        valid_Y=valid_df[8]
        B = np.zeros(no_coefficients)
        init_J=cost_function(train_X,train_Y,B,lamda)
        B,final_J=gradient_descent(train_X,train_Y,B,lamda,alpha,init_J)
        error=cal_MSE_loocv(valid_X,valid_Y,B)
        error_list[error]=B.tolist()
    return error_list

In [50]:
# #Plot graph between k and error
def plot_k_error(df,label,alpha,lamda,total_X,total_Y):
    no_coefficients=8
    k_list=[]
    y_list=[]
    list=[3, 5, 9, 10]
    for k in list:
        print("k value ",k)
        k_list.append(k)
        B = np.zeros(no_coefficients)
        folds_list=kfold_cross_val_split(df,k)
        error_list = run_cross_val(folds_list,label,alpha,lamda)
        B=sort_error(error_list)
        error=cal_MSE(total_X,total_Y,B)
        print("MSE Error ",error)
        print()
        y_list.append(error)
    #Plot Graph
    area=np.pi
    plt.xlabel("k")
    plt.ylabel("Error")

    plt.plot(k_list, y_list)
    plt.show()
    
    return

# Leave-One-Out Cross Validation

In [51]:
def loocv(df,total_X,total_Y,label,alpha,lamda):
    no_coefficients=8 
    B = np.zeros(no_coefficients)
    error_list = run_loocv(df,label,alpha,lamda)
    B=sort_error(error_list)
    error=cal_MSE(total_X,total_Y,B)/2
    return error

# Helper functions

In [52]:
#Creating X(data.T),Y arrays from df
def to_arrays(df,label):
    no_rows,no_cols=df.shape
    header=list(df.columns)
    data_array=np.ones((no_cols-1,no_rows))
    X0=np.ones(no_rows)                      #X0=1
    for i in range(0,no_cols-1):             #Removing first col-serial no , last col-label
        if(i == 0):
            data_array[i]=X0
        else:
            x=df[header[i]].values
            #data_array[i]=x
            data_array[i]=mean_normalize(x)
    X=data_array.T        #Doing Transpose

    #Y (output) array
    Y=np.array(df[label].values)
    
    return X,Y

In [53]:
#Creating X(data.T),Y arrays from df
def to_arrays_loocv(df,label):
    no_rows,no_cols=df.shape
    header=list(df.columns)
    data_array=np.ones((no_cols-1,no_rows))
    X0=np.ones(no_rows)                      #X0=1
    for i in range(0,no_cols-1):             #Removing first col-serial no , last col-label
        if(i == 0):
            data_array[i]=X0
        else:
            x=df[header[i]].values
            data_array[i]=x
            #data_array[i]=mean_normalize(x)
    X=data_array.T        #Doing Transpose

    #Y (output) array
    Y=np.array(df[label].values)
    
    return X,Y

In [54]:
def mean_normalize(x):
    x_new=np.ones(len(x))
    mean=np.mean(x)
    std=np.std(x)
    for i in range(0,len(x)):
        x_new[i]=(x[i]-mean)/(std)
    return x_new

In [55]:
def sort_error(error_list):
    B=error_list[sorted(error_list)[0]]
    return B

# Main

In [56]:
def main():
    file="AdmissionDataset/data.csv"
    label='Chance of Admit '
    no_coefficients=8  #[B0,B1,B2,...,B7] also 7 columns but X0=1
    
    df=pd.read_csv(file)
    header=list(df.columns)
    
    total_X,total_Y=to_arrays(df,label)
    
    #B(beta) array of coefficients
    #Initialized to zero
    B = np.zeros(no_coefficients)

    alpha = 0.003
    lamda = 0.01
    k=5
        
    print("Running for k=5")
    folds_list = kfold_cross_val_split(df,k)
    error_list = run_cross_val(folds_list,label,alpha,lamda)
    B=sort_error(error_list)
    print("Best parameters choosen for B")
    print(B)

    print("Testing for Error on entire dataset")
    MSE,MAE,MPE=cal_error(total_X,total_Y,B)
    print("Mean Square Error-",MSE)
    print("Mean Absolute Error-",MAE)
    print("Mean Absolute Percentage Error-",MPE)
    
    
    #Plot between lamda and error
    print("\nGraph between k and respective errors")
    plot_k_error(df,label,alpha,lamda,total_X,total_Y)
    
    #Leave-One-Out Cross Validation
    print("\nUsing Leave-One-Out Cross Validation ")
    error=loocv(df,total_X,total_Y,label,0.000003,lamda)
    print("MSE- ",error)
    
    return

In [57]:
if __name__ == "__main__":
    main()


Using Leave-One-Out Cross Validation 
MSE-  0.26950440460315483
