In [2]:
########################################################################
# Python 3.6
########################################################################

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

##### Linear regression #######

def load_data():
    """Assume dataset is a matrix without column names.
    First column should be a target value and the rest ones are features.
    Each element of it is a float or integer"""
    filename = input('Enter the data file path  (e.g.) dataharris.dat : ')
    format_select = input('Select the data coding format 1: [a b c] or 2: [a,b,c] or 3: [a  b  c]:  (e.g.) 1 : ')
    format_set = { 1:' ', 2:',', 3:'  '}
    data = np.genfromtxt(filename, delimiter=format_set[int(format_select)])
    return data

def x_y(data):
    num_y = input('Enter the column number which holds the response variable (Column number starts from 0) : ')
    y = data[:, int(num_y)]
    newdata = np.delete(data, int(num_y), axis=1)
    x = np.ones(data.shape)
    x[:, 1:] = newdata
    return(x, y)

def coefficients(x,y):
    xTx = x.T.dot(x)
    xTx_inverse = np.linalg.inv(xTx) # to find inverse matrix using numpy.linearAlgebra
    xTy = x.T.dot(y)
    beta = xTx_inverse.dot(xTy)
    return beta

def fitted_values(x, beta):
    return x.dot(beta)

def model_summary(data, y, fitted_values):
    SSE = sum((y - fitted_values)**2)
    SST = sum((y - y.mean())**2)
    r_square = 1- SSE/SST
    MSE = SSE/(data.shape[0] - data.shape[1])
    return r_square, MSE


def generate_output():
    data = load_data()
    x, y = x_y(data)
    beta = coefficients(x, y)
    y_hat = fitted_values(x, beta)
    r_square, MSE = model_summary(data, y, y_hat)

    f = open("output_py.txt", 'w')
    print("This is regression analysis", file=f)
    print("n", file=f)
    print("Coefficientsn--------------", file=f)
    print("Constant: ", beta[0], file=f)
    for i in range(x.shape[1] - 1):
        print("beta" + str(i + 1), ":", beta[i + 1], file=f)
    print("n", file=f)
    print(pd.DataFrame({'Actual values': y, 'Fitted values': y_hat}), file=f)
    print("n", file=f)
    print("Model Summaryn--------------", file=f)
    print("R-square = ", r_square, file=f)
    print("MSE = ", MSE, file=f)
    f.close()
    print("output_py.txt is generated!")

analysis = input('Choose between 1: regression and 2: classification (e.g.)2 : ')
if analysis == '1': generate_output() 


##### Linear/Quadratic/Regularized Dscrimination Analysis(LDA, QDA, RDA) #######

def load_data_class():
    """Assume dataset is a matrix without column names.
    Last column should be a target value(categorical) and the rest ones are features."""
    filename_train = input('Enter the training data file path  (e.g.)dataveh.dat : ')
    filename_test = input('Enter the test data file path  (e.g.)datavehtest.dat : ')
    format_select = input('Select the data coding format 1: [a b c] or 2: [a,b,c]  (e.g.)2 : ')
    format_set = { 1:' ', 2:','}
    data = np.genfromtxt(filename_train, delimiter=format_set[int(format_select)])
    test_data = np.genfromtxt(filename_test, delimiter=format_set[int(format_select)])
    return data, test_data

data, test_data = load_data_class()

#common hyperparameter
TOTAL_SIZE = data.shape[0]
TOTAL_SIZE_TEST = test_data.shape[0]
PARAMETER_DIM = data.shape[1]-1
LABELS = np.unique(data[:,-1])
STEPSIZE = 0.05 #RDA

#initial set for iteration
groupsize_set = []
mean_set = []
covariance_set = []
S_pooled = np.zeros((PARAMETER_DIM, PARAMETER_DIM)) #LDA, RDA
cov_RDA = [] #RDA

X_data = data[:,:-1]

# generate estimators
for label in LABELS:
    group_data = X_data[data[:,-1] == label]

    # group size and data dimension
    n = group_data.shape[0]
    groupsize_set.append(n)

    # sample mean
    x_bar = np.mean(group_data, axis=0)
    mean_set.append(x_bar)

    # covariance matrix
    S = np.zeros((PARAMETER_DIM, PARAMETER_DIM))
    for i in range(n):
        z = (group_data[i,:] - x_bar) * np.vstack(group_data[i,:] - x_bar)
        S = S + z         
    S_pooled = S_pooled + S
    S = S / (n-1)
    covariance_set.append(S)
    
S_pooled = S_pooled/(TOTAL_SIZE - PARAMETER_DIM)
OVERALL_VAR = np.mean(S_pooled.diagonal())


def LinearDiscriminantFunction(x):   
    LDA_values = []
    
    for i in range(len(mean_set)):
        arg1 = np.matmul(np.matmul(mean_set[i], np.linalg.inv(S_pooled)), x)
        arg2 = np.matmul(np.matmul(mean_set[i], np.linalg.inv(S_pooled)), mean_set[i].T) 
        LDA_values.append(arg1 - arg2/2)
    
    prediction = LABELS[np.array(LDA_values).argmax()].astype(int)
    
    return prediction


def QuadraticDiscriminantFunction(x):   
    QDA_values = []
    
    for i in range(len(mean_set)):
        arg1 = np.log(np.linalg.det(covariance_set[i]))
        arg2 = np.matmul(np.matmul((x-mean_set[i]), np.linalg.inv(covariance_set[i])), (x-mean_set[i]).T) 
        QDA_values.append(- arg1/2 - arg2/2)
    
    prediction = LABELS[np.array(QDA_values).argmax()].astype(int)
    
    return prediction


def RegularizedDiscriminantFunction(x, alpha, gamma):
    RDA_values = []
    
    for i in range(len(mean_set)):
        cov_RDA = alpha*covariance_set[i] + (1 - alpha)*(gamma*S_pooled + (1 - gamma)*OVERALL_VAR*np.identity(PARAMETER_DIM)) 
        arg1 = np.log(np.linalg.det(cov_RDA))
        arg2 = np.matmul(np.matmul((x-mean_set[i]), np.linalg.inv(cov_RDA)), (x-mean_set[i]).T) 
        RDA_values.append(- arg1/2 - arg2/2)

    prediction = LABELS[np.array(RDA_values).argmax()].astype(int)
    
    return prediction


def find_optimal_parameter(method = RegularizedDiscriminantFunction):
    a_g_accuracy_set = []
    for a in np.arange(0., 1.001, STEPSIZE):
        for g in np.arange(0., 1.001, STEPSIZE):
            accrcy = sum(test_data[:,-1] == [method(x, alpha = a, gamma = g) for x in test_data[:,:-1]])/TOTAL_SIZE_TEST 
            a_g_accuracy_set.append([a, g, accrcy])
    max_index = np.argmax(np.array(a_g_accuracy_set)[:,2])
    return (a_g_accuracy_set, a_g_accuracy_set[max_index])


fnc_name = input('Choose one among LDA, QDA, and RDA. Default is LDA. (e.g.)LDA : ')
if fnc_name == 'QDA':
    method = QuadraticDiscriminantFunction
elif fnc_name == 'LDA':
    method = LinearDiscriminantFunction
elif fnc_name == 'RDA':
    method = RegularizedDiscriminantFunction
    alpha_gamma_accuracy, optimalval = find_optimal_parameter()
else:
    print("You provided a wrong input. Run file again.")
        

df = pd.DataFrame({'Actual class' : data[:,-1].astype(int)}) # resubstitution
df_test = pd.DataFrame({'Actual class' : test_data[:,-1].astype(int)}) # test

if fnc_name == 'QDA' or fnc_name == 'LDA':
    df['Resub prediction'] = [method(x) for x in X_data]
    df_test['Test prediction'] = [method(x) for x in test_data[:,:-1]]
elif fnc_name == 'RDA':
    df['Resub prediction'] = [method(x, alpha = optimalval[0], gamma = optimalval[1]) for x in X_data]
    df_test['Test prediction'] = [method(x, alpha = optimalval[0], gamma = optimalval[1]) for x in test_data[:,:-1]]    

confusion_table = pd.crosstab(index=df['Actual class'], columns=df['Resub prediction'])
overall_accuracy = sum(confusion_table.loc[i,i] for i in confusion_table)/TOTAL_SIZE

confusion_table_test = pd.crosstab(index=df_test['Actual class'], columns=df_test['Test prediction'])
overall_accuracy_test = sum(confusion_table_test.loc[i,i] for i in confusion_table_test)/TOTAL_SIZE_TEST

      
def generate_output_class():
    f = open("output_py.txt", 'w')
    print("This is classification analysis", file=f)
    print("The method used here is", fnc_name, "n--------------", file=f)
    print("n", file=f)
    print(df, file=f)
    print("n", file=f)
    print("Confusion Matrix (Resubstitution)n--------------", file=f)
    print(confusion_table, file=f)
    print("n", file=f)
    print("Model Summary (Resubstitution)n--------------", file=f)
    print("Overall accuracy = ",overall_accuracy, file=f)
    print("n", file=f)
    
    print(df_test, file=f)
    print("n", file=f)
    print("Confusion Matrix (Test)n--------------", file=f)
    print(confusion_table_test, file=f)
    print("n", file=f)
    print("Model Summary (Test)n--------------", file=f)
    print("Overall accuracy = ",overall_accuracy_test, file=f)
    print("n", file=f)    
    f.close()
    print("output_py.txt is generated!")

    
def draw_plot():
    print("The method used here is", fnc_name)
    print("The test error is minimized near alpha =", round(optimalval[0],2), "gamma =", round(optimalval[1],2))
    X1, X2, Y = [np.array(alpha_gamma_accuracy)[:,i] for i in range(3)]
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(X1, X2, Y)
    plt.show()
    pass


generate_output_class()
if fnc_name == 'RDA':
    draw_plot()
   















Choose between 1: regression and 2: classification (e.g.)2 : 2
Enter the training data file path  (e.g.)dataveh.dat : veh.dat
Enter the test data file path  (e.g.)datavehtest.dat : vehtest.dat
Select the data coding format 1: [a b c] or 2: [a,b,c]  (e.g.)2 : 2
Choose one among LDA, QDA, and RDA. Default is LDA. (e.g.)LDA : RDA
output_py.txt is generated!
The method used here is RDA
The test error is minimized near alpha = 0.95 gamma = 1.0


<Figure size 640x480 with 1 Axes>

In [4]:
S_pooled[0,0]

63.010469915015356

In [6]:
S[0,0]

15.967171717171698