In [178]:
import numpy as np
import csv
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal

In [189]:
def retrieve_train_data(ith_fold):
    """Read the feature and label CSV files of one fold.

    The files are 'knn/data<ith_fold>.csv' and 'knn/labels<ith_fold>.csv',
    resolved relative to the current working directory.

    Returns:
        (data, label): two lists of rows; every row is a list of the raw
        string fields produced by ``csv.reader`` (no numeric conversion).
    """
    fold_tag = str(ith_fold)
    data_path = 'knn/data' + fold_tag + '.csv'
    label_path = 'knn/labels' + fold_tag + '.csv'
    # The data file is opened first, so a missing data file is reported
    # before the label file is touched.
    with open(data_path, 'r') as data_file:
        data = [row for row in csv.reader(data_file)]
    with open(label_path, 'r') as label_file:
        label = [row for row in csv.reader(label_file)]
    return data, label

class Data:
    """All folds concatenated in memory, with helpers for k-fold splitting.

    Attributes:
        grand_data:  2-D int8 array holding the rows of every fold, in fold order.
        grand_label: 1-D int8 array with the matching labels.
        batch_d:     rows per fold, computed as total rows // num_fold.
    """
    def __init__(self,num_fold):
        """Load folds 1..num_fold via retrieve_train_data and stack them."""
        rows,tags=[],[]
        for fold_id in range(1,num_fold+1):
            fold_rows,fold_tags=retrieve_train_data(fold_id)
            rows.extend(fold_rows)
            tags.extend(fold_tags)
        # Values are stored as int8, so every CSV field must parse to an
        # integer that fits that type.
        self.grand_data=np.array(rows,dtype=np.int8)
        self.grand_label=np.array(tags,dtype=np.int8).ravel()
        self.batch_d=self.grand_data.shape[0]//num_fold
    def _held_out_rows(self,i):
        """Row indices of the i-th (0-based) block of batch_d rows."""
        return i*self.batch_d+np.arange(self.batch_d)
    def get_data_test(self,i):
        """Feature rows of the i-th block (returned as a copy)."""
        return self.grand_data[self._held_out_rows(i)]
    def get_label_test(self,i):
        """Labels of the i-th block (returned as a copy)."""
        return self.grand_label[self._held_out_rows(i)]
    def get_data_train(self,i):
        """Feature rows of every block except the i-th."""
        return np.delete(self.grand_data,self._held_out_rows(i),axis=0)
    def get_label_train(self,i):
        """Labels of every block except the i-th."""
        return np.delete(self.grand_label,self._held_out_rows(i),axis=0)
class MoG:
    """Two-class Gaussian classifier for labels 5 and 6 with a shared covariance.

    Every sample whose label is not 5 is treated as class 6.

    Attributes set by ``train``:
        pi5, pi6:   class priors (fraction of samples per class).
        mu_5, mu_6: per-class mean vectors.
        Sigma:      pooled covariance, (S5 + S6) / n, where S_k is the
                    scatter matrix of class k around its own mean.
    """
    def train(self,data,label):
        """Estimate priors, class means and the pooled covariance.

        Args:
            data:  2-D array, one sample per row.
            label: 1-D array of labels aligned with the rows of ``data``.
        """
        data=np.asarray(data,dtype=float)
        is5=(np.asarray(label) == 5)
        n=data.shape[0]
        self.pi5=is5.sum() / n
        self.pi6=1-self.pi5
        # Boolean masks select each class in one step; growing arrays row by
        # row with np.append copies the whole array on every iteration.
        data_5=data[is5]
        data_6=data[~is5]
        self.mu_5=np.mean(data_5,axis=0)
        self.mu_6=np.mean(data_6,axis=0)
        data_5c=data_5-self.mu_5
        data_6c=data_6-self.mu_6
        S5=np.matmul(data_5c.transpose(),data_5c)
        S6=np.matmul(data_6c.transpose(),data_6c)
        self.Sigma=(S5+S6) / n
    def _log_joint(self,x):
        """Return (log pi5 + log N(x|mu_5,Sigma), log pi6 + log N(x|mu_6,Sigma)).

        ``x`` may be a single sample or a 2-D array with one sample per row.
        Working with logs keeps the comparison meaningful when the densities
        themselves would underflow to 0.0.
        """
        # A zero prior gives log(0) = -inf, which is the intended score;
        # silence the divide-by-zero warning numpy would emit for it.
        with np.errstate(divide='ignore'):
            log_pi5=np.log(self.pi5)
            log_pi6=np.log(self.pi6)
        s5=log_pi5+multivariate_normal.logpdf(x,mean=self.mu_5,cov=self.Sigma)
        s6=log_pi6+multivariate_normal.logpdf(x,mean=self.mu_6,cov=self.Sigma)
        return s5,s6
    def predict(self,x):
        """Classify one sample: 5 if its class-5 joint score is strictly larger, else 6."""
        s5,s6=self._log_joint(x)
        if (s5 > s6):
            return 5
        else:
            return 6
    def test(self,data,label):
        """Return the fraction of rows in ``data`` whose prediction equals ``label``."""
        # Score all rows in one call so the covariance is factorised once
        # instead of once per sample.
        s5,s6=self._log_joint(data)
        predicted=np.where(np.atleast_1d(s5 > s6),5,6)
        return np.sum(np.equal(predicted,label))/data.shape[0]
        
def Sigmoid(w,x):
    """Logistic function applied to the product ``w.dot(x)``."""
    activation = w.dot(x)
    return 1 / (1 + np.exp(-activation))

class LR: #hard coded to the case where it is 5 (not 6)
    """Binary logistic regression (label 5 vs. everything else) fitted by Newton steps.

    Attributes set by ``train``:
        w: weight vector; w[0] multiplies the constant 1 that is prepended
           to every sample, w[1:] multiply the original features.
        R: n x n diagonal matrix of the p*(1-p) weights used in the last
           Newton step (all zeros when num_it == 0).
    """
    def train(self,data,label,num_it):
        """Run ``num_it`` Newton updates starting from w = 0.

        Args:
            data:   2-D array, one sample per row (without the bias column).
            label:  1-D array of labels; samples equal to 5 are the positive class.
            num_it: number of Newton iterations to perform.
        """
        data=np.insert(data,0,1,axis=1).astype(float)
        label=(label==5)
        self.w=np.zeros(data.shape[1])
        r=np.zeros(data.shape[0])
        for i in range(num_it):
            # Same formula as Sigmoid(), evaluated for all rows at once.
            p=1 / (1 + np.exp(-data.dot(self.w)))
            r=p*(1-p)
            # Gradient: sum_j (p_j - y_j) * x_j
            L_1=data.transpose().dot(p-label)
            # Hessian X^T R X; scaling the rows of X by r avoids building and
            # multiplying by a dense n x n diagonal matrix on every iteration.
            H=np.matmul(data.transpose(),data*r[:,None])
            self.w-=np.matmul(np.linalg.pinv(H),L_1)
        # Kept as a full diagonal matrix so code that reads ``R`` still sees
        # the same shape and contents as before.
        self.R=np.diag(r)
    def predict(self,x):
        """Return True when the sample is classified as 5.

        ``x`` must already contain the leading constant 1 (same length as ``w``).
        """
        return (Sigmoid(self.w,x) > 0.5)
    def test(self,data,label):
        """Return the fraction of rows in ``data`` classified correctly.

        ``data`` is passed without the bias column; it is added here.
        """
        data=np.insert(data,0,1,axis=1)
        label=(label==5)
        return np.sum(np.equal(np.apply_along_axis(self.predict,1,data),label))/data.shape[0]
        
def Q1():
    """Print the mean 10-fold cross-validation accuracy of the MoG classifier."""
    n_folds=10
    dataset=Data(n_folds)

    def fold_accuracy(fold):
        # A fresh model is trained on every block except `fold`, then scored on it.
        model=MoG()
        model.train(dataset.get_data_train(fold),dataset.get_label_train(fold))
        return model.test(dataset.get_data_test(fold),dataset.get_label_test(fold))

    fold_scores=[fold_accuracy(fold) for fold in range(n_folds)]
    print("Q1")
    print("Accuracy: {}".format(np.mean(fold_scores)))

def Q2():
    """Print the mean 10-fold accuracy of LR, then the weights fitted on all data."""
    n_folds=10
    newton_steps=10
    dataset=Data(n_folds)
    fold_scores=[]
    for fold in range(n_folds):
        model=LR()
        model.train(dataset.get_data_train(fold),
                    dataset.get_label_train(fold),
                    newton_steps)
        fold_scores.append(
            model.test(dataset.get_data_test(fold),dataset.get_label_test(fold)))
    print("Q2")
    print("Accuracy: {}".format(np.mean(fold_scores)))
    # The model object from the last fold is refitted on the full data set;
    # train() resets its weights before iterating.
    model.train(dataset.grand_data,dataset.grand_label,newton_steps)
    print("Vector w")
    print(model.w)
if __name__=="__main__":
    # Run both experiments, Q1 first, when executed as a script or notebook cell.
    for experiment in (Q1,Q2):
        experiment()

Q1
Accuracy: 0.8675675675675677
Q2
Accuracy: 0.863063063063063
Vector w
[ 1.34693295e-01 -2.95016036e-02  1.42379140e-02  3.48930440e-02
  1.52217181e-02  4.42212544e-02  1.82674123e-01  3.78298196e-02
  6.30007348e-02  9.39523892e-02 -4.72307408e-03  1.04525881e-01
  1.67596878e-02  2.59975589e-02  1.92852432e-01  3.45685864e-03
 -5.14217586e-02  5.28535920e-02  7.52766694e-02 -7.37422660e-03
 -1.98994419e-02  6.89265131e-03 -9.68883208e-02 -2.20082922e-02
  5.21423842e-02  6.08412098e-02  1.92349015e-02 -1.30316116e-02
  3.17330368e-02  8.94940252e-02 -3.81283062e-03 -1.01410064e-01
  2.70688864e-02  1.34839872e-01 -2.18704999e-02 -6.84437721e-02
 -3.24198044e-02 -1.67515348e-02 -7.02219612e-02 -4.06115529e-02
 -5.79807551e-02 -2.24190448e-02 -1.40471238e-02 -2.30941396e-01
 -9.26106434e-02  6.19529070e-03 -9.13217085e-05  1.40258620e-02
 -9.32931714e-02  1.19996489e-02 -1.50596723e-01 -8.91628888e-02
 -7.42472890e-02  6.32482144e-02 -3.47387718e-02 -8.47732301e-02
 -3.01988102e-04 -