In [None]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
from sklearn import svm

# Load Test and Train Data

In [None]:
# Load the test and training sets. The first line of each file is skipped
# (presumably a header row). Passing the path lets np.loadtxt open and close
# the file itself, so no file handle is left dangling.
data = np.loadtxt("trimmed_test.csv", delimiter=",", skiprows=1)
data2 = np.loadtxt("trimmed_train.csv", delimiter=",", skiprows=1)

# Column 0 is the label. Features must start at column 1: slicing from column 0
# would hand the label to every model as an input feature (label leakage).
# NOTE(review): assumes every remaining column is a feature -- confirm against the CSV layout.
test_Y = data[:, 0]
test_X = data[:, 1:]

train_Y = data2[:, 0]
train_X = data2[:, 1:]

# Summarise what was loaded instead of dumping the whole test array.
print("test:", data.shape, "train:", data2.shape)

# Logistic Regression

In [None]:
def log_regression(X_train, y_train, X_test,y_test,C=1e5, multiclass="multinomial",solver="lbfgs",penalty="l2"):
    """Fit a logistic regression classifier and score it on both data sets.

    Args:
        X_train, y_train: samples and labels used to fit the model.
        X_test, y_test: samples and labels used for the held-out score.
        C: value forwarded to ``LogisticRegression(C=...)``.
        multiclass: value forwarded as ``multi_class``.
        solver: value forwarded as ``solver``.
        penalty: value forwarded as ``penalty``.

    Returns:
        ``[test_error, train_error]`` where each entry is the mean squared
        error between the true and the predicted labels.
    """
    classifier = LogisticRegression(
        penalty=penalty,
        C=C,
        solver=solver,
        multi_class=multiclass,
    )
    classifier.fit(X_train, y_train)

    test_error = metrics.mean_squared_error(y_test, classifier.predict(X_test))
    train_error = metrics.mean_squared_error(y_train, classifier.predict(X_train))
    # Alternative score kept from the original notebook: 1 - accuracy_score(...)
    # would report the misclassification rate instead of the MSE.
    return [test_error, train_error]

# Support Vector Machines

In [None]:
def svm_model(X_train, y_train, X_test, y_test, C=1e5, kernel='linear',gamma='scale'):
    """Fit a support vector classifier and score it on both data sets.

    Args:
        X_train, y_train: samples and labels used to fit the model.
        X_test, y_test: samples and labels used for the held-out score.
        C, kernel, gamma: forwarded unchanged to ``svm.SVC``.

    Returns:
        ``[test_error, train_error]`` where each entry is the mean squared
        error between the true and the predicted labels.
    """
    classifier = svm.SVC(gamma=gamma, kernel=kernel, C=C)
    classifier.fit(X_train, y_train)

    test_error = metrics.mean_squared_error(y_test, classifier.predict(X_test))
    train_error = metrics.mean_squared_error(y_train, classifier.predict(X_train))
    return [test_error, train_error]

# K Fold Cross Validation Helper Methods

In [None]:
def k_fold_split(X_train, Y_train, k):
    """Split the samples into ``k`` contiguous folds without dropping any.

    Fold sizes differ by at most one: when ``len(X_train)`` is not divisible
    by ``k``, the first ``len(X_train) % k`` folds receive one extra sample.
    When the length is divisible by ``k`` all folds have ``len(X_train) / k``
    samples each.

    Args:
        X_train: sliceable sequence of samples (list or array).
        Y_train: sliceable sequence of labels, same length as ``X_train``.
        k: number of folds, a positive integer.

    Returns:
        A list of ``k`` dicts ``{"X": samples, "Y": labels}``, one per fold,
        in the original sample order.

    Raises:
        ValueError: if ``k`` is smaller than 1 or the inputs differ in length.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    n_samples = len(X_train)
    if len(Y_train) != n_samples:
        raise ValueError("X_train and Y_train must have the same length")

    base_size, remainder = divmod(n_samples, k)

    folds = []
    start = 0
    for fold_index in range(k):
        # The leading `remainder` folds absorb the samples that plain integer
        # division would otherwise leave out.
        stop = start + base_size + (1 if fold_index < remainder else 0)
        folds.append({"X": X_train[start:stop], "Y": Y_train[start:stop]})
        start = stop

    return folds

In [None]:
def cal_average(list):
    """Print the given scores and return their arithmetic mean.

    The parameter name shadows the ``list`` builtin inside this function; it
    is kept because it is part of the call signature.

    Raises:
        ZeroDivisionError: if the sequence is empty.
    """
    print(list)
    return sum(list) / len(list)

In [None]:
def cross_val_log_regression(train_x, train_y, test_x, test_y, C=1e5, k=2):
    """Estimate the logistic regression error with k-fold cross validation.

    The training data is split with ``k_fold_split``. Each fold is held out
    once while the model is fitted on the remaining folds; the held-out errors
    are averaged with ``cal_average``.

    Args:
        train_x, train_y: samples and labels to cross-validate on.
        test_x, test_y: ignored; accepted only so existing calls keep working.
        C: value forwarded to ``log_regression``.
        k: number of folds (at least 2).

    Returns:
        The mean of the per-fold held-out errors (first element of the
        ``log_regression`` result for each fold).

    Raises:
        ValueError: if ``k`` is smaller than 2, since no training data would
            remain once the single fold is held out.
    """
    if k < 2:
        raise ValueError("k-fold cross validation needs k >= 2")

    # Split the data that was passed in, not the notebook-level globals.
    folds = k_fold_split(train_x, train_y, k)

    error_scores = []
    for held_out_index, held_out in enumerate(folds):
        # Rebuild the training set for every fold. Reusing one growing list
        # across folds would leak the held-out samples into later fits.
        training_folds = [
            fold for index, fold in enumerate(folds) if index != held_out_index
        ]
        fold_train_x = np.concatenate([fold["X"] for fold in training_folds])
        fold_train_y = np.concatenate([fold["Y"] for fold in training_folds])

        fold_result = log_regression(
            fold_train_x, fold_train_y, held_out["X"], held_out["Y"], C=C
        )
        error_scores.append(fold_result[0])

    return cal_average(error_scores)

In [None]:
def cross_val_svm(train_X, train_Y, C=1e5, kernel='linear', k=2, gamma='scale'):
    """Estimate the SVM error with k-fold cross validation.

    The training data is split with ``k_fold_split``. Each fold is held out
    once while the model is fitted on the remaining folds; the held-out errors
    are averaged with ``cal_average``.

    Args:
        train_X, train_Y: samples and labels to cross-validate on.
        C, kernel, gamma: forwarded unchanged to ``svm_model``.
        k: number of folds (at least 2).

    Returns:
        The mean of the per-fold held-out errors (first element of the
        ``svm_model`` result for each fold).

    Raises:
        ValueError: if ``k`` is smaller than 2, since no training data would
            remain once the single fold is held out.
    """
    if k < 2:
        raise ValueError("k-fold cross validation needs k >= 2")

    folds = k_fold_split(train_X, train_Y, k)

    error_scores = []
    for held_out_index, held_out in enumerate(folds):
        # Rebuild the training set for every fold. Reusing one growing list
        # across folds would leak the held-out samples into later fits.
        training_folds = [
            fold for index, fold in enumerate(folds) if index != held_out_index
        ]
        fold_train_x = np.concatenate([fold["X"] for fold in training_folds])
        fold_train_y = np.concatenate([fold["Y"] for fold in training_folds])

        fold_result = svm_model(
            fold_train_x, fold_train_y, held_out["X"], held_out["Y"],
            kernel=kernel, C=C, gamma=gamma,
        )
        error_scores.append(fold_result[0])

    return cal_average(error_scores)

# First: logistic regression train and test error for different values of C


In [None]:
# Logistic regression on the full train/test split for C = 0.1 * 0.15**x,
# x = 1..9. Training error is drawn in blue, test error in red.
test_size_x = []
test_size_score = []
train_size_score = []

for exponent in range(1, 10):
    c = (0.1)*(0.15**exponent)
    test_error, train_error = log_regression(train_X, train_Y, test_X, test_Y, C=c)
    print([test_error, train_error])
    test_size_x.append(exponent)
    # Scores are scaled by 100 to match the percentage axis label.
    test_size_score.append(test_error*100)
    train_size_score.append(train_error*100)

plt.clf()
plt.plot(test_size_x, train_size_score, 'b')
plt.plot(test_size_x, test_size_score, 'r')
plt.xlabel('C = (0.1*(0.15^x))')
plt.ylabel('Error (percentage)')
plt.show()
    

# Second: linear SVM train and test error for different values of C


In [None]:
# Linear-kernel SVM on the full train/test split for C = 0.1 * 0.15**x,
# x = 1..9. Training error is drawn in blue, test error in red.
test_size_x = []
test_size_score = []
train_size_score = []

for exponent in range(1, 10):
    C = (0.1)*(0.15**exponent)
    test_error, train_error = svm_model(
        train_X, train_Y, test_X, test_Y, kernel='linear', C=C
    )
    test_size_x.append(exponent)
    # Scores are scaled by 100 to match the percentage axis label.
    test_size_score.append(test_error*100)
    train_size_score.append(train_error*100)

plt.clf()
plt.plot(test_size_x, train_size_score, 'b')
plt.plot(test_size_x, test_size_score, 'r')
plt.xlabel('C = (0.1*(0.15^x))')
plt.ylabel('Error (percentage)')
plt.show()
    

# Third: 5-fold cross-validation error for different values of C (logistic regression, then linear SVM)


In [None]:
# 5-fold cross-validation error of logistic regression for C = 0.1 * 0.1**x,
# x = 1..9.
cv_exponents = []
cv_error_score = []

for exponent in range(1, 10):
    C = (0.1)*(0.1**exponent)
    cv_error = cross_val_log_regression(train_X, train_Y, test_X, test_Y, C=C, k=5)
    cv_exponents.append(exponent)
    # Scaled by 100 to match the percentage axis label.
    cv_error_score.append(cv_error*100)

plt.clf()
plt.plot(cv_exponents, cv_error_score, 'b')
plt.ylabel('Error (percentage)')
# The label now states the base actually used in the loop (0.1, not 0.15).
plt.xlabel('x in C = (0.1*(0.1^x))')
plt.show()


In [None]:
# 5-fold cross-validation error of the linear SVM for C = 0.1 * 0.1**x,
# x = 1..9.
test_size_x = []
train_size_score = []

for i in range(1,10):
    C=(0.1)*(0.1**i)
    # cross_val_svm takes only the training data positionally. Passing the
    # test arrays as well would bind them to `C` and `kernel` and collide with
    # the C= keyword, raising TypeError.
    svmResult = cross_val_svm(train_X, train_Y, C=C, k=5)
    test_size_x.append(i)
    # Scaled by 100 to match the percentage axis label.
    train_size_score.append(svmResult*100)

plt.clf()
plt.plot(test_size_x, train_size_score, 'b')
plt.ylabel('Error (percentage)')
# The x axis holds the exponent, not C itself.
plt.xlabel('x in C = (0.1*(0.1^x))')
plt.show()

In [None]:
# Evaluate both models on the test set at one explicitly chosen C.
# The logistic regression call used to read a loop variable `i` left behind by
# an earlier cell, so its result depended on which cell had run last.
# NOTE(review): the exponent 3 mirrors the SVM call -- confirm it is also the
# intended choice for logistic regression.
selected_exponent = 3
selected_C = (0.1)*(0.1**selected_exponent)

print(svm_model(train_X, train_Y, test_X, test_Y, C=selected_C))
print(log_regression(train_X, train_Y, test_X, test_Y, C=selected_C))


# Fourth: RBF-kernel SVM, cross-validation error over C for several gamma values


In [None]:
# Candidate gamma values for the RBF kernel, kept in ascending order.
gamma_vals = sorted([0.00000005, 0.00000001, 0.0000001, 0.00001, 0.00005])

# One figure per gamma: 5-fold cross-validation error of the RBF SVM for
# C = 1.1 * 2.5**j, j = 1..9.
for gamma in gamma_vals:
    title = "Gamma: " + str(gamma)
    test_size_x = list(range(1, 10))
    test_size_score = []
    train_size_score = [
        cross_val_svm(
            train_X, train_Y, C=1.1*(2.5**j), k=5, gamma=gamma, kernel='rbf'
        ) * 100
        for j in test_size_x
    ]

    plt.clf()
    plt.plot(test_size_x, train_size_score, 'b')

    plt.suptitle(title)
    plt.xlabel('X')
    plt.ylabel('Error (percentage)')
    plt.show()

In [None]:
# Train/test error of the RBF SVM for each gamma, using the C exponent picked
# for that gamma.
test_size_x = []
test_size_score = []
train_size_score = []
C_vals = [5, 5, 8, 5, 3]

# Each gamma needs exactly one chosen exponent.
assert len(C_vals) == len(gamma_vals), "C_vals must have one entry per gamma"

for i, (gamma, c_exponent) in enumerate(zip(gamma_vals, C_vals)):
    print('GAMMA')
    print(gamma)
    # NOTE(review): the cross-validation sweep over gamma uses 1.1*(2.5**j),
    # this cell uses 0.1 -- confirm which coefficient is intended.
    C=0.1*(2.5**c_exponent)
    # Pass the selected C on to the model. Without it every fit silently falls
    # back to svm_model's default C=1e5 and C_vals has no effect.
    svmResult = svm_model(train_X, train_Y, test_X, test_Y, C=C, gamma=gamma, kernel='rbf')
    test_size_x.append(i)
    test_size_score.append((svmResult[0]*100))
    train_size_score.append((svmResult[1]*100))
    print(test_size_x)
    print(test_size_score)
    print(train_size_score)

plt.clf()
plt.plot(test_size_x, train_size_score, 'bo', label='train')
plt.plot(test_size_x, test_size_score, 'ro', label='test')
plt.legend()

plt.suptitle("test and training error of gamma")
plt.ylabel('Error (percentage)')
plt.xlabel('Gamma array index')
plt.show()

