In [13]:
from sklearn.metrics.pairwise import pairwise_distances
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import *
import numpy as np
import math

# Import models
from implementation import LogisticRegression2Class, LogisticRegressionMultiClass

## 2-Class Logistic Regression

In [11]:
"""
Load the banknote data and carve out a train/test split
"""
# Read the header-less CSV straight into a NumPy array
data = pd.read_csv('data_banknote_authentication.txt', header=None).values

# Randomise the row order with a fixed seed so the run is repeatable
data = shuffle(data, random_state=0)

# The last column is used as the target; every column before it is a feature
l = data.shape[1]
features, labels = data[:, :(l-1)], data[:, -1]

# Hold out 20% of the rows as the final test set (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [12]:
"""
Perform KFold cross validation to validate and choose model
"""
# For every candidate polynomial degree, run 5-fold cross validation on the
# training split and record the F1 score on both the fold's training part
# and its held-out validation part.
kf = KFold(n_splits=5)
degrees = [1,2,3]
print("Performing 5-fold cross validation across 3 different models:")

# One entry per degree, in the same order as `degrees`
average_test_f1 = []
average_train_f1 = []

for degree in degrees:
    train_f1 = []
    test_f1 = []
    # Start KFold for current degree of polynomial
    for train_index, val_index in kf.split(X_train):
        # Split to train and validation for this fold
        X_train_kfold, X_val = X_train[train_index], X_train[val_index]
        y_train_kfold, y_val = y_train[train_index], y_train[val_index]

        # Create polynomial features
        if degree != 1:
            poly_reg = PolynomialFeatures(degree=degree)
            X_train_poly = poly_reg.fit_transform(X_train_kfold)
            # Only transform the validation fold: the transformer must be
            # fitted on the training fold alone, never on held-out data.
            X_test_poly = poly_reg.transform(X_val)
        else:
            # Degree 1 means the original features are used unchanged
            X_train_poly = X_train_kfold
            X_test_poly = X_val

        # Train 2-class Logistic Regression
        lr = LogisticRegression2Class()
        lr.fit(X_train_poly,y_train_kfold,epochs=30,learning_rate=1e-4)

        # Predict
        train_pred = lr.predict(X_train_poly)
        test_pred = lr.predict(X_test_poly)

        # Save the f1 of this fold; sklearn metrics take (y_true, y_pred)
        train_f1.append(f1_score(y_train_kfold, train_pred))
        test_f1.append(f1_score(y_val, test_pred))

    # Average the f1 across 5 fold for each mapping
    average_test_f1.append(np.average(test_f1))
    average_train_f1.append(np.average(train_f1))

# Report the averaged scores, one block per degree
for degree, avg_train_f1, avg_test_f1 in zip(degrees, average_train_f1, average_test_f1):
    if degree == 1:
        print("Logistic Regression with original features:")
    else:
        print(f"Logistic Regression with Degree {degree} polynomial mapping of features:")
    print(f"\tAverage training f1 score: {avg_train_f1}")
    print(f"\tAverage testing f1 score: {avg_test_f1}")

Performing 5-fold cross validation across 3 different models:
Logistic Regression with original features:
	Average training f1 score: 0.8590160986821302
	Average testing f1 score: 0.8277957926131062
Logistic Regression with Degree 2 polynomial mapping of features:
	Average training f1 score: 0.8944071472551494
	Average testing f1 score: 0.9115824172132593
Logistic Regression with Degree 3 polynomial mapping of features:
	Average training f1 score: 0.9577456723177298
	Average testing f1 score: 0.9613551343884282


  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))
  J_theta = -(np.sum((y*np.log(h)) + ((1-y)*np.log(1-h))))

## K-Class Logistic Regression

In [26]:
"""
Load Fashion-MNIST and prepare features and labels
"""
# Each CSV row holds one vectorized image
train_data = pd.read_csv('fashion-mnist_train.csv')
test_data = pd.read_csv('fashion-mnist_test.csv')

# Labels come from the `label` column; features are every column after the first
y_train, y_test = train_data.label.values, test_data.label.values
X_train, X_test = train_data.iloc[:,1:].values, test_data.iloc[:,1:].values

# Rescale pixel values by dividing by 255
X_train_normalized, X_test_normalized = X_train/255, X_test/255

# One-hot encode the labels: the encoder is fitted on the training labels
# only and then applied to both splits (OneHotEncoder wants a 2-D column)
train_label_column = y_train.reshape(-1,1)
test_label_column = y_test.reshape(-1,1)
ohe = OneHotEncoder()
ohe.fit(train_label_column)
y_train_ohe = ohe.transform(train_label_column)
y_test_ohe = ohe.transform(test_label_column)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [42]:
"""
KFold
"""
# Perform KFold cross validation to validate model
# Can use accuracy since the dataset is balanced
num_class = len(np.unique(y_train))
kf = KFold(n_splits=5)
degrees = [1]

# One entry per degree, in the same order as `degrees`
average_test_accuracies = []
average_train_accuracies = []

for degree in degrees:
    train_accuracy = []
    test_accuracy = []
    # Start KFold for current degree of polynomial
    for train_index, val_index in kf.split(X_train_normalized):
        # Split to train and validation for this fold
        X_train_kfold, X_val = X_train_normalized[train_index], X_train_normalized[val_index]
        # The model is fitted on dense one-hot targets, while scoring below
        # compares predictions against the plain integer labels.
        y_train_kfold_ohe, y_val = y_train_ohe[train_index].toarray(), y_train[val_index]
        y_train_kfold = y_train[train_index]
        # Create polynomial features
        if degree != 1:
            poly_reg = PolynomialFeatures(degree=degree)
            X_train_poly = poly_reg.fit_transform(X_train_kfold)
            # Only transform the validation fold: the transformer must be
            # fitted on the training fold alone, never on held-out data.
            X_test_poly = poly_reg.transform(X_val)
        else:
            # Degree 1 means the original features are used unchanged
            X_train_poly = X_train_kfold
            X_test_poly = X_val

        # Train multi-class Logistic Regression
        lr = LogisticRegressionMultiClass(num_class)
        lr.fit(X_train_poly,y_train_kfold_ohe,epochs=200,learning_rate=1e-4)

        # Predict
        train_pred = lr.predict(X_train_poly)
        test_pred = lr.predict(X_test_poly)

        # Save the accuracy of this fold; sklearn metrics take (y_true, y_pred)
        train_accuracy.append(accuracy_score(y_train_kfold, train_pred))
        test_accuracy.append(accuracy_score(y_val, test_pred))

    # Average the accuracy across 5 fold for each mapping
    average_test_accuracies.append(np.average(test_accuracy))
    average_train_accuracies.append(np.average(train_accuracy))

# Report the averaged accuracies (as percentages), one block per degree
for degree, average_train_accuracy, average_test_accuracy in zip(degrees, average_train_accuracies, average_test_accuracies):
    if degree == 1:
        print("Logistic Regression with original features:")
    else:
        print(f"Logistic Regression with Degree {degree} polynomial mapping of features:")
    print(f"\tAverage training accuracy score: {average_train_accuracy*100}")
    print(f"\tAverage testing accuracy score: {average_test_accuracy*100}")

  # Sum from numpy will sum across all axis
  # Sum from numpy will sum across all axis


Logistic Regression with original features:
	Average training accuracy score: 74.89291666666666
	Average testing accuracy score: 74.87333333333333


In [40]:
# Fit on the complete training set, then score once on the test set
num_class = np.unique(y_train).size
lr = LogisticRegressionMultiClass(num_class)
lr.fit(
    X_train_normalized,
    y_train_ohe.toarray(),  # dense one-hot targets
    epochs=200,
    learning_rate=1e-4,
)

final_test_pred = lr.predict(X_test_normalized)
final_test_accuracy = accuracy_score(y_test, final_test_pred)
print(f"Accuracy score on final test set: {final_test_accuracy*100}")

Accuracy score on final test set: 76.23
