In [2]:
# Import necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils import shuffle
from sklearn.model_selection import KFold

np.set_printoptions(suppress = True)

import warnings
warnings.simplefilter(action='ignore')


# Importing data sets
from data_import import load_fruits, load_chess, load_music, load_lepiota

# Load the four datasets; each loader's result is unpacked into a
# (features, labels) pair that the rest of the notebook reads as *_X / *_Y.
# NOTE(review): the shape lines shown in this cell's output are presumably
# printed by the loaders themselves (there is no print call in this cell) —
# confirm in data_import.
fruit_X, fruit_Y = load_fruits()
chess_X, chess_Y = load_chess()
music_X, music_Y = load_music()
lep_X, lep_Y = load_lepiota()

Fruit feature space: (10000, 4)
Fruit label space (10000,)
Chess feature space: (20058, 373)
Chess label space (20058,)
Music feature space: (200, 28)
Music label space (200,)
Lepiota feature space: (8124, 46)
Lepiota label space: (8124,)


## Logistic Regression Classifier

<hr style="color:Maroon;background-color:Maroon;border:0 none; height: 3px;">

In [None]:
# Helper function to truncate sigmoid inputs
def truncate(val):
    """Clamp ``val`` (scalar or array, elementwise) to the interval [-50, 50]."""
    lower, upper = -50, 50
    capped_above = np.minimum(val, upper)
    return np.maximum(lower, capped_above)

# Sigmoid function
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s); the function is applied elementwise.

    Returns
    -------
    Scalar for scalar input, otherwise an array of the same shape as ``z``,
    with values in [0, 1].

    Notes
    -----
    The direct formula evaluates ``exp(-z)``, which overflows to ``inf`` for
    large negative ``z``. Only ``exp(-|z|)`` is evaluated here, which always
    lies in (0, 1]:

    * z >= 0:  1 / (1 + exp(-z))
    * z <  0:  exp(z) / (1 + exp(z))

    Both branches are algebraically the same function, and ``sigmoid(0)`` is
    still exactly 0.5.
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple turns a 0-d result back into a scalar and
    # leaves arrays of one or more dimensions as arrays.
    return out[()]

# Incurs error for incorrect predictions
def judge(a, b):
    """0/1 loss: return 1 when ``a`` and ``b`` differ, 0 when they are equal."""
    return 1 if a != b else 0
    
# Generates prediction based on probability outcome from sigmoid
def logistic(x, W, b, thresh):
    """Classify one sample as +1 or -1.

    The sigmoid of ``W.T @ x + b`` is compared against ``thresh``; values at or
    above the threshold map to 1, everything else to -1.
    """
    probability = sigmoid(W.T @ x + b)
    return 1 if probability >= thresh else -1

# Calculate error given feature vectors X and labels Y
def calc_error(X, Y, W, b, thresh = 0.5):
    """Evaluate the classifier (W, b) on samples ``X`` with labels ``Y``.

    Parameters
    ----------
    X : iterable of feature vectors (iterated row by row).
    Y : iterable of labels, paired with ``X`` via ``zip``.
    W, b : weight vector and bias passed through to ``logistic``.
    thresh : probability threshold passed through to ``logistic``.

    Returns
    -------
    (e, acc) : mean 0/1 error and the fraction of correct predictions.

    Raises
    ------
    ZeroDivisionError
        If there are no (x, y) pairs to evaluate.
    """
    # Build the per-sample losses in one pass. Growing an array with np.append
    # copies the whole array on every iteration, which is quadratic in n.
    errors = np.array(
        [judge(logistic(xi, W, b, thresh), yi) for xi, yi in zip(X, Y)],
        dtype = float,
    )
    acc = np.count_nonzero(errors == 0) / errors.shape[0]
    e = errors.mean()

    return e, acc

In [None]:
# Logistic loss
def L_W_b(X, Y, W, b):
    """Per-sample logistic loss -log(sigmoid(y * (x @ W + b))).

    The margin is passed through ``truncate`` before the sigmoid so the
    logarithm never receives a value that has underflowed to zero.
    Returns one loss value per row of ``X``; callers sum it themselves.
    """
    ones = np.ones(X.shape[0])
    margins = Y * (X @ W + (b * ones))
    log_likelihood = np.log(sigmoid(truncate(margins)))
    return -1 * ones * log_likelihood

# Gradient for logistic loss
def grad_L_W_b(X, Y, W, b):
    """Gradient of the summed logistic loss with respect to W and b.

    Parameters
    ----------
    X : 2-D feature array (its first axis is the sample axis).
    Y : label array multiplied elementwise against the scores ``X @ W + b``.
    W, b : current weight vector and bias.

    Returns
    -------
    (grad_W, grad_b) : gradient with respect to the weights and to the bias.
    """
    ones = np.ones(X.shape[0])

    # (1 - sigmoid(margin)) * y is shared by both gradients; compute it once
    # instead of repeating the matrix product and sigmoid for each of them.
    residual = (1 - sigmoid(Y * (X @ W + (b * ones)))) * Y

    grad_W = -X.T @ residual
    grad_b = -1 * ones.T @ residual

    return grad_W, grad_b

### Model Fitting

In [None]:
# Logistic regression learning algorithm; gradient descent
def fit_log_reg(X, Y, iterations = 5000, lam = 0.001):
    """Fit logistic regression by fixed-step, full-batch gradient descent.

    Parameters
    ----------
    X : 2-D feature array; ``X.shape[1]`` sets the length of the weight vector.
    Y : labels passed through to the loss and gradient helpers.
    iterations : number of gradient steps to take.
    lam : learning rate / step size, held fixed for every step. The default
        matches the value that used to be hard-coded in the body.

    Returns
    -------
    (W, b, losses) : final weights, final bias, and a list holding the summed
    training loss recorded after each update.
    """
    losses = []           # Training-loss history, one entry per iteration
    W = np.zeros(X.shape[1])
    b = 0.0

    for _ in range(iterations):
        grad_W, grad_b = grad_L_W_b(X, Y, W, b)

        W = W - lam * grad_W
        b = b - lam * grad_b

        # Tracking training losses (measured after the update)
        losses.append(L_W_b(X, Y, W, b).sum())

    return W, b, losses

###  Training Function

In [None]:
# Prints the average training accuracy over repeated random splits

def train_log_reg(X, y, size, n_trials = 3):
    """Print the mean *training* accuracy over ``n_trials`` random splits.

    Parameters
    ----------
    X, y : features and labels to split.
    size : value passed as ``test_size`` to ``train_test_split`` (e.g. 0.8
        keeps 20% of the data for training).
    n_trials : number of independent shuffle/split/fit repetitions to average.

    Each trial fits a model on the training part and scores it on that same
    training part with the default threshold of ``calc_error``; the held-out
    part is not used here. Nothing is returned; the average is printed.
    """
    total_acc = 0

    for trial in range(n_trials):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = size, shuffle = True)
        W, b, losses = fit_log_reg(X_train, y_train)
        e, acc = calc_error(X_train, y_train, W, b)
        total_acc += acc

    print(total_acc / n_trials)

### Cross-Validation and Testing Functions

In [None]:
# Combines labels (Y) as another column to feature set (X)
def join_data(X, Y):
    """Return ``X`` with ``Y`` appended as one extra, final column."""
    label_column = Y.reshape((Y.shape[0], 1))
    return np.concatenate((X, label_column), axis = 1)

def tune_threshold(X, y):
    """Pick the decision threshold with the best cross-validated accuracy.

    Candidate thresholds run from 0 to 1 in steps of 0.05. For each candidate,
    a shuffled 5-fold cross-validation is repeated three times; in every fold a
    model is fit on the training indices and scored on the held-out indices at
    that threshold. The candidate with the highest mean accuracy wins, and the
    earliest candidate is kept on ties.

    Parameters
    ----------
    X, y : features and labels to tune on.

    Returns
    -------
    The best threshold, rounded to two decimals.
    """
    # Joining data so one index array selects features and labels together
    data = join_data(X, y)
    features, labels = data[:, :-1], data[:, -1]
    kf = KFold(n_splits = 5, shuffle = True)
    thresh_list = np.arange(0, 1.05, 0.05)

    # The running best must exist before the first comparison below; starting
    # at -inf guarantees the first candidate is accepted.
    best_thresh = thresh_list[0]
    best_acc = -np.inf

    # Assessing possible thresholds at 0.05 increments
    for thresh in thresh_list:
        print('Testing threshold: {}'.format(thresh))
        trial_accs = []
        for trial in np.arange(3):
            fold_accs = []
            for train_index, test_index in kf.split(data):
                W, b, losses = fit_log_reg(features[train_index], labels[train_index])
                e, acc = calc_error(features[test_index], labels[test_index], W, b, thresh)
                fold_accs.append(acc)
            trial_accs.append(np.mean(fold_accs))

        mean_acc = np.mean(trial_accs)
        if mean_acc > best_acc:
            best_thresh = thresh
            best_acc = mean_acc

    print('Best threshold: {}'.format(best_thresh))
    return round(best_thresh, 2)

<hr style="color:Maroon;background-color:Maroon;border:0 none; height: 3px;">

### Generating Training Performances

In [None]:
# Print the averaged training accuracy for every dataset at test fractions
# 0.8, 0.5 and 0.2, dataset by dataset (fruits, chess, music, lepiota).
_perf_datasets = (
    (fruit_X, fruit_Y),
    (chess_X, chess_Y),
    (music_X, music_Y),
    (lep_X, lep_Y),
)

for _perf_X, _perf_Y in _perf_datasets:
    for _perf_size in (0.8, 0.5, 0.2):
        train_log_reg(_perf_X, _perf_Y, _perf_size)

## 20/80 Splits for Each Dataset

In [None]:
# For each dataset: hold out 80% as the test set (test_size = 0.8), tune the
# decision threshold on the 20% training part only, refit on that whole
# training part, and score the refit model on the held-out 80%.
#
# NOTE: opt_thresh, W, b, losses and e are overwritten by every dataset section,
# so only the *_acc values remain after the cell runs. The names f_acc, c_acc,
# m_acc and l_acc are assigned again by the 50/50 and 80/20 cells, and none of
# them is printed or displayed in this cell.

# Fruits
fruit_X_train20, fruit_X_test80, fruit_Y_train20, fruit_Y_test80 = train_test_split(fruit_X, fruit_Y, test_size = 0.8, shuffle = True)

opt_thresh = tune_threshold(fruit_X_train20, fruit_Y_train20)
W, b, losses = fit_log_reg(fruit_X_train20, fruit_Y_train20)
e, f_acc = calc_error(fruit_X_test80, fruit_Y_test80, W, b, opt_thresh)

# Chess
chess_X_train20, chess_X_test80, chess_Y_train20, chess_Y_test80 = train_test_split(chess_X, chess_Y, test_size = 0.8, shuffle = True)

opt_thresh = tune_threshold(chess_X_train20, chess_Y_train20)
W, b, losses = fit_log_reg(chess_X_train20, chess_Y_train20)
e, c_acc = calc_error(chess_X_test80, chess_Y_test80, W, b, opt_thresh)

# Music
music_X_train20, music_X_test80, music_Y_train20, music_Y_test80 = train_test_split(music_X, music_Y, test_size = 0.8, shuffle = True)

opt_thresh = tune_threshold(music_X_train20, music_Y_train20)
W, b, losses = fit_log_reg(music_X_train20, music_Y_train20)
e, m_acc = calc_error(music_X_test80, music_Y_test80, W, b, opt_thresh)

# Lepiota
lep_X_train20, lep_X_test80, lep_Y_train20, lep_Y_test80 = train_test_split(lep_X, lep_Y, test_size = 0.8, shuffle = True)

opt_thresh = tune_threshold(lep_X_train20, lep_Y_train20)
W, b, losses = fit_log_reg(lep_X_train20, lep_Y_train20)
e, l_acc = calc_error(lep_X_test80, lep_Y_test80, W, b, opt_thresh)

## 50/50 Splits for Each Dataset

In [None]:
# For each dataset: split 50/50 (test_size = 0.5), tune the decision threshold
# on the training half only, refit on that whole training half, and score the
# refit model on the held-out half.
#
# NOTE: opt_thresh, W, b, losses and e are overwritten by every dataset section.
# f_acc, c_acc, m_acc and l_acc replace the values assigned by the 20/80 cell
# and are assigned again by the 80/20 cell; none of them is printed or
# displayed in this cell.

# Fruits
fruit_X_train50, fruit_X_test50, fruit_Y_train50, fruit_Y_test50 = train_test_split(fruit_X, fruit_Y, test_size = 0.5, shuffle = True)

opt_thresh = tune_threshold(fruit_X_train50, fruit_Y_train50)
W, b, losses = fit_log_reg(fruit_X_train50, fruit_Y_train50)
e, f_acc = calc_error(fruit_X_test50, fruit_Y_test50, W, b, opt_thresh)

# Chess
chess_X_train50, chess_X_test50, chess_Y_train50, chess_Y_test50 = train_test_split(chess_X, chess_Y, test_size = 0.5, shuffle = True)

opt_thresh = tune_threshold(chess_X_train50, chess_Y_train50)
W, b, losses = fit_log_reg(chess_X_train50, chess_Y_train50)
e, c_acc = calc_error(chess_X_test50, chess_Y_test50, W, b, opt_thresh)

# Music
music_X_train50, music_X_test50, music_Y_train50, music_Y_test50 = train_test_split(music_X, music_Y, test_size = 0.5, shuffle = True)

opt_thresh = tune_threshold(music_X_train50, music_Y_train50)
W, b, losses = fit_log_reg(music_X_train50, music_Y_train50)
e, m_acc = calc_error(music_X_test50, music_Y_test50, W, b, opt_thresh)

# Lepiota
lep_X_train50, lep_X_test50, lep_Y_train50, lep_Y_test50 = train_test_split(lep_X, lep_Y, test_size = 0.5, shuffle = True)

opt_thresh = tune_threshold(lep_X_train50, lep_Y_train50)
W, b, losses = fit_log_reg(lep_X_train50, lep_Y_train50)
e, l_acc = calc_error(lep_X_test50, lep_Y_test50, W, b, opt_thresh)

## 80/20 Splits for Each Dataset

In [None]:
# For each dataset: hold out 20% as the test set (test_size = 0.2), tune the
# decision threshold on the 80% training part only, refit on that whole
# training part, and score the refit model on the held-out 20%.
#
# NOTE: opt_thresh, W, b, losses and e are overwritten by every dataset section.
# f_acc, c_acc, m_acc and l_acc replace the values assigned by the 20/80 and
# 50/50 cells; none of them is printed or displayed in this cell.

# Fruits
fruit_X_train80, fruit_X_test20, fruit_Y_train80, fruit_Y_test20 = train_test_split(fruit_X, fruit_Y, test_size = 0.2, shuffle = True)

opt_thresh = tune_threshold(fruit_X_train80, fruit_Y_train80)
W, b, losses = fit_log_reg(fruit_X_train80, fruit_Y_train80)
e, f_acc = calc_error(fruit_X_test20, fruit_Y_test20, W, b, opt_thresh)

# Chess
chess_X_train80, chess_X_test20, chess_Y_train80, chess_Y_test20 = train_test_split(chess_X, chess_Y, test_size = 0.2, shuffle = True)

opt_thresh = tune_threshold(chess_X_train80, chess_Y_train80)
W, b, losses = fit_log_reg(chess_X_train80, chess_Y_train80)
e, c_acc = calc_error(chess_X_test20, chess_Y_test20, W, b, opt_thresh)

# Music
music_X_train80, music_X_test20, music_Y_train80, music_Y_test20 = train_test_split(music_X, music_Y, test_size = 0.2, shuffle = True)

opt_thresh = tune_threshold(music_X_train80, music_Y_train80)
W, b, losses = fit_log_reg(music_X_train80, music_Y_train80)
e, m_acc = calc_error(music_X_test20, music_Y_test20, W, b, opt_thresh)

# Lepiota
lep_X_train80, lep_X_test20, lep_Y_train80, lep_Y_test20 = train_test_split(lep_X, lep_Y, test_size = 0.2, shuffle = True)

opt_thresh = tune_threshold(lep_X_train80, lep_Y_train80)
W, b, losses = fit_log_reg(lep_X_train80, lep_Y_train80)
e, l_acc = calc_error(lep_X_test20, lep_Y_test20, W, b, opt_thresh)