In [15]:
import pandas as pd
from colors import colors
from matplotlib import pyplot as plt
import math
import numpy as np
from sklearn.metrics import log_loss

In [16]:
# Calculate probability matrix
def hypothesis(theta, X):
    return 1 / (1 + np.exp(-(np.dot(theta, X.T))))

In [17]:
# Cost function
def cost(X, y, theta):
    y1 = hypothesis(theta, X)
    return -(1/len(X)) * np.sum(y * np.log(y1) + (1 - y) * np.log(1 - y1))

In [18]:
# Drop all non-numeric columns but "Hogwarts House"
def drop_columns(df):
    df.drop(columns=['Index'], inplace=True)
    for column in df:
        if df[column].dtype.kind in 'biufc': # https://stackoverflow.com/a/38185438
            continue
        if column == 'Hogwarts House':
            continue
        else:
            df.drop(columns=[column], inplace=True)

In [19]:
log_loss_results = []
# Perform gradient descent to update every value in theta (13x4 matrix)
def gradient_descent(X, y, theta, L, epochs):
    n = len(X)
    theta = pd.DataFrame(theta)
    for i in range(0, epochs):
        output_arr = []
        # j represents each Hogwarts course
        for j in range(0, theta.shape[1]):
            # [:,j] takes the j'th column of the theta 2d array
            h = hypothesis(theta.iloc[:,j], X)
            # k stands for each Hogwarts house
            for k in range(0, theta.shape[0]):
                theta.iloc[k, j] -= (L/n) * np.sum((h - y.iloc[:, j]) * X.iloc[:, k])
            output_arr.append(h)
        output_arr = pd.DataFrame(output_arr)
        log_loss_results.append(log_loss(y, output_arr.T))
    
    return theta

In [20]:
# Read training dataset and preprocess it
try:
    df = pd.read_csv("datasets/dataset_train.csv")
except:
    print(f"{colors().RED}Error: could not read file{colors().END}")
    exit()
drop_columns(df)
df

Unnamed: 0,Hogwarts House,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,Ravenclaw,58384.0,-487.886086,5.727180,4.878861,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,0.715939,-232.79405,-26.89
1,Slytherin,67239.0,-552.060507,-5.987446,5.520605,-5.612,-487.340557,367.760303,4.107170,1058.944592,7.248742,0.091674,-252.18425,-113.45
2,Ravenclaw,23702.0,-366.076117,7.725017,3.660761,6.140,664.893521,602.585284,3.555579,1088.088348,8.728531,-0.515327,-227.34265,30.42
3,Gryffindor,32667.0,697.742809,-6.497214,-6.977428,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-0.014040,-256.84675,200.64
4,Gryffindor,60158.0,436.775204,-7.820623,,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-0.264070,-256.38730,157.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,Gryffindor,49009.0,354.280086,-4.541837,-3.542801,5.702,-497.235066,618.220213,-5.231721,964.219853,3.389086,-0.649983,-250.39401,185.83
1596,Slytherin,63296.0,367.531174,6.061064,-3.675312,1.757,-643.271092,445.827565,2.238112,1056.147366,5.825263,-0.333962,-246.42719,44.80
1597,Gryffindor,63905.0,544.018925,-3.203269,-5.440189,6.065,-385.150457,635.211486,-5.984257,953.866685,1.709808,0.071569,-251.63679,198.47
1598,Hufflepuff,82713.0,453.676219,3.442831,-4.536762,6.738,-831.741123,383.444937,3.813111,1087.949205,3.904100,-0.531875,-246.19072,-76.81


In [21]:
# Normalize data
for column in df:
    if df[column].dtype.kind not in 'biufc': # https://stackoverflow.com/a/38185438
        continue
    max_norm = df[column].max()
    min_norm = df[column].min()

    for i in range(len(df)):
        df.iloc[i, df.columns.get_loc(column)] = (df.iloc[i, df.columns.get_loc(column)] - min_norm) / (max_norm - min_norm)
        # Make missing data = mean
        if column != "Hogwarts House":
            if math.isnan(df.iloc[i, df.columns.get_loc(column)]) == True:
                df.iloc[i, df.columns.get_loc(column)] = float(df[column].mean()) # MAKE THIS MEAN???

In [22]:
# Split into input (df_no_house, later -> X) and output (y) variables
df_no_house = df.drop(columns=["Hogwarts House"])
y = df["Hogwarts House"]

In [23]:
df_no_house

Unnamed: 0,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,0.639887,0.241486,0.731351,0.758514,0.716936,0.623499,0.538679,0.679081,0.692354,0.465490,0.632571,0.793213,0.335649
1,0.708357,0.209123,0.196645,0.790877,0.166054,0.274983,0.181768,0.624914,0.791954,0.655153,0.534573,0.248862,0.147696
2,0.371712,0.302914,0.822541,0.697086,0.792526,0.803801,0.690568,0.598330,0.943484,0.736308,0.439286,0.946253,0.460090
3,0.441033,0.839396,0.173377,0.160604,0.679834,0.252191,0.520257,0.195162,0.071565,0.302694,0.517978,0.117970,0.829700
4,0.653604,0.707791,0.112971,-0.390844,0.584413,0.294754,0.683503,0.260962,0.160179,0.494046,0.478728,0.130868,0.737070
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,0.567396,0.666189,0.262629,0.333811,0.769177,0.270442,0.724445,0.174819,0.299445,0.443483,0.418148,0.299121,0.797542
1596,0.677868,0.672871,0.746591,0.327129,0.558878,0.203419,0.350918,0.534834,0.777411,0.577087,0.467757,0.410483,0.491315
1597,0.682577,0.761874,0.323727,0.238126,0.788528,0.321883,0.761260,0.138550,0.245615,0.351388,0.531417,0.264231,0.824988
1598,0.828008,0.716314,0.627083,0.283686,0.824404,0.116920,0.215752,0.610742,0.942760,0.471727,0.436688,0.417122,0.227255


In [24]:
# House probability matrix y1
y1 = np.zeros([df_no_house.shape[0], len(y.unique())])
y1 = pd.DataFrame(y1)
y1

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
...,...,...,...,...
1595,0.0,0.0,0.0,0.0
1596,0.0,0.0,0.0,0.0
1597,0.0,0.0,0.0,0.0
1598,0.0,0.0,0.0,0.0


In [25]:
# Fill probability matrix with '1' for corresponding house
for i in range(0, len(y.unique())):
    for j in range(0, len(y1)):
        if y[j] == y.unique()[i]:
            y1.iloc[j, i] = 1
        else:
            y1.iloc[j, i] = 0
y1.head()

Unnamed: 0,0,1,2,3
0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0


In [26]:
y1.shape

(1600, 4)

In [27]:
# Add a bias column
X = pd.concat([pd.Series(1, index=df_no_house.index, name='bias'), df_no_house], axis=1)
X

Unnamed: 0,bias,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,1,0.639887,0.241486,0.731351,0.758514,0.716936,0.623499,0.538679,0.679081,0.692354,0.465490,0.632571,0.793213,0.335649
1,1,0.708357,0.209123,0.196645,0.790877,0.166054,0.274983,0.181768,0.624914,0.791954,0.655153,0.534573,0.248862,0.147696
2,1,0.371712,0.302914,0.822541,0.697086,0.792526,0.803801,0.690568,0.598330,0.943484,0.736308,0.439286,0.946253,0.460090
3,1,0.441033,0.839396,0.173377,0.160604,0.679834,0.252191,0.520257,0.195162,0.071565,0.302694,0.517978,0.117970,0.829700
4,1,0.653604,0.707791,0.112971,-0.390844,0.584413,0.294754,0.683503,0.260962,0.160179,0.494046,0.478728,0.130868,0.737070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,1,0.567396,0.666189,0.262629,0.333811,0.769177,0.270442,0.724445,0.174819,0.299445,0.443483,0.418148,0.299121,0.797542
1596,1,0.677868,0.672871,0.746591,0.327129,0.558878,0.203419,0.350918,0.534834,0.777411,0.577087,0.467757,0.410483,0.491315
1597,1,0.682577,0.761874,0.323727,0.238126,0.788528,0.321883,0.761260,0.138550,0.245615,0.351388,0.531417,0.264231,0.824988
1598,1,0.828008,0.716314,0.627083,0.283686,0.824404,0.116920,0.215752,0.610742,0.942760,0.471727,0.436688,0.417122,0.227255


In [None]:
# Create theta matrix
theta = np.zeros([df_no_house.shape[1]+1, y1.shape[1]])
theta = pd.DataFrame(theta)

type(theta.iloc[:,0])
theta = gradient_descent(X, y1, theta, 0.3, 4000) # CHANGE VALUES HERE
theta

  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np.exp(-(np.dot(theta, X.T))))
  return 1 / (1 + np

In [None]:
# Export thetas to file
np.savetxt('theta.txt', theta)

In [None]:
# Plot loss function
plt.plot(log_loss_results)
plt.xlabel('epochs')
plt.ylabel('log_loss')
plt.show()

In [None]:
# GET ACCURACY LEVEL FOR TESTING

In [127]:
# Calculate probability matrix
prob_matrix = []
for i in range(0, 4):
    theta1 = pd.DataFrame(theta)
    h = hypothesis(theta1.iloc[:,i], X)
    prob_matrix.append(h)
prob_matrix = pd.DataFrame(prob_matrix)

In [128]:
prob_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1590,1591,1592,1593,1594,1595,1596,1597,1598,1599
0,0.93305,0.009513,0.986085,0.00116,0.001899,0.024834,0.00041,0.019386,0.005798,0.007739,...,0.98381,0.014773,0.033904,0.975675,0.009411,0.01734,0.006,0.012412,0.00268,0.007463
1,0.01065,0.991942,0.001723,0.000973,0.00269,0.989163,0.027332,0.002703,0.000456,0.002995,...,0.007027,0.981519,0.000245,0.004128,0.001495,0.001175,0.013616,0.000289,0.013894,0.001173
2,0.002442,0.001005,0.00312,0.99285,0.985595,0.001516,0.990011,0.136811,0.995372,0.003828,...,0.002792,0.001676,0.983453,0.003623,0.027962,0.974026,0.011614,0.986246,0.002235,0.003954
3,0.021662,0.016846,0.009441,0.037274,0.015057,0.00328,0.004139,0.610753,0.009104,0.995102,...,0.006807,0.012364,0.008871,0.004986,0.920703,0.005438,0.895939,0.01158,0.995636,0.997923


In [129]:
# Refactor prediction matrix for accuracy testing
for col in range(0,1600):
    max_prob_index = 0
    i = 0
    for row in range(0,4):
        if prob_matrix[col][row] > prob_matrix[col][max_prob_index]:
            max_prob_index = row
    for row in range(0,4):
        if row == max_prob_index:
            prob_matrix[col][row] = 1
        else:
            prob_matrix[col][row] = 0

prob_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1590,1591,1592,1593,1594,1595,1596,1597,1598,1599
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0


In [130]:
# Get accuracy score through sklearn
from sklearn.metrics import accuracy_score
y_pred = prob_matrix.T
y_true = y1
score = accuracy_score(y_true, y_pred)

# 0.980625 for 6000 epochs, 0.06 learning rate
# 0.980625 for 6000 epochs, 0.08 learning rate
# 0.15, 6000 -> 0.98125
# 0.3, 6000 -> 0.98125
# 0.5, 4000 -> 0.381875
# 0.3, 4000 -> 0.381875
score

0.98125