In [53]:
import pandas as pd
from colors import colors
from matplotlib import pyplot as plt
import math
import numpy as np

In [54]:
def hypothesis(theta, X, eps=1e-7):
    """Logistic (sigmoid) hypothesis: 1 / (1 + exp(-(theta . X^T))).

    Parameters
    ----------
    theta : array-like
        Parameter vector (or a matrix whose rows are parameter vectors).
    X : numpy.ndarray or pandas.DataFrame
        Samples in rows, features in columns; must expose ``.T``.
    eps : float, optional
        Margin kept between the returned probabilities and the exact
        values 0 and 1, so that ``log(p)`` and ``log(1 - p)`` stay finite.

    Returns
    -------
    numpy.ndarray
        Probabilities, each inside ``[eps, 1 - eps]``.
    """
    # Bound the logits so np.exp cannot overflow for extreme inputs.
    z = np.clip(np.dot(theta, X.T), -500.0, 500.0)
    probabilities = 1.0 / (1.0 + np.exp(-z))
    # Clip on both sides; merely subtracting a constant would push
    # saturated-low probabilities below zero and make log(p) NaN.
    return np.clip(probabilities, eps, 1.0 - eps)

In [55]:
def cost(X, y, theta):
    """Mean binary cross-entropy of the logistic hypothesis.

    Parameters
    ----------
    X : samples in rows, features in columns.
    y : 0/1 targets, one per row of ``X``.
    theta : 1-D parameter vector, one entry per column of ``X``.

    Returns
    -------
    float
        ``-(1/n) * sum(y*log(h) + (1-y)*log(1-h))`` with ``n = len(X)``.
    """
    # hypothesis() takes (theta, X) in that order.
    predicted = hypothesis(theta, X)
    log_likelihood = y * np.log(predicted) + (1 - y) * np.log(1 - predicted)
    return -(1 / len(X)) * np.sum(log_likelihood)

In [56]:
def drop_columns(df):
    """Drop 'Index' and all non-numeric columns but "Hogwarts House", in place.

    A column is treated as numeric when its dtype kind is one of
    'biufc' (bool, int, unsigned int, float, complex), see
    https://stackoverflow.com/a/38185438.

    The frame is modified in place and nothing is returned. A frame
    without an 'Index' column is accepted.
    """
    unwanted = [
        column
        for column, dtype in df.dtypes.items()
        if column == 'Index'
        or (dtype.kind not in 'biufc' and column != 'Hogwarts House')
    ]
    # Single drop instead of one in-place drop per column.
    df.drop(columns=unwanted, inplace=True)

In [57]:
def gradient_descent(X, y, theta, L, epochs):
    """Batch gradient descent for one-vs-rest logistic regression.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix. Expected to contain no missing values.
    y : array-like, shape (n_samples, n_classes)
        0/1 target matrix, one column per class.
    theta : array-like, shape (n_features, n_classes)
        Initial parameters, one column per class. Not modified.
    L : float
        Learning rate (step size applied to the averaged gradient).
    epochs : int
        Number of full passes over the data.

    Returns
    -------
    pandas.DataFrame
        Fitted parameters with the same shape as ``theta``; row/column
        labels are kept when ``theta`` is a DataFrame.
    """
    features = np.asarray(X, dtype=float)
    targets = np.asarray(y, dtype=float)
    # np.array copies, so the caller's theta is never written to.
    weights = np.array(theta, dtype=float)
    if targets.ndim == 1:
        targets = targets.reshape(-1, 1)
    if weights.ndim == 1:
        weights = weights.reshape(-1, 1)

    n = len(features)
    for _ in range(epochs):
        # Rows of `predictions` are classes, columns are samples.
        predictions = hypothesis(weights.T, features)
        # Every class column is updated from predictions computed before
        # the update, i.e. one simultaneous step per epoch.
        weights -= (L / n) * (features.T @ (predictions.T - targets))

    if isinstance(theta, pd.DataFrame):
        return pd.DataFrame(weights, index=theta.index, columns=theta.columns)
    return pd.DataFrame(weights)

In [58]:
def predict(X, y, theta=None):
    """Count correct one-vs-rest decisions at a 0.5 threshold.

    For each class column, a sample counts as correct when the
    hypothesis is >= 0.5 and the target is 1, or the hypothesis is
    < 0.5 and the target is 0.

    Parameters
    ----------
    X : samples in rows, features in columns.
    y : array-like, shape (n_samples, n_classes)
        0/1 target matrix.
    theta : array-like, shape (n_features, n_classes), optional
        Parameters, one column per class. When omitted, the
        notebook-level ``theta1`` is used, as before.

    Returns
    -------
    int
        Number of correct decisions summed over all classes (at most
        ``n_samples * n_classes``).
    """
    if theta is None:
        theta = theta1  # backward-compatible fallback to the notebook global
    weights = np.asarray(theta, dtype=float)
    if weights.ndim == 1:
        weights = weights.reshape(-1, 1)
    targets = np.asarray(y)
    if targets.ndim == 1:
        targets = targets.reshape(-1, 1)

    correct = 0
    for i in range(weights.shape[1]):
        h = np.asarray(hypothesis(weights[:, i], X))
        column = targets[:, i]
        hits = ((h >= 0.5) & (column == 1)) | ((h < 0.5) & (column == 0))
        correct += int(np.sum(hits))
    return correct

In [87]:
# Read dataset and preprocess it
try:
    df = pd.read_csv("datasets/dataset_train.csv")
except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
    # Only read/parse failures are handled here; anything else
    # (e.g. KeyboardInterrupt) propagates untouched.
    print(f"{colors().RED}Error: could not read file{colors().END}")
    # Stop with a non-zero status and keep the original cause attached.
    raise SystemExit(1) from err
drop_columns(df)
df

Unnamed: 0,Hogwarts House,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,Ravenclaw,58384.0,-487.886086,5.727180,4.878861,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,0.715939,-232.79405,-26.89
1,Slytherin,67239.0,-552.060507,-5.987446,5.520605,-5.612,-487.340557,367.760303,4.107170,1058.944592,7.248742,0.091674,-252.18425,-113.45
2,Ravenclaw,23702.0,-366.076117,7.725017,3.660761,6.140,664.893521,602.585284,3.555579,1088.088348,8.728531,-0.515327,-227.34265,30.42
3,Gryffindor,32667.0,697.742809,-6.497214,-6.977428,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-0.014040,-256.84675,200.64
4,Gryffindor,60158.0,436.775204,-7.820623,,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-0.264070,-256.38730,157.98
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,Gryffindor,49009.0,354.280086,-4.541837,-3.542801,5.702,-497.235066,618.220213,-5.231721,964.219853,3.389086,-0.649983,-250.39401,185.83
1596,Slytherin,63296.0,367.531174,6.061064,-3.675312,1.757,-643.271092,445.827565,2.238112,1056.147366,5.825263,-0.333962,-246.42719,44.80
1597,Gryffindor,63905.0,544.018925,-3.203269,-5.440189,6.065,-385.150457,635.211486,-5.984257,953.866685,1.709808,0.071569,-251.63679,198.47
1598,Hufflepuff,82713.0,453.676219,3.442831,-4.536762,6.738,-831.741123,383.444937,3.813111,1087.949205,3.904100,-0.531875,-246.19072,-76.81


In [88]:
# Normalize data: min-max scale every numeric column to [0, 1]
numeric_columns = [
    column for column in df
    if df[column].dtype.kind in 'biufc'  # https://stackoverflow.com/a/38185438
]
if numeric_columns:
    col_min = df[numeric_columns].min()
    col_max = df[numeric_columns].max()
    scaled = (df[numeric_columns] - col_min) / (col_max - col_min)
    # make missing data zero (also covers 0/0 from constant columns)
    df[numeric_columns] = scaled.fillna(0)

In [89]:
#plt.scatter(df.Flying, df["Hogwarts House"])

In [90]:
from sklearn.model_selection import train_test_split

In [91]:
# Separate the output variable (the house) from the input variables
df_only_house = df["Hogwarts House"]
df_without_house = df.drop(columns=["Hogwarts House"])

In [64]:
# Display the input variables (every column except "Hogwarts House")
df_without_house

Unnamed: 0,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,0.639887,0.241486,0.731351,0.758514,0.716936,0.623499,0.538679,0.679081,0.692354,0.465490,0.632571,0.793213,0.335649
1,0.708357,0.209123,0.196645,0.790877,0.166054,0.274983,0.181768,0.624914,0.791954,0.655153,0.534573,0.248862,0.147696
2,0.371712,0.302914,0.822541,0.697086,0.792526,0.803801,0.690568,0.598330,0.943484,0.736308,0.439286,0.946253,0.460090
3,0.441033,0.839396,0.173377,0.160604,0.679834,0.252191,0.520257,0.195162,0.071565,0.302694,0.517978,0.117970,0.829700
4,0.653604,0.707791,0.112971,0.000000,0.584413,0.294754,0.683503,0.260962,0.160179,0.494046,0.478728,0.130868,0.737070
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,0.567396,0.666189,0.262629,0.333811,0.769177,0.270442,0.724445,0.174819,0.299445,0.443483,0.418148,0.299121,0.797542
1596,0.677868,0.672871,0.746591,0.327129,0.558878,0.203419,0.350918,0.534834,0.777411,0.577087,0.467757,0.410483,0.491315
1597,0.682577,0.761874,0.323727,0.238126,0.788528,0.321883,0.761260,0.138550,0.245615,0.351388,0.531417,0.264231,0.824988
1598,0.828008,0.716314,0.627083,0.283686,0.824404,0.116920,0.215752,0.610742,0.942760,0.471727,0.436688,0.417122,0.227255


In [65]:
# House membership matrix: one row per student, one column per distinct house.
# Starts as all zeros; gets filled with '1' for the corresponding house.
y1 = pd.DataFrame(
    np.zeros((len(df_without_house), len(df_only_house.unique())))
)
y1

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
...,...,...,...,...
1595,0.0,0.0,0.0,0.0
1596,0.0,0.0,0.0,0.0
1597,0.0,0.0,0.0,0.0
1598,0.0,0.0,0.0,0.0


In [66]:
# One-hot encode the houses: column i is 1.0 where the student belongs to the
# i-th entry of df_only_house.unique(), 0.0 elsewhere.
# unique() is evaluated once, and rows are matched by position rather than
# by index label.
houses = df_only_house.unique()
y1 = pd.DataFrame({
    position: (df_only_house == house).to_numpy(dtype=float)
    for position, house in enumerate(houses)
})
y1.head()

Unnamed: 0,0,1,2,3
0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0


In [67]:
# Check the last rows of the house membership matrix
y1.tail()

Unnamed: 0,0,1,2,3
1595,0.0,0.0,1.0,0.0
1596,0.0,1.0,0.0,0.0
1597,0.0,0.0,1.0,0.0
1598,0.0,0.0,0.0,1.0
1599,0.0,0.0,0.0,1.0


In [68]:
# (number of students, number of houses)
y1.shape

(1600, 4)

In [69]:
# Zero-initialised parameters: one row per input column plus one extra row,
# one column per column of y1
theta = np.zeros((len(df_without_house.columns) + 1, len(y1.columns)))
theta.shape

(14, 4)

In [70]:
# Prepend a constant 'bias' column of ones to the input variables
X = df_without_house.copy()
X.insert(0, 'bias', 1, allow_duplicates=True)
X

Unnamed: 0,bias,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,1,0.639887,0.241486,0.731351,0.758514,0.716936,0.623499,0.538679,0.679081,0.692354,0.465490,0.632571,0.793213,0.335649
1,1,0.708357,0.209123,0.196645,0.790877,0.166054,0.274983,0.181768,0.624914,0.791954,0.655153,0.534573,0.248862,0.147696
2,1,0.371712,0.302914,0.822541,0.697086,0.792526,0.803801,0.690568,0.598330,0.943484,0.736308,0.439286,0.946253,0.460090
3,1,0.441033,0.839396,0.173377,0.160604,0.679834,0.252191,0.520257,0.195162,0.071565,0.302694,0.517978,0.117970,0.829700
4,1,0.653604,0.707791,0.112971,0.000000,0.584413,0.294754,0.683503,0.260962,0.160179,0.494046,0.478728,0.130868,0.737070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,1,0.567396,0.666189,0.262629,0.333811,0.769177,0.270442,0.724445,0.174819,0.299445,0.443483,0.418148,0.299121,0.797542
1596,1,0.677868,0.672871,0.746591,0.327129,0.558878,0.203419,0.350918,0.534834,0.777411,0.577087,0.467757,0.410483,0.491315
1597,1,0.682577,0.761874,0.323727,0.238126,0.788528,0.321883,0.761260,0.138550,0.245615,0.351388,0.531417,0.264231,0.824988
1598,1,0.828008,0.716314,0.627083,0.283686,0.824404,0.116920,0.215752,0.610742,0.942760,0.471727,0.436688,0.417122,0.227255


In [71]:
# 1-D theta vector filled with '1': one entry per input column plus one
theta = np.ones(len(df_without_house.columns) + 1)
theta.shape

(14,)

In [72]:
# Evaluate the hypothesis on X with the current theta and show the result
h = hypothesis(theta, X)
h

array([0.99985635, 0.99854442, 0.99993251, ..., 0.99927891, 0.99955555,
       0.999556  ])

In [73]:
# Wrap theta in a DataFrame and check what type a single column slice has
theta = pd.DataFrame(theta)
type(theta.iloc[:,0])

pandas.core.series.Series

In [74]:
# X is the input matrix with the leading 'bias' column; y1 is the house
# membership matrix. Training starts from an all-zero theta.
learning_rate = 0.02
epochs = 1000  # CHANGE the epochs!
initial_theta = np.zeros((df_without_house.shape[1] + 1, y1.shape[1]))
theta = gradient_descent(X, y1, initial_theta, learning_rate, epochs)
theta

Unnamed: 0,0,1,2,3
0,-0.861099,-0.080378,-0.089486,-0.300547
1,-0.483956,-0.04962,-0.079314,-0.108317
2,-1.365923,-0.798525,0.515254,0.907747
3,0.335411,-0.800558,-0.780483,0.80676
4,0.543961,0.715236,-0.587689,-1.218754
5,0.007459,-1.238483,0.234697,0.331139
6,0.980283,-0.358003,-0.308268,-0.641691
7,0.606639,-0.604024,0.636438,-1.147217
8,-0.136996,0.214778,-1.044724,0.388713
9,-0.165755,0.236956,-1.155117,0.435634


In [75]:
# Export the trained thetas to 'theta.txt' (path is relative to the working directory)
np.savetxt('theta.txt', theta)


In [85]:
# Display the input matrix (with the 'bias' column) again
X

Unnamed: 0,bias,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,1,0.639887,0.241486,0.731351,0.758514,0.716936,0.623499,0.538679,0.679081,0.692354,0.465490,0.632571,0.793213,0.335649
1,1,0.708357,0.209123,0.196645,0.790877,0.166054,0.274983,0.181768,0.624914,0.791954,0.655153,0.534573,0.248862,0.147696
2,1,0.371712,0.302914,0.822541,0.697086,0.792526,0.803801,0.690568,0.598330,0.943484,0.736308,0.439286,0.946253,0.460090
3,1,0.441033,0.839396,0.173377,0.160604,0.679834,0.252191,0.520257,0.195162,0.071565,0.302694,0.517978,0.117970,0.829700
4,1,0.653604,0.707791,0.112971,0.000000,0.584413,0.294754,0.683503,0.260962,0.160179,0.494046,0.478728,0.130868,0.737070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,1,0.567396,0.666189,0.262629,0.333811,0.769177,0.270442,0.724445,0.174819,0.299445,0.443483,0.418148,0.299121,0.797542
1596,1,0.677868,0.672871,0.746591,0.327129,0.558878,0.203419,0.350918,0.534834,0.777411,0.577087,0.467757,0.410483,0.491315
1597,1,0.682577,0.761874,0.323727,0.238126,0.788528,0.321883,0.761260,0.138550,0.245615,0.351388,0.531417,0.264231,0.824988
1598,1,0.828008,0.716314,0.627083,0.283686,0.824404,0.116920,0.215752,0.610742,0.942760,0.471727,0.436688,0.417122,0.227255


In [77]:
# Evaluate the hypothesis for the theta column at position 3 on every row of X
theta = pd.DataFrame(theta)
hypothesis(theta.iloc[:,3], X)

array([0.21758865, 0.23240275, 0.1954142 , ..., 0.25752339, 0.64077056,
       0.69015536])

In [78]:
# Hypothesis value of every theta column (rows of `output`) for every row of X
# (columns of `output`). The number of classes is taken from theta rather than
# hard-coded, and the DataFrame wrapper is built once instead of per iteration.
theta1 = pd.DataFrame(theta)
output = pd.DataFrame(
    [hypothesis(theta1.iloc[:, house], X) for house in range(theta1.shape[1])]
)

In [79]:
# Display the hypothesis values: one row per theta column, one column per row of X
output

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1590,1591,1592,1593,1594,1595,1596,1597,1598,1599
0,0.542954,0.26183,0.631422,0.105236,0.115641,0.306095,0.089511,0.195999,0.136504,0.174364,...,0.655878,0.271895,0.186276,0.572144,0.154382,0.177556,0.162591,0.15069,0.136042,0.160592
1,0.127377,0.523636,0.091038,0.080807,0.097961,0.516369,0.15677,0.095527,0.06769,0.098646,...,0.124524,0.48713,0.057144,0.10226,0.08204,0.079876,0.12329,0.058561,0.119819,0.082223
2,0.062673,0.07471,0.050533,0.681012,0.629131,0.067266,0.703215,0.237061,0.674599,0.114048,...,0.056103,0.082545,0.585186,0.059797,0.142372,0.543368,0.137108,0.603811,0.10077,0.112755
3,0.217589,0.232403,0.195414,0.328718,0.284704,0.184929,0.244481,0.442324,0.26184,0.653399,...,0.187566,0.224842,0.242322,0.16878,0.52658,0.229799,0.503292,0.257523,0.640771,0.690155


In [80]:
# Spot check: hypothesis value of theta column 2 for the row of X at position 50
output.iloc[2, 50]

0.5558578413972823

In [84]:
# Turn the values in `output` into one-hot predictions: within each column,
# the row holding the largest value becomes 1 and the others become 0
# (the first row wins on ties).
# Built as a new frame so `output` itself is left unchanged, and without
# chained `frame[col][row] = ...` assignment.
probabilities = output.to_numpy(dtype=float)
winners = probabilities.argmax(axis=0)
one_hot = np.zeros_like(probabilities)
one_hot[winners, np.arange(probabilities.shape[1])] = 1
eval_output = pd.DataFrame(one_hot, index=output.index, columns=output.columns)
#eval_output

In [82]:
#accuracy = 0
#for col in range(0, 4):
#    for row in range(len(y1)):
#        if y1.iloc[row, col] == 1 and output.iloc[col, row] >= 0.5:
#            accuracy += 1
#accuracy= accuracy/len(X)
#accuracy

In [83]:
from sklearn.metrics import accuracy_score
# eval_output has one column per row of X, so transpose it to line up with y1
# (one row per student, one column per house).
# NOTE(review): the predictions come from the same X that was passed to
# gradient_descent, so this is accuracy on the training data.
y_pred = eval_output.T
y_true = y1
score = accuracy_score(y_true, y_pred)

# Same comparison with normalize=False; score2 is not used afterwards.
score2 = accuracy_score(y_true, y_pred, normalize=False)
score

0.97375