In [3]:
import pandas as pd
from colors import colors
from matplotlib import pyplot as plt
import math
import numpy as np

In [4]:
def hypothesis(theta, X):
    return 1 / (1 + np.exp(-(np.dot(theta, X.T)))) - 0.0000001

In [5]:
def cost(X, y, theta):
    y1 = hypothesis(X, theta)
    return -(1/len(X)) * np.sum(y * np.log(y1) + (1 - y) * np.log(1 - y1))

In [6]:
# drops all non-numeric columns but "Hogwarts House"
def drop_columns(df):
    # drop index column
    df.drop(columns=['Index'], inplace=True)
    for column in df:
        if df[column].dtype.kind in 'biufc': # https://stackoverflow.com/a/38185438
            continue
        if column == 'Hogwarts House':
            continue
        else:
            df.drop(columns=[column], inplace=True)

In [8]:
# CHANGE THIS TO MY GRADIENT DESCENT
def gradient_descent(X, y, theta, L, epochs):
    n = len(X)
    theta = pd.DataFrame(theta)
    for i in range(0, epochs):
        for j in range(0, theta.shape[1]):
            # [:,j] takes the j'th column of the theta 2d array
            h = hypothesis(theta.iloc[:,j], X)
            for k in range(0, theta.shape[0]):
                theta.iloc[k, j] -= (L/n) * np.sum((h - y.iloc[:, j]) * X.iloc[:, k])
        
    return theta

In [9]:
def predict(X, y):
    accuracy = 0
    for i in range(4):
        h = hypothesis(theta1.iloc[:,j], X)
        for n in range(0, len(h)):
            if h[n] >= 0.5 and y1.iloc[n, i] == 1:
                accuracy += 1
            elif h[n] < 0.5 and y1.iloc[n, i] == 0:
                accuracy += 1
    return accuracy   

In [10]:
# Read dataset and preprocess it
try:
    df = pd.read_csv("datasets/dataset_train.csv")
except:
    print(f"{colors().RED}Error: could not read file{colors().END}")
    exit()
drop_columns(df)

In [11]:
# Normalize data
for column in df:
    if df[column].dtype.kind not in 'biufc': # https://stackoverflow.com/a/38185438
        continue
    max_norm = df[column].max()
    min_norm = df[column].min()

    for i in range(len(df)):
        df.iloc[i, df.columns.get_loc(column)] = (df.iloc[i, df.columns.get_loc(column)] - min_norm) / (max_norm - min_norm)
        # make missing data zero
        if math.isnan(df.iloc[i, df.columns.get_loc(column)]) == True:
            #print(df.iloc[i, df.columns.get_loc(column)])
            df.iloc[i, df.columns.get_loc(column)] = 0
            #print(df.iloc[i, df.columns.get_loc(column)])

In [12]:
#plt.scatter(df.Flying, df["Hogwarts House"])

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
#split in input and output variables
df_without_house = df.drop(columns=["Hogwarts House"])
df_only_house = df["Hogwarts House"]

In [15]:
df_without_house

Unnamed: 0,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,0.639887,0.241486,0.731351,0.758514,0.716936,0.623499,0.538679,0.679081,0.692354,0.465490,0.632571,0.793213,0.335649
1,0.708357,0.209123,0.196645,0.790877,0.166054,0.274983,0.181768,0.624914,0.791954,0.655153,0.534573,0.248862,0.147696
2,0.371712,0.302914,0.822541,0.697086,0.792526,0.803801,0.690568,0.598330,0.943484,0.736308,0.439286,0.946253,0.460090
3,0.441033,0.839396,0.173377,0.160604,0.679834,0.252191,0.520257,0.195162,0.071565,0.302694,0.517978,0.117970,0.829700
4,0.653604,0.707791,0.112971,0.000000,0.584413,0.294754,0.683503,0.260962,0.160179,0.494046,0.478728,0.130868,0.737070
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,0.567396,0.666189,0.262629,0.333811,0.769177,0.270442,0.724445,0.174819,0.299445,0.443483,0.418148,0.299121,0.797542
1596,0.677868,0.672871,0.746591,0.327129,0.558878,0.203419,0.350918,0.534834,0.777411,0.577087,0.467757,0.410483,0.491315
1597,0.682577,0.761874,0.323727,0.238126,0.788528,0.321883,0.761260,0.138550,0.245615,0.351388,0.531417,0.264231,0.824988
1598,0.828008,0.716314,0.627083,0.283686,0.824404,0.116920,0.215752,0.610742,0.942760,0.471727,0.436688,0.417122,0.227255


In [16]:
# House belonging matrix (get's filled with '1' for corresponding house)
y1 = np.zeros([df_without_house.shape[0], len(df_only_house.unique())])
y1 = pd.DataFrame(y1)
y1

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
...,...,...,...,...
1595,0.0,0.0,0.0,0.0
1596,0.0,0.0,0.0,0.0
1597,0.0,0.0,0.0,0.0
1598,0.0,0.0,0.0,0.0


In [17]:
for i in range(0, len(df_only_house.unique())):
    for j in range(0, len(y1)):
        if df_only_house[j] == df_only_house.unique()[i]:
            y1.iloc[j, i] = 1
        else:
            y1.iloc[j, i] = 0
y1.head()

Unnamed: 0,0,1,2,3
0,1.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0


In [18]:
y1.tail()

Unnamed: 0,0,1,2,3
1595,0.0,0.0,1.0,0.0
1596,0.0,1.0,0.0,0.0
1597,0.0,0.0,1.0,0.0
1598,0.0,0.0,0.0,1.0
1599,0.0,0.0,0.0,1.0


In [19]:
y1.shape

(1600, 4)

In [20]:
theta = np.zeros([df_without_house.shape[1]+1, y1.shape[1]])
theta.shape

(14, 4)

In [21]:
#add a bias column
X = pd.concat([pd.Series(1, index=df_without_house.index, name='bias'), df_without_house], axis=1)
X.shape

(1600, 14)

In [22]:
# theta array, all filled with '1'
theta = np.ones(df_without_house.shape[1]+1)
theta.shape

(14,)

In [23]:
h = hypothesis(theta, X)
h

array([0.99985635, 0.99854442, 0.99993251, ..., 0.99927891, 0.99955555,
       0.999556  ])

In [24]:
theta = pd.DataFrame(theta)
type(theta.iloc[:,0])

pandas.core.series.Series

In [25]:
# WHAT IS BIG X????
theta = np.zeros([df_without_house.shape[1]+1, y1.shape[1]])
theta = gradient_descent(X, y1, theta, 0.02, 1000) # CHANGE the epochs!
theta

Unnamed: 0,0,1,2,3
0,-0.861099,-0.080378,-0.089486,-0.300547
1,-0.483956,-0.04962,-0.079314,-0.108317
2,-1.365923,-0.798525,0.515254,0.907747
3,0.335411,-0.800558,-0.780483,0.80676
4,0.543961,0.715236,-0.587689,-1.218754
5,0.007459,-1.238483,0.234697,0.331139
6,0.980283,-0.358003,-0.308268,-0.641691
7,0.606639,-0.604024,0.636438,-1.147217
8,-0.136996,0.214778,-1.044724,0.388713
9,-0.165755,0.236956,-1.155117,0.435634


In [47]:
# export thetas to .txt file
np.savetxt('theta.txt', theta)


array([[-0.86109874, -0.08037759, -0.08948608, -0.3005474 ],
       [-0.48395576, -0.04961991, -0.07931385, -0.10831676],
       [-1.36592318, -0.79852511,  0.51525423,  0.90774716],
       [ 0.33541117, -0.80055755, -0.78048331,  0.80676041],
       [ 0.543961  ,  0.71523615, -0.58768908, -1.21875386],
       [ 0.00745874, -1.23848265,  0.23469668,  0.33113939],
       [ 0.98028345, -0.35800334, -0.30826797, -0.6416909 ],
       [ 0.60663899, -0.6040243 ,  0.63643845, -1.14721692],
       [-0.13699574,  0.21477773, -1.04472373,  0.38871347],
       [-0.16575458,  0.2369563 , -1.15511685,  0.43563397],
       [-0.29438448,  0.48570565, -0.45661444, -0.42319896],
       [-0.3649095 , -0.05876129, -0.05914943, -0.12044309],
       [ 1.039717  , -0.55080571, -0.70389817, -0.16427318],
       [-0.52863188, -0.60135093,  0.97097382, -0.52474526]])

In [26]:
theta = pd.DataFrame(theta)
hypothesis(theta.iloc[:,3], X)

array([0.21758865, 0.23240275, 0.1954142 , ..., 0.25752339, 0.64077056,
       0.69015536])

In [27]:
output = []
for i in range(0, 4):
    theta1 = pd.DataFrame(theta)
    h = hypothesis(theta1.iloc[:,i], X)
    output.append(h)
output=pd.DataFrame(output)

In [40]:
output

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1590,1591,1592,1593,1594,1595,1596,1597,1598,1599
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0


In [41]:
output.iloc[2, 50]

1.0

In [42]:
eval_output = output
for col in range(0,1600):
    max_prob_index = 0
    i = 0
    for row in range(0,4):
        print(output[col][row], " vs ", output[col][max_prob_index])
        if output[col][row] > output[col][max_prob_index]:
            max_prob_index = row
    for row in range(0,4):
        if row == max_prob_index:
            eval_output[col][row] = 1
        else:
            eval_output[col][row] = 0
        
       
eval_output = pd.DataFrame(eval_output)
#eval_output

1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  0.0

0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  0.0

0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
1.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  0.0
0.0  vs  0.0
1.0  vs  0.0
0.0  vs  1.0
0.0  vs  0.0

In [33]:
#accuracy = 0
#for col in range(0, 4):
#    for row in range(len(y1)):
#        if y1.iloc[row, col] == 1 and output.iloc[col, row] >= 0.5:
#            accuracy += 1
#accuracy= accuracy/len(X)
#accuracy

In [36]:
from sklearn.metrics import accuracy_score
y_pred = eval_output.T
y_true = y1
score = accuracy_score(y_true, y_pred)

score2 = accuracy_score(y_true, y_pred, normalize=False)
score

0.97375