In [166]:
import pandas as pd
from colors import colors
from matplotlib import pyplot as plt
import math
import numpy as np

In [167]:
# Drop all non-numeric columns but "Hogwarts House"
def drop_columns(df):
    """Reduce ``df``, in place, to its numeric columns plus 'Hogwarts House'.

    The 'Index' column is removed unconditionally first. Every remaining
    column is then kept only if its dtype kind is one of 'biufc' (bool,
    signed/unsigned int, float, complex) or if it is named 'Hogwarts House'.

    Args:
        df: pandas DataFrame to modify. It must contain an 'Index' column.

    Returns:
        None. The frame passed in is mutated.
    """
    df.drop(columns=['Index'], inplace=True)

    numeric_kinds = 'biufc'  # https://stackoverflow.com/a/38185438
    unwanted = [
        name
        for name in df.columns
        if name != 'Hogwarts House' and df[name].dtype.kind not in numeric_kinds
    ]
    # A single drop call removes every rejected column at once.
    df.drop(columns=unwanted, inplace=True)

In [168]:
def hypothesis(theta, X):
    """Evaluate the logistic (sigmoid) hypothesis for every row of ``X``.

    Computes ``1 / (1 + exp(-(theta . X^T)))``.

    Args:
        theta: parameter vector; it is passed to ``np.dot`` as the left operand.
        X: feature matrix exposing ``.T``; its transpose is the right operand.

    Returns:
        The element-wise sigmoid of ``np.dot(theta, X.T)``.
    """
    linear_score = np.dot(theta, X.T)
    denominator = 1 + np.exp(-linear_score)
    return 1 / denominator

In [169]:
# Read test dataset and preprocess it
try:
    df = pd.read_csv("datasets/dataset_test.csv")
except (OSError, ValueError):
    # OSError covers a missing or unreadable file. ValueError covers pandas'
    # EmptyDataError / ParserError and UnicodeDecodeError, which subclass it.
    # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
    print(f"{colors().RED}Error: could not read dataset test file{colors().END}")
    # Raising SystemExit stops this cell immediately and gives a non-zero
    # status when run as a script; IPython's exit() only schedules a shutdown
    # and would let the line below run with `df` undefined.
    raise SystemExit(1)
# Strip 'Index' and every non-numeric column except 'Hogwarts House' (in place).
drop_columns(df)

In [170]:
# Read theta file
try:
    theta = np.loadtxt('theta.txt')
except (OSError, ValueError):
    # OSError: the file is missing or unreadable. ValueError: the content
    # could not be parsed as numbers. Other exceptions are not swallowed.
    print(f"{colors().RED}Error: could not read theta.txt file. Train the model first!{colors().END}")
    # Stop the cell right here with a non-zero status instead of relying on
    # exit(), which under IPython does not interrupt the running cell.
    raise SystemExit(1)
theta

array([[-1.98564205,  0.35971699,  0.07560657, -0.65372273],
       [-1.01737624,  0.17810731, -0.07009833, -0.15124263],
       [-2.85128161, -1.60380949,  1.07304057,  2.17314934],
       [ 0.95722307, -1.82414664, -1.65831259,  2.2254142 ],
       [ 1.05764953,  1.89363013, -0.97104679, -2.86651407],
       [ 0.38204486, -2.77152843,  0.67753701,  0.8930923 ],
       [ 2.36240549, -0.84459806, -0.44202777, -1.49391785],
       [ 1.70320836, -1.30576897,  1.63606276, -2.80462193],
       [-0.38959848,  0.65826954, -2.09518444,  1.12689259],
       [-0.47265054,  0.76312768, -2.32650802,  1.28701875],
       [-0.74802519,  1.39894325, -0.73250576, -0.94630632],
       [-0.70220588,  0.1071449 , -0.01130774, -0.22177326],
       [ 2.62027335, -1.3378233 , -1.33626783, -0.19376811],
       [-1.02547168, -1.12584365,  2.24959613, -1.40361996]])

In [143]:
# Normalize data
# Min-max scale every numeric column of `df` using that column's own min/max.
# NOTE(review): the statistics come from this (test) frame itself; if theta was
# trained on features scaled with the training set's min/max, the two scalings
# differ -- confirm against the training notebook.
for column in df:
    if df[column].dtype.kind not in 'biufc': # https://stackoverflow.com/a/38185438
        continue
    max_norm = df[column].max()
    min_norm = df[column].min()

    # Vectorized replacement for the per-cell iloc loop: same arithmetic on the
    # whole column at once. NaN results (missing values, or 0/0 when the column
    # is constant or entirely missing) are set to zero, as before.
    df[column] = ((df[column] - min_norm) / (max_norm - min_norm)).fillna(0)

In [144]:
# Feature matrix: every remaining column except the target column.
X = df.drop(columns=["Hogwarts House"])
# Fix overflow
# np.longdouble is the portable name for the platform's extended-precision
# float; np.float128 is only defined on platforms where that type is 128 bits
# wide, so referencing it directly raises AttributeError elsewhere.
X = X.astype(np.longdouble)

In [145]:
# Prepend a constant-1 'bias' column so the first entry of each parameter
# vector acts as the intercept. The guard keeps the cell idempotent:
# re-running it no longer stacks a second 'bias' column onto X.
if 'bias' not in X.columns:
    X = pd.concat([pd.Series(1, index=X.index, name='bias'), X], axis=1)
X

Unnamed: 0,bias,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,1,0.389736,0.896002,0.656189,0.103998,0.963781,0.341390,0.211814,0.649164,0.724280,0.331983,0.534007,0.454333,0.304965
1,1,0.428703,0.258180,0.653352,0.741820,0.865670,0.789595,0.813900,0.777457,0.731072,0.664969,0.567509,0.843836,0.275291
2,1,0.409272,0.671351,0.180829,0.328649,0.762614,0.207635,0.771464,0.092462,0.152850,0.207042,0.201422,0.202975,0.806832
3,1,0.601788,0.723300,0.756481,0.000000,0.000000,0.298966,0.311040,0.000000,0.706951,0.399849,0.664154,0.468068,0.310764
4,1,0.000000,0.652242,0.693237,0.347758,0.778519,0.298687,0.190496,0.573330,0.748292,0.268872,0.489287,0.468616,0.320225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,1,0.434008,0.148580,0.208612,0.851420,0.261154,0.226977,0.211341,0.810976,0.728972,0.795131,0.782324,0.338630,0.114168
396,1,0.496870,0.857824,0.849035,0.142176,0.683684,0.413742,0.000000,0.632857,0.666977,0.068191,0.295692,0.383515,0.300481
397,1,0.603374,0.654496,0.770531,0.345504,0.739441,0.128387,0.327003,0.631533,0.843696,0.526223,0.096201,0.468165,0.339876
398,1,0.540753,0.045616,0.848040,0.954384,0.720260,0.784233,0.844156,0.822142,0.700015,0.598974,0.342673,0.933878,0.294049


In [146]:
# Re-display the parameter array loaded from theta.txt.
# NOTE(review): the values recorded under this cell differ from the ones shown
# where theta was loaded, and the execution counters are out of order -- the
# saved outputs appear to come from different runs. Restart and run all.
theta

array([[-1.30842597,  0.09063195, -0.00584075, -0.43364815],
       [-0.71314924,  0.04040232, -0.0591221 , -0.13050905],
       [-2.02244881, -1.14375572,  0.78868873,  1.44543389],
       [ 0.58999255, -1.21888822, -1.14768385,  1.37558249],
       [ 0.79508076,  1.21619153, -0.77291328, -1.90004048],
       [ 0.14164945, -1.87535562,  0.42871403,  0.56613219],
       [ 1.56779887, -0.5419181 , -0.38267174, -0.98790389],
       [ 1.04565092, -0.88903655,  1.05867896, -1.82592347],
       [-0.22701677,  0.41575165, -1.50428777,  0.69156963],
       [-0.27389345,  0.46817948, -1.66470866,  0.78007976],
       [-0.46991131,  0.87727553, -0.58750899, -0.63378251],
       [-0.51837055,  0.01010648, -0.02988891, -0.16102216],
       [ 1.689236  , -0.8510132 , -0.98577341, -0.18611349],
       [-0.74345358, -0.83579798,  1.53027328, -0.86687729]])

In [147]:
# Wrap the parameter array in a DataFrame so that one classifier's weights can
# be selected by column position.
theta = pd.DataFrame(theta)
# Spot check: probabilities produced by the parameters stored in column 3.
hypothesis(X=X, theta=theta.iloc[:, 3])

array([0.84629988, 0.0994094 , 0.14282055, 0.63302619, 0.72432855,
       0.19160855, 0.06493524, 0.82054384, 0.0848257 , 0.67504015,
       0.91636307, 0.16786683, 0.09876902, 0.17164146, 0.16105422,
       0.93275762, 0.13079273, 0.20222029, 0.10117381, 0.83110487,
       0.78077029, 0.14623555, 0.12944182, 0.05664467, 0.49833938,
       0.71321823, 0.04374951, 0.11324745, 0.17132754, 0.06195397,
       0.81106882, 0.70789126, 0.16371735, 0.83084325, 0.80501463,
       0.2101149 , 0.22544631, 0.07166453, 0.12557087, 0.06383188,
       0.06636341, 0.14098687, 0.92499131, 0.15293824, 0.80150546,
       0.14012889, 0.15063544, 0.33621446, 0.1115671 , 0.11586696,
       0.15203681, 0.83320049, 0.08701551, 0.13341027, 0.81306918,
       0.161659  , 0.11195215, 0.10499092, 0.78203472, 0.23880992,
       0.16683335, 0.17771137, 0.73667961, 0.80071703, 0.07932015,
       0.73921947, 0.59948297, 0.128912  , 0.75185867, 0.06997718,
       0.09132069, 0.17260814, 0.31623269, 0.12297208, 0.13364

In [148]:
# Build one row of probabilities per parameter column of theta: row i holds
# hypothesis(theta[:, i], X) for every sample (samples run along the columns).
# The DataFrame wrapper is created once instead of on every iteration, and the
# number of classifiers is taken from theta rather than hard-coded to 4.
theta1 = pd.DataFrame(theta)
output = pd.DataFrame(
    [hypothesis(theta1.iloc[:, i], X) for i in range(theta1.shape[1])]
)
output

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,0.093364,0.783801,0.133383,0.106566,0.209846,0.198913,0.831739,0.067117,0.860733,0.175514,...,0.317842,0.113084,0.170136,0.7098,0.7838,0.234986,0.106651,0.139605,0.901013,0.502929
1,0.029073,0.072424,0.036849,0.114809,0.062646,0.815814,0.070837,0.048729,0.053297,0.063683,...,0.610852,0.03443,0.030017,0.188279,0.023708,0.714859,0.039998,0.078574,0.101478,0.044311
2,0.116774,0.031527,0.834174,0.176279,0.084143,0.025503,0.046622,0.098825,0.041107,0.075497,...,0.040849,0.077485,0.285661,0.116508,0.049136,0.02543,0.088052,0.064732,0.017205,0.071186
3,0.8463,0.099409,0.142821,0.633026,0.724329,0.191609,0.064935,0.820544,0.084826,0.67504,...,0.146866,0.832829,0.680954,0.021805,0.154987,0.150463,0.89593,0.710114,0.060521,0.608259


In [165]:
# Row i of `output` holds the probabilities from classifier i; this list maps
# that row position to a house name (same order as the former match/case).
HOUSES = ["Ravenclaw", "Slytherin", "Gryffindor", "Hufflepuff"]

class_probs = output.to_numpy()
if class_probs.shape[0] != len(HOUSES):
    raise ValueError(
        f"expected {len(HOUSES)} rows of class probabilities, got {class_probs.shape[0]}"
    )

# For every sample (column) take the row with the highest probability.
# argmax returns the first maximum on ties, matching the strict '>' scan it
# replaces, and works for any number of samples instead of a fixed 400.
best_class = class_probs.argmax(axis=0)

# [sample label, predicted house] pairs, in column order.
prediction_array = [
    [col, HOUSES[idx]] for col, idx in zip(output.columns, best_class)
]

# One-hot view of the decision (1.0 on the winning row of each column). It is
# built as a fresh frame so `output` is not modified; previously this name was
# only assigned in commented-out code, so displaying it failed on a fresh kernel.
eval_output = pd.DataFrame(
    (np.arange(len(HOUSES))[:, None] == best_class).astype(float),
    index=output.index,
    columns=output.columns,
)
eval_output

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0


In [164]:
# Name the columns explicitly: a later cell selects 'Hogwarts House' by name,
# and the CSV export would otherwise get the headers "0" and "1".
prediction = pd.DataFrame(prediction_array, columns=['Index', 'Hogwarts House'])
prediction

Unnamed: 0,0,1
0,0,Hufflepuff
1,1,Ravenclaw
2,2,Gryffindor
3,3,Hufflepuff
4,4,Hufflepuff
...,...,...
395,395,Slytherin
396,396,Hufflepuff
397,397,Hufflepuff
398,398,Ravenclaw


In [151]:
# Write the predictions to houses.csv; index=False leaves the DataFrame's own
# row index out of the file.
prediction.to_csv('houses.csv',index=False)

In [160]:
# Keep only the predicted labels, as a one-column DataFrame (transposing a
# Series is a no-op, so no transpose is needed).
prediction = prediction['Hogwarts House'].to_frame()
prediction

Unnamed: 0,Hogwarts House
0,Hufflepuff
1,Ravenclaw
2,Gryffindor
3,Hufflepuff
4,Hufflepuff
...,...
395,Slytherin
396,Hufflepuff
397,Hufflepuff
398,Ravenclaw


In [161]:
# NOTE(review): `y1` is not assigned in any cell visible in this notebook; it
# only exists as leftover kernel state, so this cell fails on a fresh kernel.
# Define it here (or load the reference labels) before relying on it.
y1

Unnamed: 0,0
0,1.0
1,1.0
2,1.0
3,1.0
4,1.0
...,...
395,1.0
396,1.0
397,1.0
398,1.0


In [163]:
# Compare the predicted labels with the reference labels using scikit-learn.
# NOTE(review): `y1` is not defined in any visible cell. Its recorded display
# shows numeric values (1.0) while `prediction` holds house-name strings, so no
# pair can match and the recorded score of 0.0 says nothing about the model.
# Score against true house names for the same rows instead -- confirm where
# those labels come from.
# NOTE(review): this import sits mid-notebook; the other imports are in the
# first cell.
from sklearn.metrics import accuracy_score
y_pred = prediction
y_true = y1
score = accuracy_score(y_true, y_pred)

#score2 = accuracy_score(y_true, y_pred, normalize=False)
score

0.0