In [99]:
import pandas as pd
from colors import colors
from matplotlib import pyplot as plt
import math
import numpy as np

In [100]:
def drop_columns(df):
    """Strip a dataset down to its numeric features plus the house label.

    The frame is modified in place. The 'Index' column is removed when it is
    present, every column whose dtype is not numeric is removed, and
    'Hogwarts House' is always kept regardless of its dtype.

    Calling this on a frame that has already been processed is a no-op, so
    the cell that uses it can safely be re-run.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to prune (mutated in place).

    Returns
    -------
    None
    """
    numeric_kinds = 'biufc'  # https://stackoverflow.com/a/38185438

    # Decide what goes before touching the frame, so nothing is dropped
    # while the columns are being walked.
    unwanted = [
        column
        for column in df.columns
        if column == 'Index'
        or (column != 'Hogwarts House'
            and df[column].dtype.kind not in numeric_kinds)
    ]
    df.drop(columns=unwanted, inplace=True)

In [103]:
def hypothesis(theta, X):
    """Evaluate the logistic (sigmoid) function of theta . X^T.

    The result is shifted down by 1e-7, so the returned values are the
    sigmoid outputs minus that constant.

    Parameters
    ----------
    theta : array-like
        Weight vector passed as the first operand of ``np.dot``.
    X : array-like
        Samples; its transpose is the second operand of ``np.dot``.

    Returns
    -------
    The element-wise sigmoid of ``np.dot(theta, X.T)`` minus 0.0000001.
    """
    linear_term = np.dot(theta, X.T)
    sigmoid = 1 / (1 + np.exp(-linear_term))
    return sigmoid - 0.0000001

In [104]:
# Read test dataset and preprocess it
try:
    df = pd.read_csv("datasets/dataset_test.csv")
except (OSError, ValueError) as err:
    # OSError covers a missing/unreadable file; ValueError covers pandas'
    # parser and empty-data errors as well as decoding failures. Anything
    # else (e.g. KeyboardInterrupt) is deliberately left to propagate.
    print(f"{colors().RED}Error: could not read dataset test file{colors().END}")
    print(err)
    exit()
# Keep only the numeric feature columns and the house label (in place).
drop_columns(df)

In [105]:
# Read theta file
try:
    theta = np.loadtxt('theta.txt')
except (OSError, ValueError) as err:
    # OSError: file missing or unreadable; ValueError: contents could not be
    # parsed as numbers. Other exceptions are left to propagate.
    print(f"{colors().RED}Error: could not read theta.txt file. Train the model first!{colors().END}")
    print(err)
    exit()

In [106]:
# Normalize data
# Min-max scale every numeric column to [0, 1], one vectorised expression per
# column instead of one .iloc read/write per cell.
# NOTE(review): the min/max used here are taken from this dataset itself;
# confirm that this matches how the features were scaled when theta was fitted.
for column in df:
    if df[column].dtype.kind not in 'biufc': # https://stackoverflow.com/a/38185438
        continue
    max_norm = df[column].max()
    min_norm = df[column].min()

    scaled = (df[column] - min_norm) / (max_norm - min_norm)
    # make missing data zero (this also covers the 0/0 produced by a constant
    # column and a column that is entirely NaN)
    df[column] = scaled.fillna(0)

In [107]:
# Feature matrix: everything except the label column.
X = df.drop(columns=["Hogwarts House"])
# fix overflow
# np.longdouble is the platform's extended-precision float; it is the same
# type as np.float128 where that name exists, but is also defined on
# platforms where np.float128 is not.
X = X.astype(np.longdouble)

In [108]:
# Prepend a constant column of ones named 'bias'. The guard keeps the cell
# idempotent: re-running it will not add a second 'bias' column.
if 'bias' not in X.columns:
    X = pd.concat([pd.Series(1, index=X.index, name='bias'), X], axis=1)
X

Unnamed: 0,bias,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,1,0.389736,0.896002,0.656189,0.103998,0.963781,0.341390,0.211814,0.649164,0.724280,0.331983,0.534007,0.454333,0.304965
1,1,0.428703,0.258180,0.653352,0.741820,0.865670,0.789595,0.813900,0.777457,0.731072,0.664969,0.567509,0.843836,0.275291
2,1,0.409272,0.671351,0.180829,0.328649,0.762614,0.207635,0.771464,0.092462,0.152850,0.207042,0.201422,0.202975,0.806832
3,1,0.601788,0.723300,0.756481,0.000000,0.000000,0.298966,0.311040,0.000000,0.706951,0.399849,0.664154,0.468068,0.310764
4,1,0.000000,0.652242,0.693237,0.347758,0.778519,0.298687,0.190496,0.573330,0.748292,0.268872,0.489287,0.468616,0.320225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,1,0.434008,0.148580,0.208612,0.851420,0.261154,0.226977,0.211341,0.810976,0.728972,0.795131,0.782324,0.338630,0.114168
396,1,0.496870,0.857824,0.849035,0.142176,0.683684,0.413742,0.000000,0.632857,0.666977,0.068191,0.295692,0.383515,0.300481
397,1,0.603374,0.654496,0.770531,0.345504,0.739441,0.128387,0.327003,0.631533,0.843696,0.526223,0.096201,0.468165,0.339876
398,1,0.540753,0.045616,0.848040,0.954384,0.720260,0.784233,0.844156,0.822142,0.700015,0.598974,0.342673,0.933878,0.294049


In [109]:
# Display the parameter array that was loaded from theta.txt.
theta

array([[-0.86109874, -0.08037759, -0.08948608, -0.3005474 ],
       [-0.48395576, -0.04961991, -0.07931385, -0.10831676],
       [-1.36592318, -0.79852511,  0.51525423,  0.90774716],
       [ 0.33541117, -0.80055755, -0.78048331,  0.80676041],
       [ 0.543961  ,  0.71523615, -0.58768908, -1.21875386],
       [ 0.00745874, -1.23848265,  0.23469668,  0.33113939],
       [ 0.98028345, -0.35800334, -0.30826797, -0.6416909 ],
       [ 0.60663899, -0.6040243 ,  0.63643845, -1.14721692],
       [-0.13699574,  0.21477773, -1.04472373,  0.38871347],
       [-0.16575458,  0.2369563 , -1.15511685,  0.43563397],
       [-0.29438448,  0.48570565, -0.45661444, -0.42319896],
       [-0.3649095 , -0.05876129, -0.05914943, -0.12044309],
       [ 1.039717  , -0.55080571, -0.70389817, -0.16427318],
       [-0.52863188, -0.60135093,  0.97097382, -0.52474526]])

In [110]:
# Wrap the parameters in a DataFrame so that a weight vector can be selected
# by column position.
theta = pd.DataFrame(theta)
# Evaluate the hypothesis for the weights in column 3 over every row of X.
column_3_weights = theta.iloc[:, 3]
hypothesis(column_3_weights, X)

array([0.70238563, 0.16036509, 0.22759597, 0.54271238, 0.60004876,
       0.25141583, 0.12657973, 0.67652952, 0.14745786, 0.56001207,
       0.78705068, 0.23228919, 0.17114268, 0.23533352, 0.22683935,
       0.80606716, 0.1917557 , 0.25606262, 0.16167173, 0.68745407,
       0.64603211, 0.21366456, 0.19305123, 0.11412506, 0.45188192,
       0.58757966, 0.09843648, 0.18664841, 0.24642902, 0.12177267,
       0.66701298, 0.58440475, 0.24414623, 0.6850272 , 0.66649848,
       0.24401163, 0.2908879 , 0.1311541 , 0.19643351, 0.12351328,
       0.1293005 , 0.20922781, 0.79701682, 0.22168954, 0.66469823,
       0.19531068, 0.21890394, 0.37263348, 0.17263303, 0.18390521,
       0.20553298, 0.69344731, 0.15107499, 0.19226292, 0.66885283,
       0.22359316, 0.18308294, 0.16537676, 0.63978859, 0.30044008,
       0.24610641, 0.25488963, 0.60180501, 0.65470114, 0.14298454,
       0.60401943, 0.50830743, 0.18747929, 0.61904454, 0.13028463,
       0.16150315, 0.23574182, 0.35991251, 0.18324583, 0.21404

In [128]:
# Evaluate the hypothesis once per column of theta. The resulting frame has
# one row per theta column and one column per row of X.
# The DataFrame view of theta is built once, and the number of weight vectors
# is read from it rather than being fixed at 4.
theta1 = pd.DataFrame(theta)
output = pd.DataFrame(
    [hypothesis(theta1.iloc[:, i], X) for i in range(theta1.shape[1])]
)
output

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,0.152259,0.640366,0.190392,0.173692,0.257599,0.281376,0.686421,0.126327,0.714789,0.228643,...,0.357254,0.171214,0.221716,0.590361,0.629634,0.299836,0.169974,0.200967,0.766779,0.445418
1,0.062818,0.104595,0.077025,0.153122,0.105079,0.630385,0.105084,0.084499,0.0889,0.101872,...,0.465688,0.068733,0.068567,0.194066,0.052264,0.530219,0.079796,0.115198,0.130043,0.087037
2,0.154737,0.05846,0.680612,0.211592,0.127756,0.059531,0.076025,0.137767,0.070538,0.115848,...,0.078481,0.117579,0.287659,0.139479,0.077622,0.05674,0.134864,0.105052,0.039158,0.114623
3,0.702386,0.160365,0.227596,0.542712,0.600049,0.251416,0.12658,0.67653,0.147458,0.560012,...,0.21508,0.685409,0.576214,0.069096,0.207637,0.214647,0.759204,0.584089,0.118386,0.514439


In [133]:
# House label for each row of `output` (row position -> house name).
HOUSES = ["Ravenclaw", "Slytherin", "Gryffindor", "Hufflepuff"]

# For every column of `output` (one per sample) find the row holding the
# largest probability. argmax returns the first maximum, so ties resolve to
# the lowest row position, the same as the strict '>' comparison it replaces.
# The number of samples comes from `output` itself instead of a fixed 400.
best_rows = output.to_numpy().argmax(axis=0)

# One [sample position, house name] pair per sample.
prediction_array = [[col, HOUSES[row]] for col, row in enumerate(best_rows)]

In [134]:
# Turn the [position, house] pairs into a two-column table and label the
# columns in a single chained expression.
prediction = pd.DataFrame(prediction_array).set_axis(
    ['Index', 'Hogwarts House'], axis=1
)
prediction

Unnamed: 0,Index,Hogwarts House
0,0,Hufflepuff
1,1,Ravenclaw
2,2,Gryffindor
3,3,Hufflepuff
4,4,Hufflepuff
...,...,...
395,395,Slytherin
396,396,Hufflepuff
397,397,Hufflepuff
398,398,Ravenclaw


In [136]:
# Write the predictions to houses.csv; index=False leaves out the DataFrame's
# own row labels, so only the 'Index' and 'Hogwarts House' columns are written.
prediction.to_csv('houses.csv',index=False)

In [72]:
# NOTE(review): this cell cannot run as written. Neither `eval_output` nor
# `y1` is assigned anywhere in the visible notebook, and the recorded run of
# this cell ended in "NameError: name 'y1' is not defined". Its execution
# count (72) is also lower than that of the cells above it, so it appears to
# be a leftover from an earlier session. Define both names first or remove it.
from sklearn.metrics import accuracy_score
y_pred = eval_output.T
y_true = y1
score = accuracy_score(y_true, y_pred)

# score2 is computed but not used anywhere below in the visible code.
score2 = accuracy_score(y_true, y_pred, normalize=False)
score

NameError: name 'y1' is not defined