In [164]:
import pandas as pd
from colors import colors
from matplotlib import pyplot as plt
import math
import numpy as np

In [165]:
# Drop all non-numeric columns but "Hogwarts House"
def drop_columns(df):
    df.drop(columns=['Index', 'Hogwarts House'], inplace=True)
    for column in df:
        if df[column].dtype.kind in 'biufc': # https://stackoverflow.com/a/38185438
            continue
        else:
            df.drop(columns=[column], inplace=True)

In [166]:
# Calculate probability matrix
def hypothesis(theta, X):
    return 1 / (1 + np.exp(-(np.dot(theta, X.T))))

In [167]:
# Read test dataset and preprocess it
try:
    df = pd.read_csv("datasets/dataset_test.csv")
except:
    print(f"{colors().RED}Error: could not read dataset test file{colors().END}")
    exit()
drop_columns(df)

In [177]:
# Read theta file
try:
    theta = np.loadtxt('theta.txt')
except:
    print(f"{colors().RED}Error: Could not read theta.txt file. Train the model first!{colors().END}")
    exit()
# Fix overflow
theta = theta.astype(np.double)

In [178]:
# Normalize data
for column in df:
    if df[column].dtype.kind not in 'biufc':
        continue
    max_norm = df[column].max()
    min_norm = df[column].min()

    for i in range(len(df)):
        df.iloc[i, df.columns.get_loc(column)] = (df.iloc[i, df.columns.get_loc(column)] - min_norm) / (max_norm - min_norm)
        # Make missing data zero?
        if math.isnan(df.iloc[i, df.columns.get_loc(column)]) == True:
            df.iloc[i, df.columns.get_loc(column)] = df[column].mean() # make this MEAN like in TRAIN

In [179]:
X = df
# Fix overflow
X = X.astype(np.double)

In [180]:
X = pd.concat([pd.Series(1, index=X.index, name='bias'), X], axis=1)
X

Unnamed: 0,bias,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,1,0.000008,0.017860,0.488045,0.415536,0.286029,0.996934,0.000477,0.230151,0.000880,0.064747,0.534007,0.454333,0.304965
1,1,0.000009,0.005146,0.485935,0.831588,0.256912,0.999020,0.001832,0.275635,0.000888,0.129690,0.567509,0.843836,0.275291
2,1,0.000008,0.013382,0.134493,0.562075,0.226327,0.996311,0.001736,0.032781,0.000186,0.040380,0.201422,0.202975,0.806832
3,1,0.000012,0.014418,0.562638,0.009557,1.000000,0.996736,0.000700,1.000000,0.000859,0.077983,0.664154,0.468068,0.310764
4,1,1.000000,0.013001,0.515600,0.574541,0.231048,0.996735,0.000429,0.203265,0.000909,0.052438,0.489287,0.468616,0.320225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,1,0.000009,0.002962,0.155157,0.903081,0.077505,0.996401,0.000476,0.287519,0.000886,0.155075,0.782324,0.338630,0.114168
396,1,0.000010,0.017099,0.631476,0.440439,0.202903,0.997271,0.000000,0.224369,0.000810,0.013299,0.295692,0.383515,0.300481
397,1,0.000012,0.013046,0.573088,0.573070,0.219450,0.995942,0.000736,0.223900,0.001025,0.102630,0.096201,0.468165,0.339876
398,1,0.000011,0.000909,0.630736,0.970244,0.213757,0.998996,0.001900,0.291477,0.000850,0.116818,0.342673,0.933878,0.294049


In [181]:
theta = pd.DataFrame(theta)
hypothesis(theta.iloc[:,3], X)

array([5.72047404e-03, 4.34769849e-04, 5.34746685e-05, 6.78616443e-01,
       2.33604721e-03, 7.94363948e-05, 2.46909185e-04, 4.45715142e-03,
       3.09803395e-04, 2.40891779e-03, 1.09899808e-02, 2.28309246e-03,
       6.58505152e-05, 7.70080700e-05, 7.98264275e-05, 1.43015976e-01,
       4.20923214e-04, 8.45250698e-05, 4.69319445e-04, 3.93029079e-03,
       3.33331256e-02, 7.96466408e-05, 1.98981781e-04, 3.91165094e-04,
       4.10167565e-02, 3.09757736e-03, 1.64818603e-04, 4.83783369e-03,
       1.33186195e-04, 2.55659236e-04, 4.67921218e-03, 6.56521308e-02,
       8.78062916e-05, 5.50390566e-03, 3.66413639e-03, 1.41579263e-03,
       1.81837439e-04, 3.74152767e-04, 5.39688823e-05, 3.13567510e-04,
       2.12818036e-04, 6.45421982e-05, 3.56429853e-06, 1.24442072e-04,
       3.68997543e-02, 5.03628771e-04, 7.05290259e-05, 1.74558736e-04,
       5.41093576e-04, 4.48033251e-05, 8.83971861e-04, 3.76465385e-03,
       1.91430758e-04, 4.36214731e-04, 4.14627480e-03, 7.31965460e-05,
      

In [182]:
prob_matrix = []
for i in range(0, 4):
    tmp_theta = pd.DataFrame(theta)
    h = hypothesis(tmp_theta.iloc[:,i], X)
    prob_matrix.append(h)
prob_matrix=pd.DataFrame(prob_matrix)
prob_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,0.901465,0.994534,0.387802,0.835057,0.788849,0.757057,0.994589,0.853823,0.996198,0.933094,...,0.814905,0.942724,0.934735,0.990793,0.995853,0.847441,0.903302,0.941841,0.998174,0.995087
1,0.031345,0.062884,0.139717,0.000362,0.116852,0.925957,0.049226,0.042569,0.025796,0.037983,...,0.823408,0.029158,0.057001,0.105404,0.01308,0.850362,0.033249,0.051289,0.054109,0.017321
2,0.10432,0.021287,0.891172,0.039828,0.031621,0.064519,0.039566,0.110864,0.032329,0.07949,...,0.050032,0.059033,0.067816,0.034834,0.037219,0.043872,0.072311,0.075283,0.00979,0.021757
3,0.00572,0.000435,5.3e-05,0.678616,0.002336,7.9e-05,0.000247,0.004457,0.00031,0.002409,...,8.5e-05,0.006445,0.005123,0.000186,0.000856,6.9e-05,0.010457,0.002759,0.00038,0.00226


In [183]:
prediction_array = []
for col in range(0,400):
    max_prob_index = 0
    i = 0
    for row in range(0,4):
        if output[col][row] > output[col][max_prob_index]:
            max_prob_index = row
    match max_prob_index:
        case 0:
            prediction_array.append([col, "Ravenclaw"])
        case 1:
            prediction_array.append([col, "Slytherin"])
        case 2:
            prediction_array.append([col, "Gryffindor"])
        case 3:
            prediction_array.append([col, "Hufflepuff"])

In [184]:
prediction = pd.DataFrame(prediction_array)
prediction.columns = ['Index','Hogwarts House']
prediction

Unnamed: 0,Index,Hogwarts House
0,0,Hufflepuff
1,1,Hufflepuff
2,2,Hufflepuff
3,3,Hufflepuff
4,4,Hufflepuff
...,...,...
395,395,Ravenclaw
396,396,Hufflepuff
397,397,Hufflepuff
398,398,Ravenclaw


In [185]:
prediction.to_csv('houses.csv',index=False)