In [186]:
import pandas as pd
from colors import colors
from matplotlib import pyplot as plt
import math
import numpy as np

In [187]:
# Drop all non-numeric columns but "Hogwarts House"
def drop_columns(df):
    df.drop(columns=['Index', 'Hogwarts House'], inplace=True)
    for column in df:
        if df[column].dtype.kind in 'biufc': # https://stackoverflow.com/a/38185438
            continue
        else:
            df.drop(columns=[column], inplace=True)

In [188]:
# Calculate probability matrix
def hypothesis(theta, X):
    return 1 / (1 + np.exp(-(np.dot(theta, X.T))))

In [189]:
# Read test dataset and preprocess it
try:
    df = pd.read_csv("datasets/dataset_test.csv")
except:
    print(f"{colors().RED}Error: could not read dataset test file{colors().END}")
    exit()
drop_columns(df)

In [190]:
# Read theta file
try:
    theta = np.loadtxt('theta.txt')
except:
    print(f"{colors().RED}Error: Could not read theta.txt file. Train the model first!{colors().END}")
    exit()
# Fix overflow
theta = theta.astype(np.double)

In [191]:
# Normalize data
for column in df:
    if df[column].dtype.kind not in 'biufc':
        continue
    max_norm = df[column].max()
    min_norm = df[column].min()

    # Put column mean for NaN values
    for i in range(len(df)):
        if column != "Hogwarts House":
            if math.isnan(df.iloc[i, df.columns.get_loc(column)]) == True:
                df.iloc[i, df.columns.get_loc(column)] = float(df[column].mean())
    # Perform min-max normalization
    for i in range(len(df)):
        df.iloc[i, df.columns.get_loc(column)] = (df.iloc[i, df.columns.get_loc(column)] - min_norm) / (max_norm - min_norm)

In [192]:
X = df
# Fix overflow
X = X.astype(np.double)

In [193]:
X = pd.concat([pd.Series(1, index=X.index, name='bias'), X], axis=1)
X

Unnamed: 0,bias,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,1,0.389736,0.896002,0.656189,0.103998,0.963781,0.341390,0.211814,0.649164,0.724280,0.331983,0.534007,0.454333,0.304965
1,1,0.428703,0.258180,0.653352,0.741820,0.865670,0.789595,0.813900,0.777457,0.731072,0.664969,0.567509,0.843836,0.275291
2,1,0.409272,0.671351,0.180829,0.328649,0.762614,0.207635,0.771464,0.092462,0.152850,0.207042,0.201422,0.202975,0.806832
3,1,0.601788,0.723300,0.756481,0.487975,0.690658,0.298966,0.311040,0.570680,0.706951,0.399849,0.664154,0.468068,0.310764
4,1,0.478457,0.652242,0.693237,0.347758,0.778519,0.298687,0.190496,0.573330,0.748292,0.268872,0.489287,0.468616,0.320225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,1,0.434008,0.148580,0.208612,0.851420,0.261154,0.226977,0.211341,0.810976,0.728972,0.795131,0.782324,0.338630,0.114168
396,1,0.496870,0.857824,0.849035,0.142176,0.683684,0.413742,0.000000,0.632857,0.666977,0.068191,0.295692,0.383515,0.300481
397,1,0.603374,0.654496,0.770531,0.345504,0.739441,0.128387,0.327003,0.631533,0.843696,0.526223,0.096201,0.468165,0.339876
398,1,0.540753,0.045616,0.848040,0.954384,0.720260,0.784233,0.844156,0.822142,0.700015,0.598974,0.342673,0.933878,0.294049


In [194]:
theta = pd.DataFrame(theta)
hypothesis(theta.iloc[:,3], X)

array([9.98261311e-01, 3.02356430e-03, 2.61772132e-03, 9.15922778e-01,
       9.79791178e-01, 1.92430382e-02, 6.48440361e-04, 9.97081970e-01,
       1.66752572e-03, 9.67554091e-01, 9.99806335e-01, 3.14772852e-03,
       1.54551154e-03, 1.25259719e-02, 1.01585662e-02, 9.98807194e-01,
       7.10756390e-03, 2.69364557e-02, 3.24048308e-03, 9.97592058e-01,
       9.98601731e-01, 7.02753870e-03, 5.60599846e-03, 4.93326316e-04,
       9.69560080e-01, 9.80478061e-01, 1.71334911e-04, 4.47940141e-02,
       6.46377909e-03, 5.92824870e-04, 9.96755744e-01, 9.26065909e-01,
       5.08593531e-03, 9.98115481e-01, 9.95136794e-01, 6.99568750e-02,
       1.75371555e-02, 1.09271911e-03, 3.69858965e-03, 7.11719360e-04,
       6.01529297e-04, 6.21429236e-03, 9.94470866e-01, 6.84114666e-03,
       9.99011516e-01, 9.81665738e-03, 6.91908582e-03, 7.18048412e-02,
       4.20402439e-03, 3.52514456e-03, 1.70037100e-02, 9.97321410e-01,
       1.80868230e-03, 7.72647426e-03, 9.96774620e-01, 1.27974995e-02,
      

In [195]:
prob_matrix = []
for i in range(0, 4):
    tmp_theta = pd.DataFrame(theta)
    h = hypothesis(tmp_theta.iloc[:,i], X)
    prob_matrix.append(h)
prob_matrix=pd.DataFrame(prob_matrix)
prob_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
0,0.00759,0.990232,0.023799,0.021155,0.025999,0.004922,0.995447,0.002109,0.997914,0.032282,...,0.027012,0.013203,0.029976,0.988713,0.99448,0.015438,0.009055,0.013169,0.999013,0.985129
1,0.000313,0.004107,0.000641,0.007743,0.003604,0.998997,0.004042,0.002014,0.001452,0.003832,...,0.976808,0.000552,0.000753,0.013403,0.00014,0.994284,0.000762,0.008992,0.011411,0.001849
2,0.020155,0.008317,0.996796,0.007935,0.008388,0.000573,0.021769,0.011769,0.015691,0.007921,...,0.003172,0.006751,0.015495,0.039399,0.023836,0.001028,0.004514,0.004111,0.001977,0.005074
3,0.998261,0.003024,0.002618,0.915923,0.979791,0.019243,0.000648,0.997082,0.001668,0.967554,...,0.006271,0.998154,0.993633,0.000203,0.015013,0.009011,0.999613,0.983472,0.000735,0.191985


In [196]:
prediction_array = []
for col in range(0,400):
    max_prob_index = 0
    i = 0
    for row in range(0,4):
        if output[col][row] > output[col][max_prob_index]:
            max_prob_index = row
    match max_prob_index:
        case 0:
            prediction_array.append([col, "Ravenclaw"])
        case 1:
            prediction_array.append([col, "Slytherin"])
        case 2:
            prediction_array.append([col, "Gryffindor"])
        case 3:
            prediction_array.append([col, "Hufflepuff"])

In [197]:
prediction = pd.DataFrame(prediction_array)
prediction.columns = ['Index','Hogwarts House']
prediction

Unnamed: 0,Index,Hogwarts House
0,0,Hufflepuff
1,1,Hufflepuff
2,2,Hufflepuff
3,3,Hufflepuff
4,4,Hufflepuff
...,...,...
395,395,Ravenclaw
396,396,Hufflepuff
397,397,Hufflepuff
398,398,Ravenclaw


In [198]:
prediction.to_csv('houses.csv',index=False)