# 

# Predictions

## Post-training analysis

### Training dataset

Loading the training dataset and keeping aside the column for real outcome.

In [None]:
import matplotlib as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Read the training set and set the real house labels aside so they can be
# compared with the model's predictions further down.
target = 'Hogwarts House'
filepath = '../datasets/dataset_train.csv'
df = pd.read_csv(filepath)
df_real_class = df[target]
df_real_class.head()

Loading weights - from the last training.
model_features : 
    Features used for training, after removing the intercept label

In [None]:
# Weights written by the last training run. The first column carries the row
# labels; its first entry (the intercept label) is skipped, which leaves the
# names of the features the model uses.
weights_path = '../logistic_reg_model/gradient_descent_weights.csv'
model_weights = pd.read_csv(weights_path)
features_label = model_weights.columns[0]
model_features = model_weights[features_label].iloc[1:].to_list()
model_weights.head(10)

Dataset for testing - keeping only useful features

In [None]:
# Keep only the model's feature columns, in the order of the weight rows.
df_test = df.loc[:, model_features]
df_test.head()

Standardize (z-score method), replace NaNs with zeros (which is the mean after standardization)

In [None]:
def standardize(arr: np.ndarray):
    """Return the z-score of ``arr``: ``(arr - mean) / std``.

    The mean and the population standard deviation (ddof=0) are computed with
    NumPy's NaN-aware reductions, so a missing value in a plain ndarray no
    longer turns the whole result into NaN. NaN entries stay NaN in the
    output; the caller decides how to fill them.

    Args:
        arr: 1-D numeric data (ndarray or pandas Series), may contain NaN.

    Returns:
        An object of the same kind and shape as ``arr`` holding the z-scores.
        When the standard deviation is 0 (constant input) the data is only
        centred, giving zeros instead of a 0/0 division.
    """
    mean = np.nanmean(arr)
    std = np.nanstd(arr)
    if std == 0:
        # Constant feature: every value equals the mean, so the z-score is 0.
        return arr - mean
    return (arr - mean) / std

# z-score every feature column, then replace the missing values with 0,
# which is the column mean once the data is standardized.
df_test_std = df_test.agg(lambda column: standardize(column)).fillna(0)
df_test_std.shape

Preparing test dataset and weights for numpy dot product

In [None]:
# Design matrix: a leading column of ones (multiplied by the intercept row of
# the weights) followed by the standardized features.
ones = np.ones((len(df_test_std), 1), dtype=float)
x_test = np.hstack((ones, df_test_std.to_numpy()))
# Weight matrix: everything except the label column.
weights = model_weights.drop(columns=features_label).to_numpy()
print(x_test.dtype, x_test.shape)
print(weights.dtype, weights.shape)

numpy dot product 

In [None]:
def sigmoid(arr:np.ndarray):
    """Apply the logistic function 1 / (1 + e^(-x)) element-wise."""
    exp_neg = np.exp(-arr)
    return 1 / (exp_neg + 1)

# One probability column per classifier, i.e. per weight column that follows
# the label column.
classifiers = list(model_weights.columns[1:])
z = x_test @ weights
h = sigmoid(z)
y_pred_proba = pd.DataFrame(h, columns=classifiers)
y_pred_proba.head()


As a function that returns a DataFrame with the probabilities

In [None]:
def sigmoid(arr:np.ndarray):
    """Apply the logistic function ``1 / (1 + exp(-arr))`` element-wise."""
    return 1 / (1 + np.exp(-arr))


def predict_proba(df_test_std: pd.DataFrame, model_weights: pd.DataFrame) -> pd.DataFrame:
    """Compute the per-classifier probabilities for every row of the input.

    Args:
        df_test_std: Standardized features, one column per model feature, in
            the same order as the feature rows of ``model_weights``.
        model_weights: Weight table whose first column holds the row labels
            (intercept first) and whose remaining columns hold one weight
            vector per classifier.

    Returns:
        A DataFrame with a default integer index and one column per
        classifier, containing ``sigmoid(X . W)`` where ``X`` is the input
        with a leading column of ones for the intercept.
    """
    # The label column is read from `model_weights` itself rather than from a
    # notebook-level variable, so the function depends only on its arguments.
    label_column = model_weights.columns[0]
    classifiers = model_weights.columns[1:].to_list()
    weights = model_weights.drop(columns=label_column).to_numpy()

    features = np.asarray(df_test_std)
    ones = np.ones((len(features), 1), dtype=float)
    x_test = np.concatenate((ones, features), axis=1)

    h = sigmoid(np.dot(x_test, weights))
    return pd.DataFrame(h, columns=classifiers)

# Same computation as the step-by-step cells above, through the helper.
y_pred_proba = predict_proba(df_test_std=df_test_std, model_weights=model_weights)
y_pred_proba.head(n=10)

In [None]:
# Highest class probability per row. Only the classifier columns are reduced,
# so the cell can be re-run after the text outcome columns have been added to
# the frame (a row-wise max over mixed text/number columns fails).
y_pred_proba['Probability'] = y_pred_proba[classifiers].max(axis=1)

Heatmap of the Predicted output probability on a slice of dataset

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn

# Plot only the numeric probability columns so the cell still works when it is
# re-run after the text outcome columns exist in the frame.
heatmap_columns = classifiers + ['Probability']
sns.heatmap(y_pred_proba.loc[500:600, heatmap_columns], cmap='YlGnBu')

Comparing real and predicted outcome


In [None]:
# Predicted house = classifier with the highest probability. The arg-max is
# restricted to the classifier columns: over the whole frame it would also see
# the derived 'Probability' column (always tied with the winner, so the result
# relied on first-occurrence tie-breaking) and, on a re-run, the text columns.
y_pred_proba['Predicted outcome'] = y_pred_proba[classifiers].idxmax(axis=1)
y_pred_proba['Real outcome'] = df_real_class.to_list()
# 1 when the prediction matches the real house, 0 otherwise.
accurate_pred = np.where(y_pred_proba['Predicted outcome'] == y_pred_proba['Real outcome'], 1, 0)
y_pred_proba['Accurate pred.'] = accurate_pred
y_pred_proba.loc[500:510].head(10)

Inexact predictions still have a high probability

In [None]:
# Rows where the predicted house differs from the real one.
y_pred_proba.loc[y_pred_proba['Accurate pred.'].eq(0)]

In [None]:
# Number of correct (True) and incorrect (False) predictions.
y_pred_proba['Accurate pred.'].eq(1).value_counts()

In [None]:
# Share of correct (1) and incorrect (0) predictions.
y_pred_proba['Accurate pred.'].value_counts(normalize=True)

In [None]:
# Accuracy is the mean of the 0/1 flag. Unlike indexing the normalized value
# counts with label 1, this does not raise a KeyError when no prediction is
# correct (it yields 0.0 instead).
accuracy = y_pred_proba['Accurate pred.'].mean()
print(f'Accuracy for the training dataset: {accuracy * 100}%')

### Testing dataset


In [None]:
# Test features, plus the separate file that holds their real houses.
target = 'Hogwarts House'
filepath = '../datasets/dataset_test.csv'
truthpath = '../datasets/dataset_truth.csv'
df2 = pd.read_csv(filepath)
truth = pd.read_csv(truthpath)
df_real_class = truth[target]
df_real_class.head()

In [None]:
df_test = df2[model_features]
# Scale the test features with the TRAINING set's mean and population std
# (ddof=0, the same definition `standardize` uses) instead of the test set's
# own statistics: inputs must be on the scale the weights were fitted on.
# NOTE(review): assumes training standardized with the training-set
# statistics, as the training section of this notebook does; confirm against
# the training script.
train_features = df[model_features]
train_mean = train_features.mean()
# A constant training column has std 0; mapping it to NaN makes the division
# yield NaN (filled with 0 below) rather than +/-inf.
train_std = train_features.std(ddof=0).replace(0, np.nan)
df_test_std = ((df_test - train_mean) / train_std).fillna(0)
df_test_std.shape

In [None]:
# `predict_proba` and `sigmoid` are already defined in the training-set
# section; they were redefined verbatim here. Reusing the existing definitions
# keeps a single source of truth for the prediction code.
y_pred_proba = predict_proba(df_test_std, model_weights)
y_pred_proba.head(10)

In [None]:
# Highest class probability per row, computed and plotted over the numeric
# probability columns only, so the cell can be re-run after the text outcome
# columns have been added to the frame.
y_pred_proba['Probability'] = y_pred_proba[classifiers].max(axis=1)
sns.heatmap(y_pred_proba.loc[:400, classifiers + ['Probability']], cmap='YlGnBu')


Is there any probability < 0.5?

In [None]:
# Per-column count of the rows whose best class probability is below the
# threshold.
threshold = 0.5
low_confidence = y_pred_proba['Probability'].lt(threshold)
y_pred_proba.loc[low_confidence].count()

In [None]:
# Predicted house = classifier with the highest probability. The arg-max is
# restricted to the classifier columns: over the whole frame it would also see
# the derived 'Probability' column (always tied with the winner, so the result
# relied on first-occurrence tie-breaking) and, on a re-run, the text columns.
y_pred_proba['Predicted outcome'] = y_pred_proba[classifiers].idxmax(axis=1)
y_pred_proba['Real outcome'] = df_real_class.to_list()
# 1 when the prediction matches the real house, 0 otherwise.
accurate_pred = np.where(y_pred_proba['Predicted outcome'] == y_pred_proba['Real outcome'], 1, 0)
y_pred_proba['Accurate pred.'] = accurate_pred
y_pred_proba.loc[100:150].head(20)

Inexact prediction:

In [None]:
# Rows of the test set where the predicted house differs from the real one.
inexact = y_pred_proba.loc[y_pred_proba['Accurate pred.'].eq(0)]
inexact

Getting students description with the index

In [None]:
# The index values of the misclassified rows are used as row positions in the
# original test frame.
df2.iloc[list(inexact.index)]

In [None]:
# Share of correct (1) and incorrect (0) predictions on the test set.
y_pred_proba['Accurate pred.'].value_counts(normalize=True)

In [None]:
# Accuracy is the mean of the 0/1 flag. Unlike indexing the normalized value
# counts with label 1, this does not raise a KeyError when no prediction is
# correct (it yields 0.0 instead).
accuracy = y_pred_proba['Accurate pred.'].mean()
print(f'Accuracy for the testing dataset: {accuracy * 100}%')

### Results for testing dataset :

Accuracy: 99.0%   
excluded_features : "Arithmancy", "Defense Against the Dark Arts", "Care of Magical Creatures"
learning_rate = 0.1
epochs=1000

### Improvements

#### Decision Boundary:
Classification of our features by selecting probabilities above 0.5
If hθ(x) ≥ 0.5 → y=1 and If hθ(x) < 0.5 → y=0

In [None]:
# Share of rows whose best class probability reaches the decision threshold.
threshold = 0.5
y_pred_proba = predict_proba(df_test_std, model_weights)
above_threshold = y_pred_proba.max(axis=1).ge(threshold)
above_threshold.value_counts(normalize=True)