In [25]:
import numpy as np
import pandas as pd
import math
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.model_selection import train_test_split
from scipy.stats import binom

In [26]:
binary = ["school", "sex", "address", "famsize", "Pstatus", "schoolsup", "famsup", "paid", "activities", "nursery", "higher", "internet", "romantic"]
nominal = ["Mjob", "Fjob", "reason", "guardian", "age", "Medu", "Fedu", "traveltime", "studytime", "failures", "famrel", "freetime", "goout", "Dalc", "Walc", "health"]

In [27]:
df = pd.read_csv("student/student-mat.csv", delimiter=';', header=0)
df = df.drop("G1", axis=1)
df = df.drop("G2", axis=1)
df = df.drop("absences", axis=1)

df.loc[df["G3"] <= 12, "G3"] = 1
df.loc[df["G3"] > 12, "G3"] = 0

for label in binary:
    df[label] = df[label].replace(to_replace=df[label].iloc[0],value=0)
    for j in range(len(df[label])):
        if(df[label].iloc[j] != df[label].iloc[0]):
            df[label] = df[label].replace(to_replace=df[label].iloc[j], value=1)

df.head(5)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,G3
0,0,0,18,0,0,0,4,4,at_home,teacher,...,0,0,0,4,3,4,1,1,3,1
1,0,0,17,0,0,1,1,1,at_home,other,...,0,1,0,5,3,3,1,1,3,1
2,0,0,15,0,1,1,1,1,at_home,other,...,0,1,0,4,3,2,2,3,3,1
3,0,0,15,0,0,1,4,2,health,services,...,0,1,1,3,2,2,1,1,5,0
4,0,0,16,0,0,1,3,3,other,other,...,0,0,0,4,3,2,1,2,5,1


In [28]:
train, test = train_test_split(df, test_size=0.15)

In [29]:
def count_occurences(greater_than, less_than, data_type):
    great = {}
    less = {}
    for label in data_type:
        great[label] = greater_than[label].value_counts()
        less[label] = less_than[label].value_counts()
    return great, less

In [30]:
def calculate_probabilities(greater_than, less_than, data_type):
    G_, L_ = count_occurences(greater_than, less_than, data_type)
    greater_prob = {}
    less_prob = {}
    for label in data_type:
        g = {}
        l = {}

        for i in range(len(G_[label])):
            g[G_[label].index.values[i]] = (G_[label].iloc[i]/G_[label].sum())
        for i in range(len(L_[label])):
            l[L_[label].index.values[i]] = (L_[label].iloc[i]/L_[label].sum())
        greater_prob[label] = g
        less_prob[label] = l
    return greater_prob, less_prob

In [31]:
def split_by_val(x):
    greater_than = x[x.G3 == 0]
    less_than = x[x.G3 == 1]
    return greater_than, less_than, len(greater_than), len(less_than)

In [32]:
training_sets = []
testing_sets = []
counted_sets = []
accuracy = []

for i in range(10):
    train_fold, test_fold = train_test_split(train, test_size=0.15)

    training_sets.append(train_fold)
    testing_sets.append(test_fold)
    
    class_G, class_L, count_G, count_L = split_by_val(train_fold)
    
    nominal_G, nominal_L = calculate_probabilities(class_G, class_L, nominal)
    
    binary_G, binary_L = calculate_probabilities(class_G, class_L, binary)
    
    p0 = count_G / (count_G + count_L)
    p1 = 1 - p0

    class_G = np.zeros(len(test_fold)) + math.log(p0)
    class_L = np.zeros(len(test_fold)) + math.log(p1)
    
    for label in binary:
        binary_vector = []
        for i in range(len(test_fold[label])):
            if test_fold[label].iloc[i] not in binary_G[label]:
                binary_vector.append(0)
            else:
                binary_vector.append(math.log(binary_G[label][test_fold[label].iloc[i]]))
        class_G += binary_vector
        binary_vector = []
        for i in range(len(test_fold[label])):
            if test_fold[label].iloc[i] not in binary_L[label]:
                binary_vector.append(0)
            else:
                binary_vector.append(math.log(binary_L[label][test_fold[label].iloc[i]]))
        class_L += binary_vector
        
                
    for label in nominal:
        nominal_vector = []
        for i in range(len(test_fold[label])):
            if test_fold[label].iloc[i] not in nominal_G[label]:
                nominal_vector.append(0)
            else:
                nominal_vector.append(math.log(nominal_G[label][test_fold[label].iloc[i]]))
        class_G += nominal_vector
        nominal_vector = []
        for i in range(len(test_fold[label])):
            if test_fold[label].iloc[i] not in nominal_L[label]:
                nominal_vector.append(0)
            else:
                nominal_vector.append(math.log(nominal_L[label][test_fold[label].iloc[i]]))
        class_L += nominal_vector
            
    predictions = []
    
    for class_G, class_L in zip(class_G, class_L):
        if class_G > class_L:
            predictions.append(0)
        else:
            predictions.append(1)
            
    correct = 0
    for pred, actual in zip(predictions, test_fold["G3"]):
        if pred == actual:
            correct+=1
    accuracy.append(correct / len(predictions))

In [36]:
for i in range(len(accuracy)):
    print("Accuracy for fold ", i+1, accuracy[i])

Accuracy for fold  1 0.6470588235294118
Accuracy for fold  2 0.7058823529411765
Accuracy for fold  3 0.6862745098039216
Accuracy for fold  4 0.7058823529411765
Accuracy for fold  5 0.7843137254901961
Accuracy for fold  6 0.7254901960784313
Accuracy for fold  7 0.7058823529411765
Accuracy for fold  8 0.8235294117647058
Accuracy for fold  9 0.803921568627451
Accuracy for fold  10 0.5686274509803921


In [34]:
accuracy = np.asarray(accuracy)
mean_testing = accuracy.mean()
std_testing = accuracy.std()
print("Mean accuracy over 10 folds: ", mean_testing)
print("STD of accuracy over 10 folds: ", std_testing)

Mean accuracy over 10 folds:  0.7156862745098038
STD of accuracy over 10 folds:  0.07191027772898921
