# Student Alcohol Consumption
Link: https://www.kaggle.com/datasets/uciml/student-alcohol-consumption

In [27]:
import numpy as np
import pandas as pd

# There are two data sets: one for math grades, the other for Portuguese grades.
# The directory is named once so both reads stay in sync if the data is moved.
DATA_DIR = 'Datasets/Student Alcohol Consumption'
math_df = pd.read_csv(DATA_DIR + '/student-mat.csv')
por_df = pd.read_csv(DATA_DIR + '/student-por.csv')

# Shapes noted by the original author: math_df is [395, 33], por_df is [649, 33].
# print(np.shape(math_df))
# print(np.shape(por_df))

The main goal here is to predict the students' grades (both math and Portuguese) using relevant features.

The data set needs to be preprocessed so that its values can be used numerically (e.g. yes/no should be changed to 1/0).

In [28]:
# Move the grade columns [G1, G2, G3] to the end of the data frame so they are easier to process.
grade_cols = ['G1', 'G2', 'G3']
math_df = math_df[[c for c in math_df if c not in grade_cols] + grade_cols]
print(math_df.columns.tolist())

# One-hot encode the columns populated by strings.
# The columns are named explicitly (instead of relying on dtype detection plus a
# positional prefix list), so a missing/renamed column fails loudly with a KeyError.
# dtype=float keeps the whole frame numeric: newer pandas versions emit boolean
# dummies by default, which would turn the feature matrix below into an object array.
feat_to_one_hot_encode = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']
math_df = pd.get_dummies(math_df, columns=feat_to_one_hot_encode, dtype=float)

# Standardize the numeric (non-grade) columns to zero mean and unit variance.
# NOTE: G1 and G2 are deliberately left on their raw scale here, exactly as before.
# NOTE(review): the statistics are computed on the full data set, i.e. before the
# train/validation split — consider fitting them on the training rows only.
feat_to_standardize = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']
numeric_part = math_df[feat_to_standardize]
math_df[feat_to_standardize] = (numeric_part - numeric_part.mean()) / numeric_part.std()

# Turn the frame into ndarrays: every column except G3 is a feature, and G3 (the
# final grade for the subject) is the label.
math_features = math_df.drop(columns='G3').to_numpy(dtype=float)
math_labels = math_df['G3'].to_numpy()

['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']


In [29]:
def generateRandom(data, features, labels, vali_frac=0.2, seed=None):
    """Randomly split features/labels into a training and a validation set.

    The outputs are ndarrays (not data frames) so they can be fed straight
    into the KNN function.

    Parameters
    ----------
    data : object with a ``shape`` attribute
        Only ``data.shape[0]`` (the number of rows) is used.
    features : array-like, one row per sample
    labels : array-like, one entry per sample
    vali_frac : float, default 0.2
        Fraction of the rows placed in the validation set (rounded down).
    seed : int or None, default None
        When given, the permutation is drawn from a dedicated seeded generator
        so the split is reproducible. When None, the global ``np.random``
        state is used, exactly as before.

    Returns
    -------
    train_features, train_labels, vali_features, vali_labels : ndarray

    Raises
    ------
    ValueError
        If ``vali_frac`` is outside [0, 1] or the row counts of ``data``,
        ``features`` and ``labels`` disagree.
    """
    features = np.asarray(features)
    labels = np.asarray(labels)
    n_rows = data.shape[0]

    if not 0 <= vali_frac <= 1:
        raise ValueError("vali_frac must be between 0 and 1, got {}".format(vali_frac))
    if features.shape[0] != n_rows or labels.shape[0] != n_rows:
        raise ValueError(
            "data, features and labels must have the same number of rows "
            "({}, {}, {})".format(n_rows, features.shape[0], labels.shape[0])
        )

    # A private RandomState keeps a seeded split from disturbing global state.
    rng = np.random if seed is None else np.random.RandomState(seed)
    perm_idx = rng.permutation(n_rows)

    vali_num = int(n_rows * vali_frac)
    vali_idx, train_idx = perm_idx[:vali_num], perm_idx[vali_num:]

    return features[train_idx], labels[train_idx], features[vali_idx], labels[vali_idx]

# Draw one random train/validation split of the math data.
(math_train_features,
 math_train_labels,
 math_vali_features,
 math_vali_labels) = generateRandom(data=math_df, features=math_features, labels=math_labels)

In [30]:
def KNN(train_features, train_labels, test_features, k=10):
    """Predict a label for each test row by majority vote of its k nearest neighbours.

    Distances are Euclidean. When several labels share the highest vote count,
    the label whose closest supporter is nearest to the query wins.

    Parameters
    ----------
    train_features : array-like, shape (n_train, n_features)
    train_labels : array-like, shape (n_train,)
    test_features : array-like, shape (n_test, n_features)
    k : int, default 10
        Number of neighbours to consult. Values larger than ``n_train`` are
        clamped to ``n_train``.

    Returns
    -------
    ndarray, shape (n_test,)
        Predicted labels, with the same dtype as ``train_labels``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or the training set is empty.
    """
    # Coerce to float so mixed/object-typed inputs do not break the arithmetic.
    train_features = np.asarray(train_features, dtype=float)
    test_features = np.asarray(test_features, dtype=float)
    train_labels = np.asarray(train_labels)

    n_train = train_features.shape[0]
    if k < 1:
        raise ValueError("k must be at least 1, got {}".format(k))
    if n_train == 0:
        raise ValueError("train_features must contain at least one row")
    k = min(k, n_train)

    vali_pred = []
    for x in test_features:
        distances = np.sqrt(np.sum((x - train_features) ** 2, axis=1))
        # The k-th smallest element sits at index k - 1; partitioning at k
        # would be out of bounds when k == n_train.
        topk_idx = np.argpartition(distances, k - 1)[:k]
        # argpartition leaves the k winners unordered; sort them nearest-first
        # so that vote ties are resolved in favour of the closer neighbour.
        topk_idx = topk_idx[np.argsort(distances[topk_idx], kind='stable')]
        topk_labels = list(train_labels[topk_idx])
        # max() returns the first label reaching the highest count.
        vali_pred.append(max(topk_labels, key=topk_labels.count))
    return np.array(vali_pred, dtype=train_labels.dtype)

# A prediction counts as correct when it is within `margin` grade points of the
# true label (for when the guess is close, but not quite exact).
margin = 1
accuracy_by_k = {}

for k_tuner in range(5, 16):
    # Grabbing predictions
    math_vali_pred = KNN(math_train_features, math_train_labels, math_vali_features, k=k_tuner)

    # Vectorized form of: label - margin <= prediction <= label + margin
    count = int(np.count_nonzero(np.abs(math_vali_pred - math_vali_labels) <= margin))
    accuracy_by_k[k_tuner] = count / math_vali_features.shape[0]

    print("K: {}, Accuracy: {}".format(k_tuner, str(accuracy_by_k[k_tuner])))

# Best-scoring k on this validation split (the smallest k wins ties).
best_k = max(accuracy_by_k, key=accuracy_by_k.get)


K: 5, Accuracy: 0.6708860759493671
K: 6, Accuracy: 0.6582278481012658
K: 7, Accuracy: 0.6835443037974683
K: 8, Accuracy: 0.6835443037974683
K: 9, Accuracy: 0.6708860759493671
K: 10, Accuracy: 0.6708860759493671
K: 11, Accuracy: 0.6835443037974683
K: 12, Accuracy: 0.6582278481012658
K: 13, Accuracy: 0.6455696202531646
K: 14, Accuracy: 0.6582278481012658
K: 15, Accuracy: 0.6582278481012658


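Standardizing certain columns and performing one-hot encoding improved the accuracy (a figure of > 90% was noted at one point). Note, however, that the run shown above reports only about 65–68% of predictions within ±1 grade point; the train/validation split is random, so the number varies between runs and the > 90% claim should be re-verified.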

In [None]:
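# Column order of the feature matrix (G3 is dropped because it is the label).
# Listed in groups to make it easier to build the hand-written test samples.
# age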
# Medu
# Fedu
# traveltime
# studytime

# failures
# famrel
# freetime
# goout
# Dalc

# Walc
# health
# absences
# G1
# G2

# school_GP
# school_MS
# sex_F
# sex_M
# address_R
# address_U
# famsize_GT3
# famsize_LE3
# Pstatus_A
# Pstatus_T
# Mjob_at_home
# Mjob_health
# Mjob_other
# Mjob_services
# Mjob_teacher
# Fjob_at_home
# Fjob_health
# Fjob_other
# Fjob_services
# Fjob_teacher
# reason_course
# reason_home
# reason_other
# reason_reputation
# guardian_father
# guardian_mother
# guardian_other
# schoolsup_no
# schoolsup_yes
# famsup_no
# famsup_yes
# paid_no
# paid_yes
# activities_no
# activities_yes
# nursery_no
# nursery_yes
# higher_no
# higher_yes
# internet_no
# internet_yes
# romantic_no
# romantic_yes

In [31]:
# Baseline hand-written sample, in feature-column order: 13 standardized numeric
# features, then the raw G1 and G2 grades, then the one-hot encoded columns.
_BASE_SAMPLE = [
    -1.32926782,  1.14240684, -0.47924897, -0.64243471,  1.14932149,
    -0.44937373, -1.05313638, -1.23685052, -0.99603207, -0.54001379,
    -1.00251779,  1.0397512 , -0.46342827, 15., 14.,
     1.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,
     0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
     0.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,
     1.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,
     1.,  0.,  1.,
]

# Positions of the features that are probed, and the values each one is set to.
_STUDYTIME_IDX = 4   # Study Time
_HEALTH_IDX = 11     # Health
_ABSENCES_IDX = 12   # Absences
_PROBE_VALUES = [-1, 0, 1]


def _vary_feature(base, col_idx, values):
    """Return one copy of `base` per value, with column `col_idx` set to that value."""
    rows = []
    for value in values:
        row = list(base)
        row[col_idx] = value
        rows.append(row)
    return rows


# Nine samples: three per probed feature, all other columns held at the baseline.
testSample = (
    _vary_feature(_BASE_SAMPLE, _STUDYTIME_IDX, _PROBE_VALUES)
    + _vary_feature(_BASE_SAMPLE, _HEALTH_IDX, _PROBE_VALUES)
    + _vary_feature(_BASE_SAMPLE, _ABSENCES_IDX, _PROBE_VALUES)
)

In [32]:
# k used for the hand-written samples. This was previously read from the leaked
# loop variable `k_tuner`, i.e. the last value tried by the tuning loop (15).
# Naming it here removes the dependency on leftover loop state.
SAMPLE_K = 15
pred_sample = KNN(math_train_features, math_train_labels, np.array(testSample), k=SAMPLE_K)
pred_sample

array([15, 14, 14, 14, 15, 14, 14, 14, 15], dtype=int64)