# Student Alcohol Consumption
Link: https://www.kaggle.com/datasets/uciml/student-alcohol-consumption

In [49]:
import numpy as np
import pandas as pd

# There is two data sets, one for math grades, the other for portuguese
math_df = pd.read_csv('Datasets/Student Alcohol Consumption/student-mat.csv')
por_df =  pd.read_csv('Datasets/Student Alcohol Consumption/student-por.csv')

# print(np.shape(math_df))        # Shape [395, 33]
# print(np.shape(portuguese_df))  # Shape [649, 33]

The main goal here is to predict the student's grades (both math and portuguese) using relevant features.

Need to parse the data set so the values can be used (i.e. yes/no should be changed to 1/0).

In [50]:
# Convert all binary features into 1/0

# Moving around the columns for the grade results [G1, G2, G3] to the end of the data frame so it's easier to process.
# cols = math_df.columns.tolist()
# cols = cols[:26] + cols[29:] + cols[26:29]
# math_df = math_df[cols]
math_df = math_df[[c for c in math_df if c not in ['G1', 'G2', 'G3']] + ['G1', 'G2', 'G3']]
print(math_df.columns.tolist())
# print(math_df.iloc[0, :])

feat_to_one_hot_encode = ['school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason', 'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic']

math_df = pd.get_dummies(math_df, prefix=feat_to_one_hot_encode)

#standardize certain columns
feat_to_standardize = ['age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences']
for feat in feat_to_standardize:
    math_df[feat] = (math_df[feat] - math_df[feat].mean()) / math_df[feat].std()

# Turning them into ndarrays
math_features = math_df.iloc[:, :math_df.shape[1] - 3].values
math_labels = math_df.loc[:, 'G3'].values # Chosed G3 because it's the final grade for the subject, but adding in the others isn't difficult
# print(math_df.shape)
# print(math_features.shape)
# print(math_labels.shape)

['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu', 'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime', 'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health', 'absences', 'G1', 'G2', 'G3']


In [51]:
# Given a data frame, generate the training/validation set's features and labels
# The training/validation set are ndarrays, not a data frame since I'm trying to make it work with the KNN function beyond this step
def generateRandom(data, features, labels):
  perm_idx = np.random.permutation(data.shape[0])
  vali_num = int(data.shape[0] * 0.2)
  vali_idx = perm_idx[:vali_num]
  train_idx = perm_idx[vali_num:]
  train_features = math_features[train_idx, :]
  train_labels = math_labels[train_idx]
  vali_features = math_features[vali_idx, :]
  vali_labels = math_labels[vali_idx]
  return train_features, train_labels, vali_features, vali_labels

math_train_features, math_train_labels, math_vali_features, math_vali_labels = generateRandom(math_df, math_features, math_labels)
# print(math_train_features.shape)
# print(math_vali_features.shape)

In [52]:
def KNN(train_features, train_labels, test_features, k=10):
    vali_pred = []
    for i in range(test_features.shape[0]):
        x = test_features[i, :]  
        distances = np.sqrt(np.sum((x - train_features) ** 2, axis=1))
        topk_idx = np.argpartition(distances, k)[:k]
        topk_labels = list(train_labels[topk_idx])
        pred = max(topk_labels, key=topk_labels.count)
        vali_pred.append(pred)
    return np.array(vali_pred)

# Grabbing predictions
math_vali_pred = KNN(math_train_features, math_train_labels, math_vali_features, k=10)

# Some calculations for accuracy
count = 0
margin = 1  # For when the guess is close, but not quite exact
for i in range(math_vali_pred.size):
    # print(str(math_vali_pred[i]) + ' ' +str(math_vali_labels[i]))
    if math_vali_pred[i] + margin == math_vali_labels[i] or math_vali_pred[i] - margin == math_vali_labels[i] or math_vali_pred[i] == math_vali_labels[i]:
        count += 1

print("Accuracy: " + str(count / math_vali_features.shape[0]))


Accuracy: 0.9746835443037974


Accuracy has been improved to > 90% by standardizing certain columns and performing one-hot encoding