Link: https://www.kaggle.com/datasets/spscientist/students-performance-in-exams?resource=download

In [21]:
import numpy as np
import pandas as pd

df = pd.read_csv('DataSets\stud_perf_math_reading_writing.csv')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


turn categorical attributes into numerical attributes


In [22]:
df = df[[c for c in df if c not in ['math score']] + ['math score']]

ord_education = ['some high school', 'high school', 'some college', "associate's degree", "bachelor's degree", "master's degree"]
for num, level in enumerate(ord_education):
    df.replace(to_replace=level, value=num, inplace=True)

feats_to_one_hot_encode = ['gender', 'race/ethnicity', 'lunch', 'test preparation course']
df = pd.get_dummies(df, prefix=feats_to_one_hot_encode)

feats_to_standardize = ['parental level of education', 'reading score', 'writing score']
for feat in feats_to_standardize:
    df[feat] = (df[feat] - df[feat].mean()) / df[feat].std()

features = df.iloc[:, :df.shape[1] - 1].values
labels = df.loc[:, 'math score'].values

In [23]:
def generateRandom(data, features, labels):
  perm_idx = np.random.permutation(data.shape[0])
  vali_num = int(data.shape[0] * 0.2)
  vali_idx = perm_idx[:vali_num]
  train_idx = perm_idx[vali_num:]
  train_features = features[train_idx, :]
  train_labels = labels[train_idx]
  vali_features = features[vali_idx, :]
  vali_labels = labels[vali_idx]
  return train_features, train_labels, vali_features, vali_labels

train_features, train_labels, vali_features, vali_labels = generateRandom(df, features, labels)
print(train_features.shape)
print(train_labels.shape)
print(vali_features.shape)
print(vali_labels.shape)

(800, 14)
(800,)
(200, 14)
(200,)


In [24]:
def KNN(train_features, train_labels, test_features, k=10):
    vali_pred = []
    for i in range(test_features.shape[0]):
        x = test_features[i, :]  
        distances = np.sqrt(np.sum((x - train_features) ** 2, axis=1))
        topk_idx = np.argpartition(distances, k)[:k]
        topk_labels = list(train_labels[topk_idx])
        pred = max(topk_labels, key=topk_labels.count)
        vali_pred.append(pred)
    return np.array(vali_pred)
for k_tuner in range(5, 16):
    pred = KNN(train_features, train_labels, vali_features, k=k_tuner)

    count = 0
    margin = 1
    for i in range(pred.size):
        if vali_labels[i] - margin <= pred[i] and pred[i] <= vali_labels[i] + margin:
            count += 1

    print("K: {}, Accuracy: {}".format(k_tuner, str(count / vali_features.shape[0])))

K: 5, Accuracy: 0.985
K: 6, Accuracy: 0.985
K: 7, Accuracy: 0.98
K: 8, Accuracy: 0.98
K: 9, Accuracy: 0.975
K: 10, Accuracy: 0.975
K: 11, Accuracy: 0.975
K: 12, Accuracy: 0.975
K: 13, Accuracy: 0.975
K: 14, Accuracy: 0.97
K: 15, Accuracy: 0.97
