In [118]:
import numpy as np
import pandas as pd

df = pd.read_csv('DataSets/student_prediction.csv')
df.head()

Unnamed: 0,STUDENTID,AGE,GENDER,HS_TYPE,SCHOLARSHIP,WORK,ACTIVITY,PARTNER,SALARY,TRANSPORT,...,PREP_STUDY,PREP_EXAM,NOTES,LISTENS,LIKES_DISCUSS,CLASSROOM,CUML_GPA,EXP_GPA,COURSE ID,GRADE
0,STUDENT1,2,2,3,3,1,2,2,1,1,...,1,1,3,2,1,2,1,1,1,1
1,STUDENT2,2,2,3,3,1,2,2,1,1,...,1,1,3,2,3,2,2,3,1,1
2,STUDENT3,2,2,2,3,2,2,2,2,4,...,1,1,2,2,1,1,2,2,1,1
3,STUDENT4,1,1,1,3,1,2,1,2,1,...,1,2,3,2,2,1,3,2,1,1
4,STUDENT5,2,2,1,3,2,2,1,3,1,...,2,1,2,2,2,1,2,2,1,1


In [119]:
# remove unnecessary columns
feats_to_drop = ['STUDENTID', 'COURSE ID', 'CLASSROOM', 'KIDS', 'MOTHER_JOB', 'FATHER_JOB', '#_SIBLINGS', 'LIVING', 'TRANSPORT']
df = df.drop(feats_to_drop, axis=1)
# one-hot encode certain features
feats_to_one_hot_encode = ['GENDER', 'HS_TYPE', 'ACTIVITY', 'PARTNER', 'MOTHER_EDU', 'FATHER_EDU']
df = pd.get_dummies(df, columns=feats_to_one_hot_encode)
# standardize feats
feats_to_standardize = [feat for feat in df.columns.tolist() if (feat not in feats_to_one_hot_encode and feat not in ['GRADE'])]
print(feats_to_standardize)
for feat in feats_to_standardize:
    df[feat] = (df[feat] - df[feat].mean()) / df[feat].std()
# split dataframe into numpy array of features and values
features = df.iloc[:, :df.shape[1] - 1].values
labels = df.loc[:, 'GRADE'].values
print(np.unique(labels))

['AGE', 'SCHOLARSHIP', 'WORK', 'SALARY', 'STUDY_HRS', 'READ_FREQ', 'READ_FREQ_SCI', 'ATTEND_DEPT', 'IMPACT', 'ATTEND', 'PREP_STUDY', 'PREP_EXAM', 'NOTES', 'LISTENS', 'LIKES_DISCUSS', 'CUML_GPA', 'EXP_GPA', 'GENDER_1', 'GENDER_2', 'HS_TYPE_1', 'HS_TYPE_2', 'HS_TYPE_3', 'ACTIVITY_1', 'ACTIVITY_2', 'PARTNER_1', 'PARTNER_2', 'MOTHER_EDU_1', 'MOTHER_EDU_2', 'MOTHER_EDU_3', 'MOTHER_EDU_4', 'MOTHER_EDU_5', 'MOTHER_EDU_6', 'FATHER_EDU_1', 'FATHER_EDU_2', 'FATHER_EDU_3', 'FATHER_EDU_4', 'FATHER_EDU_5', 'FATHER_EDU_6']
[0 1 2 3 4 5 6 7]


In [120]:
def generateRandom(data, features, labels):
  perm_idx = np.random.permutation(data.shape[0])
  vali_num = int(data.shape[0] * 0.2)
  vali_idx = perm_idx[:vali_num]
  train_idx = perm_idx[vali_num:]
  train_features = features[train_idx, :]
  train_labels = labels[train_idx]
  vali_features = features[vali_idx, :]
  vali_labels = labels[vali_idx]
  return train_features, train_labels, vali_features, vali_labels

train_features, train_labels, vali_features, vali_labels = generateRandom(df, features, labels)
print(train_features.shape)
print(train_labels.shape)
print(vali_features.shape)
print(vali_labels.shape)
print(np.unique(vali_labels))

(116, 38)
(116,)
(29, 38)
(29,)
[0 1 2 3 4 5 6 7]


In [121]:
def KNN(train_features, train_labels, test_features, k=10):
    vali_pred = []
    for i in range(test_features.shape[0]):
        x = test_features[i, :]  
        distances = np.sqrt(np.sum((x - train_features) ** 2, axis=1))
        topk_idx = np.argpartition(distances, k)[:k]
        topk_labels = list(train_labels[topk_idx])
        pred = max(topk_labels, key=topk_labels.count)
        vali_pred.append(pred)
    return np.array(vali_pred)
for k_tuner in range(5, 16):
    pred = KNN(train_features, train_labels, vali_features, k=k_tuner)
    count = 0
    margin = 1
    for i in range(pred.size):
        if vali_labels[i] - margin <= pred[i] and pred[i] <= vali_labels[i] + margin:
            count += 1

    print("K: {}, Accuracy: {}".format(k_tuner, str(count / vali_features.shape[0])))

K: 5, Accuracy: 0.7931034482758621
K: 6, Accuracy: 0.7931034482758621
K: 7, Accuracy: 0.7931034482758621
K: 8, Accuracy: 0.7931034482758621
K: 9, Accuracy: 0.8275862068965517
K: 10, Accuracy: 0.8275862068965517
K: 11, Accuracy: 0.8620689655172413
K: 12, Accuracy: 0.8275862068965517
K: 13, Accuracy: 0.7931034482758621
K: 14, Accuracy: 0.8275862068965517
K: 15, Accuracy: 0.7586206896551724
