Since we are trying to predict four different groups:

3: the assessment was solved on the first attempt

2: the assessment was solved on the second attempt

1: the assessment was solved after 3 or more attempts

0: the assessment was never solved

The baseline model will be appropriate to try logistic regression

In [0]:
import seaborn as sns
import numpy as np
import pandas as pd

# SK-learn libraries for learning.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [8]:
# load data from github
url = 'https://raw.githubusercontent.com/skylerroh/kaggle-educational-app-predictions/skyler-initial-feature-extraction/installation_features.csv?token=AM75BKLOXDWECHSD56UCWK26MW3WK'
df = pd.read_csv(url)

df.head()

Unnamed: 0,installation_id,game_session_y,event_count,game_time,num_unique_days,elapsed_days,last_world,unique_game_sessions,title_y,num_correct,num_incorrect,accuracy,accuracy_group,activity_ct,game_ct,assessment_ct,clip_ct
0,0006a69f,77b8ee947eb84b4e,80570,75295894,1,0 days 00:00:00.000000000,TREETOPCITY,26,Bird Measurer (Assessment),0,11,0.0,0,556.0,525.0,135.0,14.0
1,0006a69f,9501794defd84e4d,135226,196920049,1,0 days 00:00:00.000000000,TREETOPCITY,48,Mushroom Sorter (Assessment),1,1,0.5,2,1103.0,845.0,229.0,24.0
2,0006c192,197a373a77101924,141995,140663742,2,1 days 00:00:00.000000000,MAGMAPEAK,26,Cauldron Filler (Assessment),1,0,1.0,3,894.0,633.0,24.0,15.0
3,0006c192,957406a905d59afd,195450,201045937,4,18 days 00:00:00.000000000,TREETOPCITY,48,Bird Measurer (Assessment),1,1,0.5,2,1206.0,643.0,343.0,30.0
4,0006c192,b2297d292892745a,175327,179033827,4,18 days 00:00:00.000000000,TREETOPCITY,44,Mushroom Sorter (Assessment),0,4,0.0,0,1206.0,637.0,143.0,28.0


In [24]:
df.shape

(9386, 17)

In [62]:
# Split them into development and trainning dataset
features = ['game_time', 'unique_game_sessions', 'activity_ct', 'game_ct', 'assessment_ct', 'clip_ct']
X = df[features].to_numpy()
Y = df[['accuracy_group']].to_numpy().flatten()

# Shuffle the data, but make sure that the features and accompanying labels stay in sync.
np.random.seed(0)
shuffle = np.random.permutation(np.arange(X.shape[0]))
X, Y = X[shuffle], Y[shuffle]
# Convert NaN values to 0.
X = np.nan_to_num(X)

# Split into train and dev.
num_dev = int(len(Y)*0.1)
train_data, train_labels = X[num_dev:], Y[num_dev:]
dev_data, dev_labels = X[:num_dev], Y[:num_dev]

print ('Feature names:', features)
print('training label shape:', train_labels.shape)
print('dev label shape:', dev_labels.shape)
print('labels names:', np.unique(Y))

Feature names: ['game_time', 'unique_game_sessions', 'activity_ct', 'game_ct', 'assessment_ct', 'clip_ct']
training label shape: (8448,)
dev label shape: (938,)
labels names: [0 1 2 3]


In [63]:
train_data

array([[8.31179740e+07, 1.00000000e+01, 2.86000000e+02, 1.30000000e+02,
        4.90000000e+01, 6.00000000e+00],
       [1.90716868e+08, 4.70000000e+01, 9.68000000e+02, 6.41000000e+02,
        1.88000000e+02, 2.20000000e+01],
       [1.79541306e+09, 1.31000000e+02, 2.15300000e+03, 2.57500000e+03,
        1.28700000e+03, 5.40000000e+01],
       ...,
       [9.74151020e+07, 1.50000000e+01, 4.53000000e+02, 3.95000000e+02,
        1.48000000e+02, 6.00000000e+00],
       [1.11226860e+07, 6.00000000e+00, 4.30000000e+01, 0.00000000e+00,
        1.48000000e+02, 4.00000000e+00],
       [1.00755022e+08, 2.00000000e+01, 4.32000000e+02, 5.81000000e+02,
        1.28000000e+02, 9.00000000e+00]])

In [61]:
# Test different C values for Logistic Regression Model
candidate_c = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5,
               1, 5, 10, 50, 100, 1000]

best_f1 = -1
best_c = -1
for c in candidate_c:
    lr_model = LogisticRegression(penalty="l2",
                                  C = c,
                                  solver="liblinear",
                                  multi_class="auto",
                                  max_iter = 1000)
    lr_model.fit(train_data, train_labels)
    lr_predict_labels = lr_model.predict(dev_data)
    f1_score = metrics.f1_score(dev_labels, lr_predict_labels, average="weighted")
    print("C = %f, F1-score=%f:" % (c, f1_score))
    for class_index in range(lr_model.coef_.shape[0]):
        squared_sum = np.sum(np.square(lr_model.coef_[class_index]))
        print("\t for class: %d (%s), sum of squared weight: %f" %
             (class_index, np.unique(Y)[class_index], squared_sum)) 
    if f1_score > best_f1:
        best_f1 = f1_score
        best_c = c

print("Optimal value for C in Logistic Regression: %f" % best_c)
print("F1 score of Logistic Regression: %f" % best_f1)

C = 0.000100, F1-score=0.158655:
	 for class: 0 (0), sum of squared weight: 0.000000
	 for class: 1 (1), sum of squared weight: 0.000000
	 for class: 2 (2), sum of squared weight: 0.000000
	 for class: 3 (3), sum of squared weight: 0.000000
C = 0.001000, F1-score=0.158655:
	 for class: 0 (0), sum of squared weight: 0.000000
	 for class: 1 (1), sum of squared weight: 0.000000
	 for class: 2 (2), sum of squared weight: 0.000000
	 for class: 3 (3), sum of squared weight: 0.000000
C = 0.010000, F1-score=0.158655:
	 for class: 0 (0), sum of squared weight: 0.000000
	 for class: 1 (1), sum of squared weight: 0.000000
	 for class: 2 (2), sum of squared weight: 0.000000
	 for class: 3 (3), sum of squared weight: 0.000000
C = 0.050000, F1-score=0.158655:
	 for class: 0 (0), sum of squared weight: 0.000000
	 for class: 1 (1), sum of squared weight: 0.000000
	 for class: 2 (2), sum of squared weight: 0.000000
	 for class: 3 (3), sum of squared weight: 0.000000
C = 0.100000, F1-score=0.158655:
	 f

In [67]:
  # KNN Method
  k_value = [k for k in range(1, 15, 2)]
  f1_k = []
  for k in k_value:
    model1 = KNeighborsClassifier(n_neighbors=k)
    model1.fit(train_data, train_labels)
    p_label_1 = model1.predict(dev_data)
    accuracy_k = model1.score(dev_data, dev_labels)
    score1 = metrics.f1_score(dev_labels, p_label_1, average = "weighted")
    f1_k.append(score1)
    print("k value : %2d, F1 score: %.2f" % (k, score1))
  # Get the optimized f1 score and k
  max_f1_k = max(f1_k)
  max_index1 = f1_k.index(max_f1_k)
  print("The optimal k value: %2d, and optimal F1 score: %.2f" % (k_value[max_index1], max_f1_k), "\n")

k value :  1, F1 score: 0.32
k value :  3, F1 score: 0.35
k value :  5, F1 score: 0.35
k value :  7, F1 score: 0.35
k value :  9, F1 score: 0.35
k value : 11, F1 score: 0.35
k value : 13, F1 score: 0.36
The optimal k value: 13, and optimal F1 score: 0.36 



In [68]:
  # NB Method
  alpha = [10**x for x in range(-3,2)]
  f1_nb = []
  for a in alpha:
      model2 = MultinomialNB(alpha=a)
      model2.fit(train_data, train_labels)
      p_label_2 = model2.predict(dev_data)
      score2 = metrics.f1_score(dev_labels,p_label_2,average="weighted")
      f1_nb.append(score2)
      print("alpha value : %.3f, F1 score: %.2f" % (a, score2))
  # Get the optimized f1 score and alpha
  max_f1_nb = max(f1_nb)
  max_index2 = f1_nb.index(max_f1_nb)
  print("The optimal alpha value: %.3f, and optimal F1 score: %.2f"% (alpha[max_index2], max_f1_nb), '\n')

alpha value : 0.001, F1 score: 0.34
alpha value : 0.010, F1 score: 0.34
alpha value : 0.100, F1 score: 0.34
alpha value : 1.000, F1 score: 0.34
alpha value : 10.000, F1 score: 0.34
The optimal alpha value: 0.001, and optimal F1 score: 0.34 

