In [80]:
import sys
import regex as re
import numpy as np
import io
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import pandas as pd
import math
import pickle
import operator
from numpy.linalg import norm
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize
from sklearn.model_selection import KFold
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout, Lambda, TimeDistributed
from keras.layers import LSTM, Bidirectional, SimpleRNN, BatchNormalization, GRU, Masking
from keras.models import Model, load_model
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.activations import softmax
from keras.metrics import AUC
from keras.backend import clear_session
from copy import deepcopy
from google.colab import drive
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

drive.mount('/content/drive', force_remount=True)
#drive.mount('/content/drive')


BASE_DIR = '/content/drive/My Drive/Colab Notebooks/thesis/'
#BASE_DIR = ''

DATASET = "akribian"
# DATASET = "assistments"

# Training parameters
TIME_STEPS = 25
BATCH_SIZE = 128
EPOCHS = 4
PADDING = 0
EMBEDDINGS_DIM = 5
NUMBER_OF_FEATURES = 3
LSTM_UNITS = 400

# Cross validation
K = 5

# File paths 
INPUT_PATH = "inputs.npy" if DATASET == "akribian" else "inputs-assistments.npy"
OUTPUT_PATH = "outputs.npy" if DATASET == "akribian" else "outputs-assistments.npy"

Mounted at /content/drive


In [81]:
embeddings_file = open(BASE_DIR + "category_to_embeddings.pkl", "rb")
category_to_embeddings = pickle.load(embeddings_file)
print(category_to_embeddings)
embeddings_file.close()

{'PADDING': array([ 1.06880248, -0.71511453,  0.98465462, -0.34064778,  2.39971325]), 'M04T03: Math Facts (combined FF 4) - Fluency A': array([-0.32443767, -0.24923709,  0.1139133 , -0.05560679,  0.63169662]), 'M04T11: Fluency C - Missing Number (combined FF 6)': array([ 0.85989046,  0.97292641,  0.41904727, -1.32003558, -0.10584627]), 'M02T01: Drag and Drop Speed Assessment 30s Day 3': array([ 0.78911105, -1.3202608 , -2.01094419, -1.6793465 , -1.64414388]), 'M02T01: Drag and Drop Speed Assessment 15s Day 1': array([-1.48371336,  0.29315825, -1.58222845,  1.11229394, -2.14330785]), 'M04T08: Fluency B - Math Facts (x+2, 2+x, x-2 combined)': array([-0.23327135,  2.42320814, -1.01254494,  0.81883576, -2.03578465]), 'M04T08: x_2=y': array([ 1.64906557, -1.29154834, -0.3173267 , -0.5804502 , -1.09192761]), 'M04T11: Introduction to FF 066': array([ 1.30670874,  0.28769286, -0.9855363 , -2.27348637, -1.91593878]), 'M02T02: GT - See equal nums/Select equal sign (3 choices)': array([-1.7879440

In [82]:
X = None
model = load_model(BASE_DIR + "model-" + DATASET)
with open(BASE_DIR + INPUT_PATH, 'rb') as f:
    X = np.load(f)
idx = np.random.choice(len(X))
x = X[idx]
model.summary()
print(x[-1])
x.shape

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking_1 (Masking)          (None, 25, 13)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 400)               662400    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 401       
Total params: 662,801
Trainable params: 662,801
Non-trainable params: 0
_________________________________________________________________
[-1.13480097 -0.11385855 -0.52001148 -0.80132933  0.6108568  -1.13480097
 -0.11385855 -0.52001148 -0.80132933  0.6108568   1.          0.34244367
  1.        ]


(25, 13)

In [83]:
categories = category_to_embeddings.keys()
index_to_category = {idx:name for idx, name in enumerate(categories)}
category_to_index = {name:idx for idx, name in enumerate(categories)}
X = np.ndarray((len(categories), x.shape[0], x.shape[1]))
for idx, category in enumerate(categories):
  next_embeddings = category_to_embeddings[category]
  X[idx, :, :] = x
  X[idx, -1, EMBEDDINGS_DIM : EMBEDDINGS_DIM * 2] = next_embeddings

In [84]:
predictions = model.predict(X)

In [85]:
def find_category_from_embeddings(embeddings):
  best_difference = float("inf")
  best_category = "Unkown"
  for category in categories:
    diff = norm(embeddings - category_to_embeddings[category])
    if diff < best_difference:
      best_difference = diff
      best_category = category
  return best_category

In [86]:
preds_with_indices = list(enumerate(predictions[:,0].tolist()))
preds_with_indices.sort(key=operator.itemgetter(1))
preds_with_indices = list(preds_with_indices)
worst = preds_with_indices[:5]
best = preds_with_indices[-5:]
best.reverse()
worst_names = list(map(lambda x: index_to_category[x[0]], worst))
best_names = list(map(lambda x: index_to_category[x[0]], best))
print("Current category: " + find_category_from_embeddings(x[-1, :EMBEDDINGS_DIM]))
print("\nBest categories:")
for name in best_names:
  print(name)
print("\nWorst categories:")
for name in worst_names:
  print(name)

Current category: M04T03: Math Facts (combined FF 4) - Fluency B

Best categories:
M04T08: GT x+_=y Activity Intro
M04T11: Introduction to FF 156 Activity Intro
M04T03: Missing Number (FF 044) - Fluency A
M04T08: GT x-_=y
M04T11: GT - Missing Number (FF 336)

Worst categories:
M02T03: AI - Find the NOT equal number
M04T09: Introduction to FF 224 Activity Intro
M04T10: Fluency B - Math Facts (combined FF 5)
M04T11: Introduction to FF 336
M04T05: GT - Add or Take Away
