In [None]:
import sys
import regex as re
import numpy as np
import io
import matplotlib.pyplot as plt
import pandas as pd
import math
import pickle
from numpy.linalg import norm
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout, Lambda, TimeDistributed
from keras.layers import LSTM, Bidirectional, SimpleRNN, BatchNormalization, GRU
from keras.models import Model, load_model
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.activations import softmax
from copy import deepcopy
from google.colab import drive
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.ensemble import RandomForestClassifier
# Mount Google Drive into the Colab filesystem at /content/drive (the paths in
# the configuration below live under this mount point). force_remount=True
# requests a fresh mount even if the drive is already mounted.
drive.mount('/content/drive', force_remount=True)


# --- Paths (all rooted in the mounted Google Drive) -------------------------
BASE_DIR = '/content/drive/My Drive/Colab Notebooks/thesis/'
#BASE_DIR = ''
INPUT_DIR = f'{BASE_DIR}data/akribian/raw/'             # rawdata.csv is read from here
OUTPUT_DIR = f'{BASE_DIR}data/akribian/processed/dkt/'  # inputs.npy / outputs.npy are written here

# --- Sequence construction ---------------------------------------------------
TIME_STEPS = 25          # number of interactions in one training window
STEP_SHIFT = 1           # stride (in interactions) between consecutive windows
PADDING = 0              # fill value used to left-pad short histories
EMBEDDINGS_DIM = 5       # length of the random vector assigned to each category
NUMBER_OF_FEATURES = 3   # scalar features per step: answer, response time, attempts

# --- Training hyper-parameters -----------------------------------------------
# Not referenced by the preprocessing cells visible here; presumably consumed by
# the training code -- verify before changing.
BATCH_SIZE = 128
EPOCHS = 10
VALIDATION_RATIO = 0.3

Mounted at /content/drive


## Load the data

In [None]:
# Read the raw export and discard every column the preprocessing does not use.
# DataFrame.drop returns a new frame rather than modifying its input, so the
# result has to be bound to `data`.
columns_to_drop = [
    'StudentName',
    'TrainingPlanId', 'TrainingPlanName',
    'LearningModuleId', 'LearningModuleName',
    'LearningTaskId', 'LearningTaskTitle',
    'LearningSequenceId',
    'ExerciseId', 'ExerciseTitle',
    'SubmissionAnswers', 'SubmissionResponseTimes', 'ExerciseOutcome',
    'FinishedOn', 'ExerciseResultId',
    'LearningSequenceSessionId',
]
data = pd.read_csv(INPUT_DIR + "rawdata.csv").drop(columns=columns_to_drop)
# Display the remaining column names as the cell output.
data.columns

Index(['StudentId', 'LearningSequenceTitle', 'StudentSessionId',
       'SubmissionOutcomes', 'ExerciseResponseTime'],
      dtype='object')

## Create title-to-embedding dictionary

In [None]:
# Seed for the random category embeddings. Without a fixed seed every kernel
# restart would assign different vectors to the same title, so previously saved
# arrays/models would no longer match a freshly generated dictionary.
EMBEDDINGS_SEED = 42

# Category vocabulary: every distinct LearningSequenceTitle plus a 'PADDING'
# pseudo-category. The titles are sorted because set iteration order for strings
# differs between Python sessions; key=str keeps the sort well-defined even if
# the column contains non-string values.
categories = ['PADDING'] + sorted(set(data['LearningSequenceTitle']), key=str)

# Compress using random embeddings (compressed sensing): each category gets an
# EMBEDDINGS_DIM-long vector drawn from a standard normal distribution.
embeddings_rng = np.random.default_rng(EMBEDDINGS_SEED)
category_to_embeddings = {}
for category in categories:
    category_to_embeddings[category] = embeddings_rng.normal(0.0, 1.0, EMBEDDINGS_DIM)

In [None]:
# Persist the category -> embedding mapping next to the notebook data. The
# `with` block guarantees the file is closed even if pickling raises.
with open(BASE_DIR + "category_to_embeddings.pkl", "wb") as embeddings_file:
    pickle.dump(category_to_embeddings, embeddings_file)

## Compute per-category mean and standard deviation of response time (used for z-scoring below)

In [None]:
# Per-category statistics of ExerciseResponseTime, keyed by category title.
# Fallbacks keep the later z-score well-defined: an undefined mean (no rows for
# the category) becomes 0, and an undefined or zero standard deviation becomes 1.
category_means = {}
category_stds = {}

for category in categories:
  in_category = data["LearningSequenceTitle"] == category
  response_times = data.loc[in_category, "ExerciseResponseTime"]

  mean = response_times.mean()
  mean = 0. if math.isnan(mean) else mean
  category_means[category] = mean

  std = response_times.std()
  std = 1. if (math.isnan(std) or std == 0) else std
  category_stds[category] = std


## Turn categories into embeddings, z-score response times, and extract the number of attempts

In [None]:
# Placeholder column so that every row already has a 'NumberOfAttempts' slot
# for process_row to fill in.
data["NumberOfAttempts"] = ""


def process_row(row):
    """Convert one raw interaction row into model-ready values.

    Reads the module-level dictionaries ``category_means``, ``category_stds``
    and ``category_to_embeddings``.

    Parameters
    ----------
    row : pandas.Series
        One row of ``data``. Uses 'LearningSequenceTitle',
        'ExerciseResponseTime' and 'SubmissionOutcomes' (a comma-separated
        string with one outcome per attempt).

    Returns
    -------
    pandas.Series
        A copy of ``row`` in which
        * 'ExerciseResponseTime' is z-scored with its category's mean/std,
        * 'LearningSequenceTitle' is replaced by the category's embedding,
        * 'NumberOfAttempts' is 1, 2 or 3 (3 meaning "three or more"),
        * 'SubmissionOutcomes' is the integer outcome of the first attempt.

    Raises
    ------
    KeyError
        If the row's title is missing from one of the lookup dictionaries.
    ValueError
        If the first outcome token is not an integer.
    """
    # pandas documents that functions passed to DataFrame.apply must not mutate
    # the object they receive, so all edits are made on a copy.
    row = row.copy()
    title = row['LearningSequenceTitle']

    # Calculate z-score for response time
    mean = category_means[title]
    std = category_stds[title]
    row['ExerciseResponseTime'] = (row['ExerciseResponseTime'] - mean) / std

    # Turn category into an embedding
    row['LearningSequenceTitle'] = category_to_embeddings[title]

    # Extract number of attempts
    # 1 means first try, 2 means 2 tries, 3 means 3 or more
    outcomes = row['SubmissionOutcomes']
    row['NumberOfAttempts'] = min(1 + outcomes.count(','), 3)

    # Use only first outcome. Split on the separator instead of taking the
    # first character so multi-character tokens are parsed in full.
    row['SubmissionOutcomes'] = int(outcomes.split(',')[0])
    return row


# NOTE: this overwrites `data` with its processed form, so the cell is not
# re-runnable on its own (titles are no longer valid dictionary keys afterwards);
# re-run from the data-loading cell instead.
data = data.apply(process_row, axis=1)

In [None]:
# Collapse the frame to one row per student: after removing the session id,
# every remaining column is aggregated into a Python list of that student's
# values. The set-wrapped lambda is kept as-is so the resulting column
# structure is unchanged.
per_student = data.drop(columns=['StudentSessionId']).groupby(['StudentId'])
series = per_student.agg({lambda values: list(values)})
number_of_users = len(series)
print("Number of users: %i" % number_of_users)

Number of users: 112


In [None]:
# Convert the per-student frame to a NumPy array: one row per student, one
# list-valued cell per aggregated column. Downstream code indexes these cells
# by position (0..3), so the column order of `series` matters.
series_np = series.to_numpy()

In [None]:
def convert_series_to_data(series, time_steps=None, step_shift=None, padding=None):
    """Turn one student's interaction history into fixed-length training windows.

    At step ``t`` the feature vector is the concatenation of
    ``[exercise_t, exercise_{t+1}, answer_t + 1, response_time_t, attempts_t]``
    and the label describes the answer given at step ``t + 1``
    (``1.0`` if that outcome is 0, ``0.0`` if it is 1 or larger).

    The student's final interaction has no successor, so it is used only as the
    "next exercise"/label of the step before it. (Building the shifted arrays
    with ``np.roll`` would wrap around and pair the last step with the student's
    *first* answer, yielding one wrongly labelled sample per student.)

    Parameters
    ----------
    series : sequence of four equally long lists, indexed by position
        ``series[0]`` exercise embeddings (1-D arrays of equal length),
        ``series[1]`` first-attempt outcomes, ``series[2]`` response times,
        ``series[3]`` number of attempts.
    time_steps : int, optional
        Window length. Defaults to the module-level ``TIME_STEPS``.
    step_shift : int, optional
        Stride between windows. Defaults to the module-level ``STEP_SHIFT``.
    padding : number, optional
        Fill value for left padding. Defaults to the module-level ``PADDING``.

    Returns
    -------
    final_features : np.ndarray, shape (n_windows, time_steps, n_features)
        ``n_features`` is twice the embedding length plus three scalar features.
    final_labels : np.ndarray, shape (n_windows, 1)
        Label of the last step of each window.
        ``n_windows`` is 0 when the history has fewer than two interactions.
    """
    if time_steps is None:
        time_steps = TIME_STEPS
    if step_shift is None:
        step_shift = STEP_SHIFT
    if padding is None:
        padding = PADDING

    exercises = np.array(series[0])
    answers = np.array(series[1])
    response_times = np.array(series[2])
    number_of_attempts = np.array(series[3])

    if len(exercises) == 0:
        # Nothing to infer the feature width from; fall back to the configured one.
        feature_dim = EMBEDDINGS_DIM * 2 + NUMBER_OF_FEATURES
        return np.empty((0, time_steps, feature_dim)), np.empty((0, 1))

    # Pair every step with its successor by slicing (no wrap-around).
    # Answer features are shifted by +1 so that 0 stays reserved for padding.
    features = np.concatenate(
        (
            exercises[:-1],
            exercises[1:],
            (answers[:-1] + 1).reshape(-1, 1),
            response_times[:-1].reshape(-1, 1),
            number_of_attempts[:-1].reshape(-1, 1),
        ),
        axis=1,
    )
    # Outcomes above 1 are clamped to 1, then inverted: outcome 0 -> label 1.0.
    labels = 1.0 - np.minimum(answers[1:], 1)

    number_of_steps = len(features)
    if number_of_steps == 0:
        # A single interaction has no "next answer" to predict.
        return np.empty((0, time_steps, features.shape[1])), np.empty((0, 1))

    # Left-pad short histories up to one full window (rows only, not columns).
    pad_length = max(time_steps - number_of_steps, 0)
    padded_features = np.pad(features, ((pad_length, 0), (0, 0)), 'constant', constant_values=padding)
    padded_labels = np.pad(labels, (pad_length, 0), 'constant', constant_values=padding)

    # Drop entries to the left so we have a discrete number of shifts
    number_of_samples = len(padded_features)
    entries_to_drop = (number_of_samples - time_steps) % step_shift
    padded_features = padded_features[entries_to_drop:]
    padded_labels = padded_labels[entries_to_drop:]

    number_of_sequences = (number_of_samples - time_steps) // step_shift + 1

    final_features = np.empty((number_of_sequences, time_steps, features.shape[1]))
    final_labels = np.empty((number_of_sequences, 1))

    for i in range(number_of_sequences):
        start = i * step_shift
        final_features[i] = padded_features[start:start + time_steps]
        # The label of a window is the label of its last step.
        final_labels[i, 0] = padded_labels[start + time_steps - 1]

    return final_features, final_labels

In [None]:
# Build the full dataset by converting every student's history into windows and
# stacking the results.
samples = len(series_np)
inputs = None
outputs = None

# Collect per-student arrays and concatenate once at the end; growing the arrays
# with np.append inside the loop would copy everything on each iteration.
input_chunks = []
output_chunks = []
# The loop variable is deliberately not named `series`: that name holds the
# grouped DataFrame, and overwriting it would break re-running the cell that
# calls `series.to_numpy()`.
for student_row in series_np:
    input_array, output_array = convert_series_to_data(student_row)
    input_chunks.append(input_array)
    # Flatten so `outputs` is always 1-D, regardless of the number of students.
    output_chunks.append(output_array.ravel())

# With no students at all, `inputs` and `outputs` stay None.
if input_chunks:
    inputs = np.concatenate(input_chunks, axis=0)
    outputs = np.concatenate(output_chunks)

In [None]:
# Write the feature tensor and the label vector to OUTPUT_DIR in NumPy's .npy
# format, one file per array.
for filename, array in (('inputs.npy', inputs), ('outputs.npy', outputs)):
    with open(OUTPUT_DIR + filename, 'wb') as file:
        np.save(file, array)

In [None]:
# Sanity check shown as the cell output: True if any NaN made it into the
# feature tensor (the expected result is False).
np.isnan(inputs).any()

False