In [39]:
import sys
import regex as re
import numpy as np
import io
import matplotlib.pyplot as plt
import pandas as pd
import math
import pickle
from numpy.linalg import norm
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout, Lambda, TimeDistributed
from keras.layers import LSTM, Bidirectional, SimpleRNN, BatchNormalization, GRU
from keras.models import Model, load_model
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.activations import softmax
from copy import deepcopy
from google.colab import drive
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.ensemble import RandomForestClassifier
from operator import itemgetter
from datetime import datetime
import pytz
# Mount Google Drive at /content/drive so the paths below resolve inside Colab.
drive.mount('/content/drive', force_remount=True)


# Project root on the mounted drive. INPUT_DIR is where rawdata.csv is read
# from; OUTPUT_DIR is where the processed .npy arrays are written.
BASE_DIR = '/content/drive/My Drive/Colab Notebooks/thesis/'
INPUT_DIR = BASE_DIR + 'data/akribian/raw/'
OUTPUT_DIR = BASE_DIR + 'data/akribian/processed/rkt/'

# Passed to process_rkt as its third argument
# (presumably the window length -- confirm in preprocessing.py).
TIME_STEPS = 300
# Relation-matrix entries below this value are set to zero when thresholding.
THETA = 0.4

!cp '/content/drive/My Drive/Colab Notebooks/thesis/lib/preprocessing.py' .
from preprocessing import progressBar, process_rkt

Mounted at /content/drive


## Load the data

In [40]:
# Read only the columns this notebook needs. Note that DataFrames are mutable:
# later cells reassign `data` and overwrite some of its columns.
columns = ['StudentId', 'ExerciseTitle', 'SubmissionOutcomes', 'FinishedOn']
raw_csv_path = INPUT_DIR + "rawdata.csv"
data = pd.read_csv(raw_csv_path, usecols=columns)
data.columns

Index(['StudentId', 'ExerciseTitle', 'SubmissionOutcomes', 'FinishedOn'], dtype='object')

## Create title to index dictionary

In [41]:
# Build the title <-> index mapping. Index 0 is reserved for 'PADDING'.
# The titles are sorted so the mapping is the same in every session: iteration
# order of a set of strings changes between interpreter runs (hash
# randomization), which would make indices stored in the saved arrays
# impossible to map back to titles later. key=str keeps the sort well defined
# even if a title is missing (NaN) in the CSV.
exercises = ['PADDING'] + sorted(set(data['ExerciseTitle']), key=str)
number_of_exercises = len(exercises)
exercise_to_index = {title: idx for idx, title in enumerate(exercises)}
index_to_exercise = {idx: title for title, idx in exercise_to_index.items()}
print("Number of exercises: %i" % number_of_exercises)

Number of exercises: 787



## Turn exercises into indices

In [42]:
print("Converting dates...")
# utc=True always yields tz-aware UTC timestamps, so subtracting the tz-aware
# epoch below also works when the CSV timestamps are naive or carry mixed
# offsets. Differences from the epoch are unchanged for tz-aware input.
data['FinishedOn'] = pd.to_datetime(data['FinishedOn'], utc=True)
print("Done!")
# Unix epoch as a tz-aware UTC datetime. It is constructed directly because
# datetime.utcfromtimestamp() is deprecated in recent Python versions.
epoch = datetime(1970, 1, 1, tzinfo=pytz.UTC)
def process_row(row):
    """Convert one interaction row to its numeric representation.

    The row is modified and returned:
      * 'ExerciseTitle' is replaced by its entry in `exercise_to_index`.
      * 'SubmissionOutcomes' is reduced to the integer value of its first
        element, capped at 1.
      * 'FinishedOn' is replaced by the number of hours elapsed since `epoch`.
    """
    title = row['ExerciseTitle']
    row['ExerciseTitle'] = exercise_to_index[title]
    # Only the first outcome is used; any value above 1 collapses to 1.
    first_outcome = int(row['SubmissionOutcomes'][0])
    row['SubmissionOutcomes'] = min(first_outcome, 1)
    # Keep the time as a plain number (hours since epoch) for easier storage.
    elapsed = row['FinishedOn'] - epoch
    row['FinishedOn'] = elapsed.total_seconds() / 3600.
    return row
print("Processing rows...")
# Run process_row once per row; axis="columns" is the named form of axis=1.
data = data.apply(process_row, axis="columns")
print("Done!")

Converting dates...
Done!
Processing rows...
Done!


In [43]:
# Collect every column into one list per student, in row order.
# `list` is passed as the single aggregation so the result keeps flat column
# names; wrapping a function in braces would build a set, which pandas treats
# as a list of aggregations and labels with an extra column level.
series = data.groupby(['StudentId']).agg(list)

In [44]:
student_sequences = series.to_numpy()
print("Average exercises per student: ")
# Column 0 holds each student's list of exercise indices.
np.mean([len(history) for history in student_sequences[:, 0]])

Average exercises per student: 


618.7053571428571

In [45]:
# Contingency table, later used to calculate the relation matrix.
# Axes: (exercise i, exercise j, outcome pair); the last axis has one slot for
# each of the four combinations of two binary answers.
contingency_shape = (number_of_exercises, number_of_exercises, 4)
contingency_table = np.zeros(contingency_shape)

In [46]:
def update_contingency_matrix(student_data, table=None):
  """Add one student's pairwise outcome counts to a contingency table.

  For every position i in the student's history and every earlier position
  j < i, the cell [exercise_i, exercise_j, 2 * answer_j + answer_i] is
  incremented by one.

  Args:
    student_data: indexable whose item 0 is the sequence of exercise indices
      and whose item 1 is the matching sequence of integer answers (0 or 1).
    table: array updated in place. When omitted, the module-level
      `contingency_table` is used.
  """
  if table is None:
    table = contingency_table
  exercises = np.asarray(student_data[0], dtype=np.intp)
  answers = np.asarray(student_data[1], dtype=np.intp)
  for idx_i in range(1, len(exercises)):
    # Binary indexing for the third dimension: the earlier answer is the high
    # bit, the current answer the low bit. Computed for all j < i at once.
    index_z = 2 * answers[:idx_i] + answers[idx_i]
    # np.add.at accumulates repeated (i, j, z) triples; a fancy-indexed `+= 1`
    # would count each distinct triple only once.
    np.add.at(table, (exercises[idx_i], exercises[:idx_i], index_z), 1)

In [None]:
# student_sequences is an array of arrays of lists; each row is one student's
# entire history, with indices 0: exercise index, 1: correctness, 2: timestamp.
# For example student_sequences[x, 1] is student x's correctness values.
print("Generating contingency matrix...")
# Start from zero so re-running this cell does not add every count twice.
contingency_table.fill(0)
for idx, sequence in enumerate(student_sequences):
  progressBar(idx, len(student_sequences))
  update_contingency_matrix(sequence)
progressBar(1, 1)

Generating contingency matrix...
Progress: [______________🐌     🏠] 76 %

In [None]:
# Relation matrix of phi coefficients between exercises.
# Axis 0 is exercise i, axis 1 is exercise j. The last axis of the contingency
# table holds the 2x2 counts: [0] is n00, [1] is n01, [2] is n10, [3] is n11.
n00, n01, n10, n11 = (contingency_table[..., k] for k in range(4))
numerator = n11 * n00 - n01 * n10
denominator = np.sqrt((n00 + n01) * (n10 + n11) * (n00 + n10) * (n01 + n11))
# Pairs whose denominator is zero keep a relation of 0.
relation_matrix = np.divide(
    numerator,
    denominator,
    out=np.zeros((number_of_exercises, number_of_exercises)),
    where=denominator != 0,
)

In [None]:
# Figure 1: relations among the first 200 exercises, before thresholding
plt.figure(num=1, figsize=(8, 8), dpi=100)
plt.imshow(relation_matrix[:200, :200])
plt.colorbar()

# Zero out (in place) every relation below THETA, then show the same region
below_theta = relation_matrix < THETA
relation_matrix[below_theta] = 0.
plt.figure(num=2, figsize=(8, 8), dpi=100)
plt.imshow(relation_matrix[:200, :200])
plt.colorbar()


In [None]:
# Pick a random real exercise. Index 0 is the 'PADDING' placeholder that was
# prepended to the title list, so it is excluded from the draw.
exercise_index = np.random.randint(1, number_of_exercises)
print("Performance on exercise %s is strongly influenced by performance on the following exercises: " % index_to_exercise[exercise_index])
relations = relation_matrix[exercise_index,:]
sorted_relations = sorted(enumerate(relations), key=itemgetter(1))
# Walk the ascending sort from its end to list the (up to) five strongest
# relations; slicing avoids an IndexError when fewer than five entries exist.
for related_index, correlation in sorted_relations[::-1][:5]:
  print("%s, correlation %.2f" % (index_to_exercise[related_index], correlation))

In [None]:
# Hand the per-student histories to the project helper process_rkt, together
# with the number of exercises and TIME_STEPS, and unpack its four results.
# NOTE(review): this rebinds `exercises`, which earlier held the list of
# exercise titles; from here on it is the first value returned by process_rkt.
print("Processing and windowing sequences...")
exercises, past_answers, timestamps, labels = process_rkt(student_sequences, number_of_exercises, TIME_STEPS)

In [None]:
# Write each processed array, plus the relation matrix, to OUTPUT_DIR.
# Note that `past_answers` is stored under the file name 'interactions.npy'.
outputs = [
    ('exercises.npy', exercises),
    ('interactions.npy', past_answers),
    ('timestamps.npy', timestamps),
    ('labels.npy', labels),
    ('relation_matrix.npy', relation_matrix),
]
for filename, array in outputs:
    with open(OUTPUT_DIR + filename, 'wb') as file:
        np.save(file, array)