In [35]:
import sys
import regex as re
import numpy as np
import io
import matplotlib.pyplot as plt
import pandas as pd
import math
import pickle
from numpy.linalg import norm
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout, Lambda, TimeDistributed
from keras.layers import LSTM, Bidirectional, SimpleRNN, BatchNormalization, GRU
from keras.models import Model, load_model
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.activations import softmax
from copy import deepcopy
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

LOCAL = True
BASE_DIR = '../'

if not LOCAL:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    BASE_DIR = '/content/drive/My Drive/Colab Notebooks/thesis/'

sys.path.append(BASE_DIR + 'lib')
sys.path.append(BASE_DIR + 'config')
from preprocessing import process_sakt, process_sakt_inplace
import dataset_parameters as params


DATASET = 'akribian'
# DATASET = 'assistments_2009'

INPUT_DIR = BASE_DIR + 'data/' + DATASET + '/raw/'
OUTPUT_DIR = BASE_DIR + 'data/' + DATASET + '/processed/sakt/'

TIME_STEPS = params.time_steps_dict[DATASET]

## Load the data

In [36]:
# Remember that DataFrames are immutable
columns = params.columns_dict[DATASET]
data = pd.read_csv(INPUT_DIR + "rawdata.csv", usecols=columns)
data = data[columns]
data.columns

Index(['StudentId', 'LearningSequenceTitle', 'SubmissionOutcomes'], dtype='object')

In [37]:
# Dataset-dependent actions
if DATASET == 'assistments_2009':
    data["skill_name"].replace('', np.nan, inplace=True)
    data.dropna(subset=['skill_name'], inplace=True)

## Create title to index dictionary

In [38]:
categories = ['PADDING'] + list(set(data.iloc[:, 1]))
number_of_exercises = len(categories)
category_to_index = {cat:idx for (idx, cat) in enumerate(categories)}
index_to_category = {v:k for (k,v) in category_to_index.items()}

In [39]:
def pr_akribian(row):
    # Turn category into an embedding
    row['LearningSequenceTitle'] = category_to_index[row['LearningSequenceTitle']]
    # Use only first outcome
    row['SubmissionOutcomes'] = int(row['SubmissionOutcomes'][0])
    return row

def pr_assistments_2009(row):
    # Turn category into an embedding
    row['skill_name'] = category_to_index[row['skill_name']]
    return row

process_row_dict = {
    'akribian': pr_akribian,
    'assistments_2009': pr_assistments_2009,   
}

process_row = process_row_dict[DATASET]
data = data.apply(process_row, axis=1)


## Turn categories into indices

In [40]:
group_column_name = params.columns_dict[DATASET][0]
series = data.groupby([group_column_name]).agg({lambda x: list(x)})

In [41]:
series_np = series.to_numpy()
print("Average exercises per student: ")
np.mean(list(map(len,series_np[:,0])))
#list(map(len,series_np[:,0]))

Average exercises per student: 


618.7053571428571

In [42]:
exs, ins, outs = process_sakt_inplace(series_np, number_of_exercises, TIME_STEPS)

Number of sequences: 112
Total number of samples: 69295
Total number of windows: 42563
Dataset inflation ratio: 184.268706
🐌🏠
Done!
Windows/final index: 42563/42563 


In [43]:
with open(OUTPUT_DIR + 'exercises.npy','wb') as file:
    np.save(file, exs)
with open(OUTPUT_DIR + 'interactions.npy','wb') as file:
    np.save(file, ins)
with open(OUTPUT_DIR + 'labels.npy','wb') as file:
    np.save(file, outs)

In [44]:
print(exs.shape)
print(ins.shape)
print(outs.shape)

(42563, 300)
(42563, 300)
(42563, 300)


In [45]:
idx = 100
print(exs[idx])
print(ins[idx])
print(outs[idx])

[214. 214. 214. 214. 214. 214.  39. 203. 203. 203. 203. 203. 203. 203.
 203. 203. 203. 203. 203. 203. 203. 203. 203. 203. 203. 203. 203.  34.
 155. 119. 119. 119. 119. 119. 119. 119. 119. 119. 119. 119. 119. 119.
 119. 151. 210. 210. 210. 210. 210. 210. 210. 210. 210. 210. 210. 210.
 210. 210. 210. 210. 210. 210. 210.   7.  24.  24.  24.  24.  24.  24.
  24.  24.  24.  24.  24.  24. 193.  33.  33.  33.  33.  33.  33.  33.
  33.  33.  33.  33.  33.  33.  33.  33.  33.  33.  33.  33.  33.  33.
  33. 185. 185. 185. 185. 185. 185. 185. 185. 185. 185. 185. 185. 185.
 185. 185. 185. 185. 185. 185. 185. 185. 185. 185. 181.  35. 197.  71.
  78.  78.  78.  78.  78.  78.  78. 228. 104. 104. 104. 104. 104. 104.
 104. 212. 103. 206. 206. 206. 206. 206. 206. 128. 128. 128. 128. 128.
 128. 128. 128.  93.  93.  93.  93.  93.  93.  93.  93.  93.  93. 132.
 132. 132. 132. 132. 132. 132. 132. 132. 132. 132. 132. 219. 124.  87.
  87.  87.  87.  87.  87.  53.  53.  53.  53.  53.  53.  53.  53.  53.
  55. 

In [46]:
print(len(list(set(ins.flatten()))))
print(number_of_exercises)

444
239
