In [1]:
import sys
import regex as re
import numpy as np
import io
import matplotlib.pyplot as plt
import pandas as pd
import math
import h5py
from numpy.linalg import norm
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout, Lambda, TimeDistributed
from keras.layers import LSTM, Bidirectional, SimpleRNN, BatchNormalization, GRU
from keras.models import Model, load_model
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.activations import softmax
from copy import deepcopy
from google.colab import drive
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.ensemble import RandomForestClassifier
drive.mount('/content/drive', force_remount=True)


BASE_DIR = '/content/drive/My Drive/Colab Notebooks/thesis/'
INPUT_DIR = BASE_DIR + 'data/junyi_academy/raw/'
OUTPUT_DIR = BASE_DIR + 'data/junyi_academy/processed/sakt/'

TIME_STEPS = 20
ROWS_PER_READ = 1000000

!cp '/content/drive/My Drive/Colab Notebooks/thesis/lib/preprocessing.py' .
from preprocessing import process_sakt_inplace

Mounted at /content/drive


## Load the data

In [2]:
print("Calculating total number of windows for dataset...")
data = pd.read_csv(INPUT_DIR + "cleaned.csv", encoding = "cp850")
series = data.groupby(['uuid']).agg({lambda x: list(x)})
series_np = series.to_numpy()

# Get categories
categories = np.array([np.array(x) for x in series_np[:,0]], dtype='object')
unique_categories = set(np.concatenate(categories))
number_of_exercises = len(unique_categories) + 1 

number_of_sequences = len(series_np)
total_number_of_windows = 0
for idx in range(number_of_sequences):
  number_of_samples = len(series_np[idx][0])
  number_of_windows = max(1, number_of_samples - TIME_STEPS + 1)
  total_number_of_windows += number_of_windows
print("Total number of windows: %i" % total_number_of_windows)
print("Number of exercise categories: %i" % number_of_exercises)
del data
del series
del series_np

Calculating total number of windows for dataset...
Total number of windows: 15060349
Number of exercise categories: 16217312


In [3]:
read_index = 0
# Remember that DataFrames are immutable
while(True):
  # Read rows
  data = pd.read_csv(INPUT_DIR + "cleaned.csv", encoding = "cp850", nrows=ROWS_PER_READ, skiprows=[i for i in range(1, read_index)])
  print("Pandas: read %i rows" % len(data))
  if len(data) == 0:
    break;
  # Group rows
  series = data.groupby(['uuid']).agg({lambda x: list(x)})
  series_np = series.to_numpy()

  lengths = [len(x[0]) for x in series_np]
  print("Mean sequence length for this selection of students: %.2f" % np.mean(lengths))  

  #Drop first column (redundant order column)
  series_np = series_np[:,1:3]

  print("Number of unique exercise categories: %i" % number_of_exercises)
  exs, ins, outs = process_sakt_inplace(series_np, number_of_exercises, TIME_STEPS)

  if read_index == 0:
    with h5py.File(OUTPUT_DIR + "processed.h5", 'w') as hf:
      hf.create_dataset('exercises',    data=exs,   maxshape=(None, TIME_STEPS), dtype='int', chunks=True)
      hf.create_dataset('interactions', data=ins,   maxshape=(None, TIME_STEPS), dtype='int', chunks=True)
      hf.create_dataset('labels',       data=outs,  maxshape=(None, TIME_STEPS), dtype='int', chunks=True)
  else:
    with h5py.File(OUTPUT_DIR + "processed.h5", 'a') as hf:
      hf['exercises'].resize((hf['exercises'].shape[0] + exs.shape[0]), axis = 0)
      hf['exercises'][-exs.shape[0]:] = exs

      hf['interactions'].resize((hf['interactions'].shape[0] + ins.shape[0]), axis = 0)
      hf['interactions'][-ins.shape[0]:] = ins

      hf['labels'].resize((hf['labels'].shape[0] + outs.shape[0]), axis = 0)
      hf['labels'][-outs.shape[0]:] = outs

  del data
  del series
  del series_np
  read_index += ROWS_PER_READ

print("Dataset processing done!")

Pandas: read 1000000 rows
Mean sequence length for this selection of students: 18.00
Number of unique exercise categories: 16217312
Number of sequences: 55546
Total number of samples: 1000000
Total number of windows: 563439
Dataset inflation ratio: 11.268780
🐌🏠
Done!
Windows/final index: 563439/563439 
Pandas: read 1000000 rows
Mean sequence length for this selection of students: 18.10
Number of unique exercise categories: 16217312
Number of sequences: 55250
Total number of samples: 1000000
Total number of windows: 542836
Dataset inflation ratio: 10.856720
🐌🏠
Done!
Windows/final index: 542836/542836 
Pandas: read 1000000 rows
Mean sequence length for this selection of students: 18.41
Number of unique exercise categories: 16217312
Number of sequences: 54328
Total number of samples: 1000000
Total number of windows: 574985
Dataset inflation ratio: 11.499700
🐌🏠
Done!
Windows/final index: 574985/574985 
Pandas: read 1000000 rows
Mean sequence length for this selection of students: 17.44
Num