In [18]:
import sys
import regex as re
import numpy as np
import io
import matplotlib.pyplot as plt
import pandas as pd
import math
import h5py
from numpy.linalg import norm
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout, Lambda, TimeDistributed
from keras.layers import LSTM, Bidirectional, SimpleRNN, BatchNormalization, GRU
from keras.models import Model, load_model
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.activations import softmax
from copy import deepcopy
from google.colab import drive
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime
# import pytz
drive.mount('/content/drive', force_remount=True)


BASE_DIR = '/content/drive/My Drive/Colab Notebooks/thesis/'
INPUT_DIR = BASE_DIR + 'data/assistments_2012/raw/'
OUTPUT_DIR = BASE_DIR + 'data/assistments_2012/processed/sakt_plus/'

TIME_STEPS = 20
ROWS_PER_READ = 1000000

!cp '/content/drive/My Drive/Colab Notebooks/thesis/lib/preprocessing.py' .
from preprocessing import process_sakt_plus

Mounted at /content/drive


## Load the data

In [6]:
print("Calculating total number of windows for dataset...")
columns = ['user_id', 'skill']
data = pd.read_csv(INPUT_DIR + "rawdata.csv", usecols=columns, encoding = "cp850")
series = data.groupby(['user_id']).agg({lambda x: list(x)})
series_np = series.to_numpy()

# Get categories
categories = np.array([np.array(x) for x in series_np[:,0]], dtype='object')
unique_categories = set(np.concatenate(categories))
exercises = ['PADDING'] + list(unique_categories)
number_of_exercises = len(exercises)
exercise_to_index = {cat:idx for (idx, cat) in enumerate(exercises)}
index_to_exercise = {v:k for (k,v) in exercise_to_index.items()}
print("Number of exercises: %i" % number_of_exercises)

number_of_sequences = len(series_np)
total_number_of_windows = 0
for idx in range(number_of_sequences):
  number_of_samples = len(series_np[idx][0])
  number_of_windows = max(1, number_of_samples - TIME_STEPS + 1)
  total_number_of_windows += number_of_windows
print("Total number of windows: %i" % total_number_of_windows)
print("Number of exercise categories: %i" % number_of_exercises)
del data
del series
del series_np

Calculating total number of windows for dataset...
Number of exercises: 200
Total number of windows: 5383952
Number of exercise categories: 200


In [14]:
print(exercise_to_index.keys())

dict_keys(['PADDING', 'Distributive Property', 'Quadratic Equation Solving', 'Histogram as Table or Graph', 'Conversion of Fraction Decimals Percents', 'English and Metric Terminology', 'Solve Quadratic Equations Using Factoring', 'Line Plot', 'Finding Slope from Ordered Pairs', 'Area Irregular Figure', 'Parallel and Perpendicular Lines', 'Volume Sphere', 'Finding Slope From Equation', 'Recognizing Equivalent Expressions', 'Parts of a Polyomial, Terms, Coefficient, Monomial, Exponent, Variable', 'Ordering Fractions', 'Complementary and Supplementary Angles', 'Scientific Notation', 'Addition Proper Fractions', 'Square Roots', 'Calculation with + - * /', 'Recognize Quadratic Pattern', 'Interior Angles Figures with More than 3 Sides', 'Graphing Linear Equations', 'Multiplication Positive Decimals', 'Reflection', 'Volume Prism', 'Bar Graph', 'Solving Systems of Linear Equations by Graphing', 'Percent Increase or Decrease', 'Geometric Definitions', 'Multiplication Whole Numbers', 'Expanded,

In [22]:
epoch = datetime.utcfromtimestamp(0)
# epoch = epoch.replace(tzinfo=pytz.UTC)

def process_row(row):
    # Turn category into an embedding
    row['skill'] = exercise_to_index[row['skill']]
    # Convert time to numeric for easier storage
    # Hours since epoch
    hours_since_epoch = (row['start_time'] - epoch).total_seconds() / 3600.
    row['start_time'] = hours_since_epoch
    return row

columns = ['user_id', 'skill', 'correct', 'start_time']
read_index = 0
while(True):
  # Read rows
  data = pd.read_csv( INPUT_DIR + "rawdata.csv",
                      encoding = "cp850", 
                      nrows=ROWS_PER_READ,
                      usecols=columns, 
                      skiprows=[i for i in range(1, read_index)])
  print("Pandas: read %i rows" % len(data))

  if len(data) == 0:
    break
  # Convert dates to datetime objects and remove rows without skill tag
  data['start_time'] = pd.to_datetime(data['start_time'])
  data["skill"].replace('', np.nan, inplace=True)
  data.dropna(subset=['skill'], inplace=True)
  
  # Process rows
  data = data.apply(process_row, axis=1)

  # Group rows
  series = data.groupby(['user_id']).agg({lambda x: list(x)})
  series_np = series.to_numpy()

  lengths = [len(x[0]) for x in series_np]
  print("Mean sequence length for this selection of students: %.2f" % np.mean(lengths))  

  print("Number of unique exercise categories: %i" % number_of_exercises)
  exs, ins, time_delta, outs = process_sakt_plus(series_np, number_of_exercises, TIME_STEPS)

  if read_index == 0:
    with h5py.File(OUTPUT_DIR + "processed.h5", 'w') as hf:
      hf.create_dataset('exercises',    data=exs,   maxshape=(None, TIME_STEPS), dtype='int', chunks=True)
      hf.create_dataset('interactions', data=ins,   maxshape=(None, TIME_STEPS), dtype='int', chunks=True)
      hf.create_dataset('time_delta', data=ins,   maxshape=(None, TIME_STEPS), dtype='float', chunks=True)
      hf.create_dataset('labels',       data=outs,  maxshape=(None, TIME_STEPS), dtype='int', chunks=True)
  else:
    with h5py.File(OUTPUT_DIR + "processed.h5", 'a') as hf:
      hf['exercises'].resize((hf['exercises'].shape[0] + exs.shape[0]), axis = 0)
      hf['exercises'][-exs.shape[0]:] = exs

      hf['interactions'].resize((hf['interactions'].shape[0] + ins.shape[0]), axis = 0)
      hf['interactions'][-ins.shape[0]:] = ins

      hf['time_delta'].resize((hf['time_delta'].shape[0] + ins.shape[0]), axis = 0)
      hf['time_delta'][-ins.shape[0]:] = time_delta

      hf['labels'].resize((hf['labels'].shape[0] + outs.shape[0]), axis = 0)
      hf['labels'][-outs.shape[0]:] = outs

  del data
  del series
  del series_np
  read_index += ROWS_PER_READ

print("Dataset processing done!")

Pandas: read 1000000 rows
Mean sequence length for this selection of students: 17.05
Number of unique exercise categories: 200
🐌🏠
Done!
Pandas: read 1000000 rows
Mean sequence length for this selection of students: 17.01
Number of unique exercise categories: 200
🐌🏠
Done!
Pandas: read 1000000 rows
Mean sequence length for this selection of students: 17.07
Number of unique exercise categories: 200
🐌🏠
Done!
Pandas: read 1000000 rows
Mean sequence length for this selection of students: 17.09
Number of unique exercise categories: 200
🐌🏠
Done!
Pandas: read 1000000 rows
Mean sequence length for this selection of students: 17.07
Number of unique exercise categories: 200
🐌🏠
Done!
Pandas: read 1000000 rows
Mean sequence length for this selection of students: 17.14
Number of unique exercise categories: 200
🐌🏠
Done!
Pandas: read 123271 rows
Mean sequence length for this selection of students: 3.43
Number of unique exercise categories: 200
🐌🏠
Done!
Pandas: read 0 rows
Dataset processing done!
