In [1]:
import sys
import regex as re
import numpy as np
import io
import matplotlib.pyplot as plt
import pandas as pd
import math
import h5py
import time
import os
import datetime
import pickle

# --- Runtime environment ----------------------------------------------------
# LOCAL switches between a local checkout and Google Colab, where the project
# lives on a mounted Drive folder.
LOCAL = True
BASE_DIR = '../'

if not LOCAL:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    BASE_DIR = '/content/drive/My Drive/Colab Notebooks/thesis/'

# Make the project's helper and configuration modules importable.
sys.path.extend([BASE_DIR + 'lib', BASE_DIR + 'config'])
from preprocessing import process_sakt, save_h5, transpose_list, process_one_feature, select_from_rows, progressBar
import dataset_parameters as params

# Number of seconds in one day; used as the upper clip bound for lag times.
ONE_DAY = 24 * 60 * 60

# --- Dataset selection --------------------------------------------------------
# Keep exactly one DATASET assignment active.
# DATASET = 'akribian'
# DATASET = 'assistments_2012'
DATASET = 'junyi_academy'
# DATASET = 'ednet'

# --- Input / output locations -------------------------------------------------
_DATA_ROOT = BASE_DIR + 'data/' + DATASET
INPUT_DIR = _DATA_ROOT + '/raw/'
OUTPUT_DIR = _DATA_ROOT + '/processed/'
# IN_FILE_NAME = 'rawdata.csv'
# IN_FILE_NAME = 'sorted.csv'
IN_FILE_NAME = 'transformed.csv'
FILE_NAME = 'processed.h5'

# --- Processing options -------------------------------------------------------
# When enabled, rows whose elapsed_time is not below MAX_TIME are dropped.
DELETE_TIME_OUTLIERS = True
MAX_TIME = params.time_scale_dict[DATASET]

# Per-dataset sequence settings taken from the shared parameter module.
TIME_STEPS = params.time_steps_dict[DATASET]
STRIDE = params.stride_dict[DATASET]
SHUFFLE = params.shuffle_dict[DATASET]

# Split settings forwarded to save_h5.
# NOTE(review): exact semantics of split_sections live in save_h5 — confirm there.
VALIDATION_RATIO = params.val_ratio_dict[DATASET]
TEST_RATIO = 0.01
SPLIT_SECTIONS = 20

# Number of CSV rows loaded per chunk.
ROWS_PER_READ = 10_000_000

# Padding mode handed to the sequence builders.
# PADDING='pre'
PADDING = 'post'

## Load the data

In [2]:
# Dataset-specific settings looked up from the shared parameter module.
columns = params.columns_dict['generic']  # columns to read from the CSV
group_column = columns[0]                 # rows are later grouped by this column
encoding = params.encodings_dict[DATASET]
time_scale = params.time_scale_dict[DATASET]
number_of_exercises = params.exercise_dict[DATASET]
number_of_ids = params.exercise_id_dict[DATASET]
# A non-positive value makes the progress message omit the total chunk count.
number_of_rows = -1

# Index mappings stored as pickles in OUTPUT_DIR
# (presumably written by an earlier preprocessing step — verify).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(OUTPUT_DIR + "category_to_idx.pkl", "rb") as category_file:
    category_to_index = pickle.load(category_file)
with open(OUTPUT_DIR + "id_to_idx.pkl", "rb") as id_file:
    id_to_index = pickle.load(id_file)

In [3]:
# Inspect the category -> index mapping loaded from category_to_idx.pkl.
print(category_to_index)

{'PADDING': 0, '7f73q332BKPBXaixasa4EkUb+pF6VAsLxNIg4506JJs=': 1, 'R81Sqc8LAYj8amTPwFRvoPgbGpdaZoQLNX0hTg0DMB4=': 2, 'rzRcsBurW8jbUhivGAdZozPksRAZ5xM898ohJEBg93g=': 3, 'MfUX4BrIuFzJjm97tCQVisXbonyvtYtwCUJo6JpmoyU=': 4, 'jXSXg7CfDboPEXlnqJTGuQOb0VIgOXCpaU/Sl+/m3n0=': 5, '2YwsqJH0U7Zguyun1OaStQsIHbUoYvgJNK0QCGC5BQI=': 6, '5Np4fxxPeBgmNpeEOcXqarZIVsOEzZ1fSssL8cytQAc=': 7, '1EzKLzTq9Ax8/wlR9cJNrtthvk9lBi/SFdx/4L1PIaE=': 8, 'xYDz4OEv0xsri1IpmXlrgMLJ848rgySf+39xWpq4DBI=': 9, 'ICgke8JJv5eapCPwyj1aco8PEtoBkUbTZYIqxmYtqBk=': 10}


In [29]:
# Read head position (number of CSV data rows consumed so far)
read_index = 0
# Accumulators for the global average sequence length across all chunks
weighted_sum = 0
number_of_entries = 0

# Start from a clean output file. Only a missing file is expected here; any
# other failure (e.g. a permission problem) should surface instead of being
# silently swallowed.
try:
  os.remove(OUTPUT_DIR + FILE_NAME)
except FileNotFoundError:
  pass

# Create the HDF5 file with a placeholder dataset
# (presumably so later appending save_h5 calls find an existing file — verify).
with h5py.File(OUTPUT_DIR + FILE_NAME, 'w') as hf:
  hf.create_dataset("dummy", (1,))

# Chunked processing loop: read ROWS_PER_READ rows at a time, build the
# per-group feature arrays and append them to the HDF5 output file.
# NOTE(review): grouping is done per chunk, so a group whose rows straddle a
# chunk boundary ends up as separate sequences — confirm this is acceptable.
h5_path = OUTPUT_DIR + FILE_NAME
# Split settings shared by every save_h5 call below.
split_kwargs = dict(validation_ratio=VALIDATION_RATIO, test_ratio=TEST_RATIO, split_sections=SPLIT_SECTIONS)

while True:
  # Print progress
  chunk_number = math.ceil(read_index / ROWS_PER_READ) + 1
  if number_of_rows > 0:
    print("Reading %i/%i..." % (chunk_number, math.ceil(number_of_rows / ROWS_PER_READ) + 1))
  else:
    print("Reading %i..." % chunk_number)

  start_time = time.time()

  # Read rows. File line 0 is the header, so the read_index data rows already
  # consumed are file lines 1..read_index inclusive. The previous upper bound
  # of read_index (exclusive) re-read the last row of the first chunk.
  data = pd.read_csv(INPUT_DIR + IN_FILE_NAME, encoding=encoding, nrows=ROWS_PER_READ, skiprows=range(1, read_index + 1), usecols=columns)

  print("Data read in %.2f seconds" % (time.time() - start_time))

  # If all the rows are read, break out of the loop
  if len(data) == 0:
    print("Out of usable data, breaking...")
    break

  # The first chunk creates the datasets, later chunks append to them.
  append = read_index > 0

  if DELETE_TIME_OUTLIERS:
    data = data[data['elapsed_time'] < MAX_TIME]

  print("90th percentile elapsed time for this selection of students: %.2f" % np.percentile(data['elapsed_time'], 90))
  print("Mean elapsed time for this selection of students: %.2f" % np.mean(data['elapsed_time']))

  # No per-row processing happens at this point; the messages are kept so the
  # log output stays unchanged.
  sys.stdout.write("Processing data... ")

  print("Data processed.")

  # Group rows: one list per remaining column for every value of group_column
  sys.stdout.write("Grouping data... ")
  series = data.groupby([group_column]).agg({lambda x: list(x)})
  del data
  series_np = series.to_numpy()
  del series
  print("Done.")
  sys.stdout.write("Transposing data... ")
  series_transposed = transpose_list(series_np)
  print("Done.")

  # Print sequence length statistics and update the global accumulators
  lengths = [len(x[0]) for x in series_np]
  avg_length = np.mean(lengths)
  weight = len(lengths)
  print("Average sequence length for this selection of students: %.2f" % avg_length)
  weighted_sum += avg_length * weight
  number_of_entries += weight
  print("Max sequence length for this selection of students: %.2f" % np.max(lengths))
  print("90th percentile sequence length for this selection of students: %.2f" % np.percentile(lengths, 90))
  print("95th percentile sequence length for this selection of students: %.2f" % np.percentile(lengths, 95))
  print("99th percentile sequence length for this selection of students: %.2f" % np.percentile(lengths, 99))

  # Process the rows
  sys.stdout.write("Running process_sakt... ")
  exs, ins, outs, ids = process_sakt(series_np, number_of_ids, TIME_STEPS, stride=STRIDE, padding=PADDING)

  # One permutation is reused for every feature so rows stay aligned across
  # datasets. NOTE(review): no random seed is set, so the shuffle order is not
  # reproducible between runs.
  indices = np.arange(len(exs))
  if SHUFFLE:
    np.random.shuffle(indices)

  print("Done.")
  del series_np
  save_h5(h5_path, 'exercises', exs[indices], append=append, **split_kwargs)
  print("Saved exercises!")
  del exs
  save_h5(h5_path, 'exercise_ids', ids[indices], append=append, **split_kwargs)
  print("Saved exercise ids!")
  del ids
  save_h5(h5_path, 'interactions', ins[indices], append=append, **split_kwargs)
  print("Saved interactions!")
  del ins
  save_h5(h5_path, 'labels', outs[indices], append=append, **split_kwargs)
  print("Saved labels!")
  del outs

  # NOTE(review): the series_transposed[k] indices below depend on the column
  # order produced by the groupby above — verify against params.columns_dict.

  # Past label
  past_label = process_one_feature(series_transposed[1], TIME_STEPS, shift_data=True, dtype='float', stride=STRIDE, padding=PADDING)
  save_h5(h5_path, 'past_labels', past_label[indices], append=append, **split_kwargs)
  print("Saved past labels!")
  del past_label

  # Past elapsed time
  past_elapsed = process_one_feature(series_transposed[3], TIME_STEPS, shift_data=(DATASET != 'ednet'), dtype='float', stride=STRIDE, padding=PADDING)
  # past_elapsed = np.clip(past_elapsed, 0., 300.)
  save_h5(h5_path, 'elapsed', past_elapsed[indices], append=append, dtype='float', **split_kwargs)
  print("Saved elapsed time!")

  # Lag time: difference between the unshifted and shifted timestamp
  # sequences, minus the past elapsed time
  timestamps = process_one_feature(series_transposed[4], TIME_STEPS, shift_data=False, dtype='float', stride=STRIDE, padding=PADDING)
  timestamps_shifted = process_one_feature(series_transposed[4], TIME_STEPS, shift_data=True, dtype='float', stride=STRIDE, padding=PADDING)
  timestamp_difference = timestamps - timestamps_shifted
  timestamp_difference = np.round(timestamp_difference - past_elapsed)
  del past_elapsed
  # Clip the lag to [0, ONE_DAY] and divide by 60, giving values in
  # [0, 1440] (the lag bound used by SAINT+). The elapsed-time clip to
  # [0, 300] is currently disabled above.
  timestamp_difference = np.clip(timestamp_difference, 0., ONE_DAY) / 60.
  save_h5(h5_path, 'timestamps', timestamp_difference[indices], append=append, dtype='float', **split_kwargs)
  print("Saved timestamps!")
  del timestamp_difference
  del timestamps_shifted
  del timestamps

  # Current elapsed time
  current_elapsed = process_one_feature(series_transposed[3], TIME_STEPS, shift_data=(DATASET == 'ednet'), shift_forward=True, dtype='float', stride=STRIDE, padding=PADDING)
  save_h5(h5_path, 'current_elapsed', current_elapsed[indices], append=append, dtype='float', **split_kwargs)
  print("Saved current elapsed time!")
  del current_elapsed

  # Current mean elapsed time
  current_elapsed_mean = process_one_feature(series_transposed[6], TIME_STEPS, shift_data=(DATASET == 'ednet'), shift_forward=True, dtype='float', stride=STRIDE, padding=PADDING)
  save_h5(h5_path, 'current_elapsed_mean', current_elapsed_mean[indices], append=append, dtype='float', **split_kwargs)
  print("Saved current_elapsed_mean!")
  del current_elapsed_mean

  # Current mean correctness
  current_correctness_mean = process_one_feature(series_transposed[7], TIME_STEPS, shift_data=(DATASET == 'ednet'), shift_forward=True, dtype='float', stride=STRIDE, padding=PADDING)
  save_h5(h5_path, 'current_correctness_mean', current_correctness_mean[indices], append=append, dtype='float', **split_kwargs)
  print("Saved current_correctness_mean!")
  del current_correctness_mean

  # Past elapsed-time z-score
  past_elapsed_zscore = process_one_feature(series_transposed[5], TIME_STEPS, shift_data=(DATASET != 'ednet'), dtype='float', stride=STRIDE, padding=PADDING)
  save_h5(h5_path, 'past_elapsed_zscore', past_elapsed_zscore[indices], append=append, dtype='float', **split_kwargs)
  print("Saved past_elapsed_zscore!")
  del past_elapsed_zscore

  # Disabled experimental features (current_elapsed_zscore, past_latency,
  # current_latency) are not exported; re-add them with the same
  # process_one_feature + save_h5 pattern if they are needed again.

  del series_transposed
  print("Done.")

  # Advance the read head to the next chunk
  read_index += ROWS_PER_READ

  # test break, uncomment to only use first n rows
  # break

print("Dataset processing done!")
# Guard against an empty input file, which would otherwise divide by zero.
if number_of_entries > 0:
  print("Global average sequence length for this dataset: %.2f" % (weighted_sum / number_of_entries))
else:
  print("No sequences were processed; global average sequence length is undefined.")

Reading 1...
Data read in 10.48 seconds
90th percentile elapsed time for this selection of students: 33.75
Mean elapsed time for this selection of students: 20.74
Processing data... Data processed.
Grouping data... Done.
Transposing data... Done.
Average sequence length for this selection of students: 223.30
Max sequence length for this selection of students: 13354.00
90th percentile sequence length for this selection of students: 512.00
95th percentile sequence length for this selection of students: 1018.00
99th percentile sequence length for this selection of students: 2930.28
Running process_sakt... Done.
Saved exercises!
Saved exercise ids!
Saved interactions!
Saved labels!
Saved past labels!
Saved elapsed time!
Saved timestamps!
Saved current elapsed time!
Saved current_elapsed_mean!
Saved current_correctness_mean!
Saved past_elapsed_zscore!
Done.
Reading 2...
Data read in 18.26 seconds
90th percentile elapsed time for this selection of students: 33.67
Mean elapsed time for this s

In [30]:
# Print the shape of the core datasets for every split, in the order
# train -> val -> test.
idx = 100  # not referenced in this cell
feature_names = ('exercises', 'exercise_ids', 'interactions', 'labels', 'elapsed', 'timestamps')
with h5py.File(OUTPUT_DIR + FILE_NAME, 'r') as hf:
  for split in ('train', 'val', 'test'):
    for feature in feature_names:
      print(hf[feature + '_' + split].shape)

(385020, 1728)
(385020, 1728)
(385020, 1728)
(385020, 1728)
(385020, 1728)
(385020, 1728)
(20360, 1728)
(20360, 1728)
(20360, 1728)
(20360, 1728)
(20360, 1728)
(20360, 1728)
(3980, 1728)
(3980, 1728)
(3980, 1728)
(3980, 1728)
(3980, 1728)
(3980, 1728)


In [31]:
# Sanity-check the exercise category datasets: number of distinct values and
# the maximum value per split. Each split is read from disk once and reduced
# with numpy instead of pushing every element through a Python set.
category_stats = {}
with h5py.File(OUTPUT_DIR + FILE_NAME, 'r') as hf:
  # Add 'test' to the tuple to include the test split.
  for split in ('train', 'val'):
    exercises = hf['exercises_' + split][:]
    category_stats[split] = (len(np.unique(exercises)), np.max(exercises))
    del exercises  # release the split before loading the next one
  train = hf['labels_train'][:].flatten()

print("Unique exercise categories:")
print(category_stats['train'][0])
print(category_stats['val'][0])
print("Max of exercise categories:")
print(category_stats['train'][1])
print(category_stats['val'][1])

Unique exercise categories:
8
8
Max of exercise categories:
8
8


In [32]:
# Count the rows whose exercise ids are all zero, i.e. rows consisting only of padding.
print("Number of padding-only entries:")
with h5py.File(OUTPUT_DIR + FILE_NAME, 'r') as hf:
  # The test split is not checked here; add 'test' to the tuple to include it.
  for split in ('train', 'val'):
    exercise_ids = hf['exercise_ids_' + split][:]
    is_padding_only = np.all(exercise_ids == 0, axis=1)
    print(np.count_nonzero(is_padding_only))

Number of padding-only entries:
0
0


In [33]:
# Print one example row from each stored feature so the alignment between
# features can be checked by eye.
# NOTE(review): the header says "test" but every dataset read below is a
# "_val" dataset — confirm which one is intended.
index = 100
sample_sections = [
  ("\nID, cat, interaction:", ('exercise_ids_val', 'exercises_val', 'interactions_val')),
  ("\nLabel, past label:", ('labels_val', 'past_labels_val')),
  ("\nTimestamps:", ('timestamps_val',)),
  ("\nElapsed, past elapsed:", ('current_elapsed_val', 'elapsed_val')),
  ("\nMean elapsed:", ('current_elapsed_mean_val',)),
  ("\nMean correct:", ('current_correctness_mean_val',)),
]

print("Sample test data:")
with h5py.File(OUTPUT_DIR + FILE_NAME, 'r') as hf:
  for heading, dataset_names in sample_sections:
    print(heading)
    for dataset_name in dataset_names:
      print(hf[dataset_name][index])

Sample test data:

ID, cat, interaction:
[ 176 1279 2064 ...    0    0    0]
[2 3 4 ... 0 0 0]
[ 7877   176 14804 ...     0     0     0]

Label, past label:
[0 1 0 ... 0 0 0]
[0 0 1 ... 0 0 0]

Timestamps:
[0.11666667 0.06666667 0.51666667 ... 0.         0.         0.        ]

Elapsed, past elapsed:
[22. 17. 17. ...  0.  0.  0.]
[21. 22. 17. ...  0.  0.  0.]

Mean elapsed:
[23.30789581 18.82637356 18.82637356 ...  0.          0.
  0.        ]

Mean correct:
[0.64918336 0.16311924 0.63365769 ... 0.         0.         0.        ]
