In [1]:
import sys
import regex as re
import numpy as np
import io
import matplotlib.pyplot as plt
import pandas as pd
import math
from numpy.linalg import norm
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, Dropout, Lambda, TimeDistributed
from keras.layers import LSTM, Bidirectional, SimpleRNN, BatchNormalization, GRU
from keras.models import Model, load_model
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.activations import softmax
from copy import deepcopy
from google.colab import drive
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.ensemble import RandomForestClassifier
# Mount Google Drive at /content/drive (Colab only); the BASE_DIR path
# configured below points inside this mount.
drive.mount('/content/drive', force_remount=True)


# --- Paths (all inside the Drive mount) ---
BASE_DIR = '/content/drive/My Drive/Colab Notebooks/thesis/'
# BASE_DIR = ''  # presumably an override for runs outside Drive; it must be
#                # set before INPUT_DIR / OUTPUT_DIR are derived to have any effect
INPUT_DIR = f'{BASE_DIR}data/assistments_2009/raw/'
OUTPUT_DIR = f'{BASE_DIR}data/assistments_2009/processed/dkt/'

# --- Windowing of each student's interaction sequence ---
TIME_STEPS = 25       # length of every training window (shorter histories are left-padded)
STEP_SHIFT = 1        # stride between the starts of consecutive windows
PADDING = 0           # constant used to fill padded positions

# --- Skill representation ---
EMBEDDINGS_DIM = 5    # size of the random vector assigned to each skill

# --- Training hyper-parameters ---
# Not referenced by the preprocessing cells visible here; presumably consumed
# by model-training code elsewhere.
BATCH_SIZE = 128
EPOCHS = 10
VALIDATION_RATIO = 0.3

Mounted at /content/drive


## Load the data

In [2]:
# Load the raw interaction log and keep only the columns that survive the
# drop list below (the recorded output shows user_id, correct, skill_name).
# DataFrames are mutable, but every step here rebinds `data` to a new frame
# instead of mutating in place, so the cell behaves the same under pandas
# Copy-on-Write (where a chained `data["col"].replace(..., inplace=True)`
# no longer updates `data`).
data = pd.read_csv(INPUT_DIR + "rawdata.csv", encoding = "cp850")
columns_to_drop = ['Unnamed: 0', 'order_id', 'assignment_id', 'assistment_id',
       'problem_id', 'original', 'attempt_count',
       'ms_first_response', 'tutor_mode', 'answer_type', 'sequence_id',
       'student_class_id', 'position', 'type', 'base_sequence_id', 'skill_id',
       'teacher_id', 'school_id', 'hint_count', 'hint_total',
       'overlap_time', 'template_id', 'answer_id', 'answer_text',
       'first_action', 'bottom_hint', 'opportunity', 'opportunity_original']
data = data.drop(columns_to_drop, axis='columns')

# Drop rows that lack a skill_name: empty strings are first normalised to NaN
# so that a single dropna removes both kinds of "missing".
print(len(data))
data = (
    data
    .assign(skill_name=lambda frame: frame["skill_name"].replace('', np.nan))
    .dropna(subset=['skill_name'])
)
print(len(data))
data.columns

  interactivity=interactivity, compiler=compiler, result=result)


346860
274590


Index(['user_id', 'correct', 'skill_name'], dtype='object')

## Create category to embedding dictionary

In [3]:
# Vocabulary: a dedicated 'PADDING' entry followed by every distinct skill.
# The skills are sorted because set iteration order for strings can change
# between interpreter runs, which would otherwise change which random vector
# each skill receives.
categories = ['PADDING'] + sorted(set(data['skill_name']))

# Fixed seed so the embeddings (and therefore the arrays derived from them)
# are identical on every Restart & Run All.
EMBEDDINGS_SEED = 0
embeddings_rng = np.random.RandomState(EMBEDDINGS_SEED)

# Compress using random embeddings (compressed sensing): each category gets
# an independent standard-normal vector of length EMBEDDINGS_DIM.
category_to_embeddings = {
    category: embeddings_rng.normal(0.0, 1.0, EMBEDDINGS_DIM)
    for category in categories
}


## Turn categories into embeddings

In [4]:
def process_row(row):
    """Return a copy of ``row`` with its skill name replaced by its embedding.

    Parameters
    ----------
    row : pandas.Series
        One interaction; ``row['skill_name']`` must be a key of
        ``category_to_embeddings``.

    Returns
    -------
    pandas.Series
        A new Series whose ``'skill_name'`` entry is the embedding vector.
        The input is left untouched, because functions passed to
        ``DataFrame.apply`` must not mutate the object they receive.

    Raises
    ------
    KeyError
        If the skill name has no entry in ``category_to_embeddings``.
    """
    converted = row.copy()
    converted['skill_name'] = category_to_embeddings[row['skill_name']]
    return converted

# Only the skill_name column changes, so map that column directly instead of
# running a Python-level function over every row of the frame.
data = data.assign(
    skill_name=data['skill_name'].map(lambda name: category_to_embeddings[name])
)

In [5]:
# Collapse the interaction log to one row per student: every remaining column
# becomes the list of that student's values, in their original row order.
# NOTE(review): the aggregation spec is a *set* holding one lambda; pandas
# presumably treats it like a one-element list — confirm, and consider a
# plain `list` aggregation.
interactions_by_user = data.groupby(['user_id'])
series = interactions_by_user.agg({lambda values: list(values)})

In [6]:
# Convert the per-student frame to a NumPy array so that each row can be
# indexed positionally by convert_series_to_data (row[0] and row[1];
# presumably the `correct` and `skill_name` lists, in column order — verify).
series_np = series.to_numpy()

In [7]:
def convert_series_to_data(series):
    """Turn one student's interaction history into fixed-length training windows.

    Parameters
    ----------
    series : sequence
        ``series[0]`` is the list of the student's answers and ``series[1]``
        the list of matching skill embeddings (one vector per interaction).

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)``. ``features`` has shape
        ``(n_windows, TIME_STEPS, 2 * EMBEDDINGS_DIM + 1)``; each time step is
        the current skill embedding, the next skill embedding and the current
        answer shifted by +1 (so that 0 can serve as the padding value).
        ``labels`` has shape ``(n_windows, 1)`` and holds, for each window,
        the un-shifted answer that follows the window's last time step.
    """
    # Skill embeddings and a copy advanced by one interaction ("next skill").
    skills = np.array(series[1])
    next_skills = np.roll(skills, -1, axis=0)
    # np.roll wraps around, so the last row would be the first skill; replace
    # it with the dedicated 'PADDING' embedding.
    next_skills[-1, :] = category_to_embeddings['PADDING']

    # Answers: features use {1, 2} (0 is reserved for padding), labels keep
    # the raw values. The roll is taken before the in-place +1.
    responses = np.array(series[0])
    next_responses = np.roll(responses, -1)
    # NOTE(review): unlike next_skills, the wrapped last element of
    # next_responses is NOT overwritten, so the label of the final window is
    # the student's *first* answer. This looks unintended — confirm.
    responses += 1

    # Left-pad histories shorter than TIME_STEPS. Rows only are padded; the
    # embedding axis is left alone.
    # NOTE(review): padded rows are filled with the scalar PADDING, not with
    # the 'PADDING' embedding used above — confirm this is deliberate.
    left_pad = max(TIME_STEPS - len(skills), 0)
    rows_only = ((left_pad, 0), (0, 0))
    padded_skills = np.pad(skills, rows_only, 'constant', constant_values=PADDING)
    padded_next_skills = np.pad(next_skills, rows_only, 'constant', constant_values=PADDING)

    padded_responses = np.pad(responses, (left_pad, 0), 'constant', constant_values=PADDING)
    response_column = padded_responses.reshape((len(padded_responses), 1))
    labels = np.pad(next_responses, (left_pad, 0), 'constant', constant_values=PADDING)

    # One feature row per interaction: [skill | next skill | answer].
    features = np.concatenate((padded_skills, padded_next_skills, response_column), axis=1)

    # Trim leading rows so the remaining length is covered by a whole number
    # of STEP_SHIFT strides.
    total_steps = len(padded_skills)
    surplus = (total_steps - TIME_STEPS) % STEP_SHIFT
    features = features[surplus:, :]
    labels = labels[surplus:]

    window_count = (total_steps - TIME_STEPS) // STEP_SHIFT + 1

    final_features = np.empty((window_count, TIME_STEPS, EMBEDDINGS_DIM * 2 + 1))
    final_labels = np.empty((window_count, 1))

    # Slide a TIME_STEPS-long window over the sequence; the label is taken at
    # the window's last position.
    for window, start in enumerate(range(0, window_count * STEP_SHIFT, STEP_SHIFT)):
        stop = start + TIME_STEPS
        final_features[window, :, :] = features[start:stop, :]
        final_labels[window, :] = labels[stop - 1]

    return final_features, final_labels

In [8]:
# Build the full data set: convert every student's history into windows and
# stack the results.
#
# The per-student arrays are collected in lists and concatenated once at the
# end; calling np.append inside the loop would re-copy everything gathered so
# far on every iteration. The loop variable is deliberately not called
# `series`, so the grouped DataFrame of that name is not overwritten.
samples = len(series_np)
inputs = None
outputs = None

input_chunks = []
output_chunks = []
for idx, student_series in enumerate(series_np):
    input_array, output_array = convert_series_to_data(student_series)
    input_chunks.append(input_array)
    # Labels are flattened so `outputs` is 1-D regardless of how many
    # students there are.
    output_chunks.append(output_array.ravel())
    if idx > 0 and idx % 1000 == 0:
        print(idx)

# With no students at all, `inputs` and `outputs` stay None.
if input_chunks:
    inputs = np.concatenate(input_chunks, axis=0)
    outputs = np.concatenate(output_chunks)

1000
2000
3000
4000


In [9]:
# Persist the assembled feature and label arrays under OUTPUT_DIR in NumPy's
# .npy format.
for filename, array in (('inputs.npy', inputs), ('outputs.npy', outputs)):
    with open(OUTPUT_DIR + filename, 'wb') as file:
        np.save(file, array)

In [12]:
# Sanity check: shape of the label array built above.
# NOTE(review): the execution count jumps from 9 to 12, so intermediate cells
# were run and removed — confirm the notebook survives Restart & Run All.
outputs.shape

(209102,)