In [19]:
import sys
import regex as re
import numpy as np
import io
import matplotlib.pyplot as plt
import pandas as pd
import math
import h5py
import time
import os
import datetime
import pickle
from scipy.stats import zscore

# --- Execution environment -------------------------------------------------
# LOCAL selects between a local checkout (paths relative to the notebook) and
# Google Colab (project mounted from Google Drive).
LOCAL = True
BASE_DIR = '../'

if not LOCAL:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    BASE_DIR = '/content/drive/My Drive/Colab Notebooks/thesis/'

# Make the project's helper and configuration modules importable.
sys.path.extend([BASE_DIR + 'lib', BASE_DIR + 'config'])
from preprocessing import process_sakt, save_h5, transpose_list, process_one_feature, select_from_rows, progressBar
import dataset_parameters as params

# Number of seconds in one day.
ONE_DAY = 24 * 60 * 60

# --- Dataset selection -----------------------------------------------------
# Other values used elsewhere in this notebook: 'akribian',
# 'assistments_2009', 'assistments_2012', 'junyi_academy'.
DATASET = 'ednet'

# Raw input and processed output locations for the selected dataset.
INPUT_DIR = f'{BASE_DIR}data/{DATASET}/raw/'
OUTPUT_DIR = f'{BASE_DIR}data/{DATASET}/processed/sakt/'
# Input file to read (alternative: 'rawdata.csv') and name of the CSV written
# at the end of the notebook.
IN_FILE_NAME = 'sorted.csv'
FILE_NAME = 'transformed.csv'

# --- Processing parameters -------------------------------------------------
# Per-dataset values are looked up in the project's dataset_parameters module.
# Most of these constants are not referenced by the cells shown here.
TIME_STEPS = params.time_steps_dict[DATASET]
STRIDE = params.stride_dict[DATASET]

VALIDATION_RATIO = 0.2
TEST_RATIO = 0.05
SPLIT_SECTIONS = 5

Z_SCORE_CUTOFF = 0.33

SHUFFLE = False

# When True, the next cell recomputes the category/id lookup tables and
# statistics from the raw data; when False it loads previously saved values.
CALCULATE_NUM_EXERCISES = False

## Load the data

In [20]:
# Columns to read from dataset. params.columns_dict gives them positionally:
# [0] group (user), [1] exercise/category, [2] correctness,
# [3] exercise id, [4] response time.
columns = params.columns_dict[DATASET]
group_column = columns[0]
exercise_columns = columns[1]
correctness_columns = columns[2]
exercise_id_columns = columns[3]
response_time_columns = columns[4]
encoding = params.encodings_dict[DATASET]

# Calculate how many tags and exercises we have
if CALCULATE_NUM_EXERCISES:
    cols = [exercise_columns, exercise_id_columns, response_time_columns]

    print("Calculating number of unique exercises for dataset...")
    start_time = time.time()
    print("Reading data...")
    data = pd.read_csv(INPUT_DIR + IN_FILE_NAME, encoding = encoding, usecols=cols)
    # usecols does not control column order, so reorder explicitly.
    data = data[cols]

    if DATASET == 'assistments_2009':
        # Destroy rows lacking skill name.
        # Assign the result back instead of calling replace(inplace=True) on a
        # column selection: the chained in-place form is deprecated and does
        # not update the frame under pandas Copy-on-Write.
        data["skill_name"] = data["skill_name"].replace('', np.nan)
        data.dropna(subset=['skill_name'], inplace=True)

    if DATASET == 'assistments_2012':
        # Destroy rows lacking skill name (same reasoning as above).
        data["skill"] = data["skill"].replace('', np.nan)
        data.dropna(subset=['skill'], inplace=True)

    if DATASET == 'ednet':
        # Filter the dataset so we only use the rows with content type 0 (question)
        data = data[data['content_type_id'] == 0]
        data.dropna(subset=[response_time_columns], inplace=True)
        metadata = pd.read_csv(INPUT_DIR + "questions.csv", usecols=['question_id', 'part'])
        metadata.rename(columns={"question_id": "content_id"} ,inplace=True)
        # DataFrame.join matches `on` against the *index* of the other frame.
        # Index the metadata by question id so the lookup is by id rather than
        # by row position in questions.csv.
        data = data.join(metadata.set_index('content_id'), on='content_id', how='left', rsuffix='_dupe')
        data = data[['part', exercise_id_columns, response_time_columns]]

    if DATASET == 'junyi_academy':
        # Attach each exercise's level2_id from the content metadata
        metadata = pd.read_csv(INPUT_DIR + "Info_Content.csv", encoding = encoding, usecols=['ucid', 'level2_id'])
        data = data.merge(metadata, on='ucid', how='left', suffixes=('', '_dupe'))
        data.dropna(subset=['ucid'], inplace=True)
        data = data[['level2_id', exercise_id_columns, response_time_columns]]

    print("Data read in %.2f seconds" % (time.time() - start_time))
    print("Convering to numpy...")
    series_np = data.to_numpy()
    # After transposing: [0] category values, [1] exercise ids, [2] response times.
    series_transposed = transpose_list(series_np)
    series_np = series_transposed[0]
    series_np_id = series_transposed[1]
    series_times = series_transposed[2]

    # Get categories
    print("Creating set...")
    number_of_rows = len(series_np)
    # 'PADDING' is placed first so it always receives index 0.
    # NOTE(review): the order of the remaining entries follows set iteration
    # order, so the index assignment is only stable through the saved pickles.
    unique_categories = ['PADDING'] + list(set(series_np))
    unique_ids = ['PADDING'] + list(set(series_np_id))
    number_of_exercises = len(unique_categories)
    number_of_ids = len(unique_ids)
    time_scale = np.percentile(series_times, 75)
    time_mean = np.mean(series_times)
    time_std = np.std(series_times)
    print("Number of exercise tags: %i" % number_of_exercises)
    if (DATASET == 'ednet'):
        print("Max exercise id: ", np.max(data['content_id'][:]))
    print("Number of exercise ids: %i" % number_of_ids)
    print("Number of rows: %i" % number_of_rows)
    print("Average elapsed time: %.2f" % time_mean)
    print("Std.dev elapsed time: %.2f" % time_std)
    print("75th percentile elapsed time: %.2f" % time_scale)

    # Release the large intermediates before building the lookup tables.
    print("Deleting data")
    del data
    del series_np
    print("Deleted!")

    print("Creating category dictionaries...")

    # Forward (value -> index) and reverse (index -> value) lookup tables.
    category_to_index = {cat:idx for (idx, cat) in enumerate(unique_categories)}
    index_to_category = {v:k for (k,v) in category_to_index.items()}
    id_to_index = {cat:idx for (idx, cat) in enumerate(unique_ids)}
    index_to_id = {v:k for (k,v) in id_to_index.items()}

    # Key the statistics by name. Using the variables themselves as keys would
    # store value -> value pairs that can collide and cannot be looked up by
    # name after loading the pickle.
    save_data = {
        'number_of_exercises': number_of_exercises,
        'number_of_ids': number_of_ids,
        'time_scale': time_scale,
        'time_mean': time_mean,
        'time_std': time_std,
    }

    with open(OUTPUT_DIR + "category_to_idx.pkl", "wb") as f:
        pickle.dump(category_to_index, f)
    with open(OUTPUT_DIR + "id_to_idx.pkl", "wb") as f:
        pickle.dump(id_to_index, f)
    with open(OUTPUT_DIR + "save_data.pkl", "wb") as f:
        pickle.dump(save_data, f)


    print("Done!")

# Otherwise just load the values from the files
else:
    # Row count is not computed on this path; -1 marks it as unknown.
    number_of_rows = -1
    number_of_exercises = params.exercise_dict[DATASET]
    number_of_ids = params.exercise_id_dict[DATASET]
    time_scale = params.time_scale_dict[DATASET]
    # time_mean / time_std are not defined on this path.
    # These pickles are the ones written by the branch above; only load files
    # produced by this notebook (pickle.load can execute arbitrary code).
    with open(OUTPUT_DIR + "category_to_idx.pkl", "rb") as f:
        category_to_index = pickle.load(f)
    with open(OUTPUT_DIR + "id_to_idx.pkl", "rb") as f:
        id_to_index = pickle.load(f)

In [21]:
# Unix epoch as a naive datetime; the row processors below subtract it from
# parsed timestamps to obtain whole seconds since 1970-01-01.
epoch = datetime.datetime(1970,1,1)
def pr_akribian(row):
    """Convert one raw 'akribian' row to numeric fields, in place.

    - 'LearningSequenceTitle' / 'ExerciseTitle' are replaced by their integer
      indices from ``category_to_index`` / ``id_to_index``.
    - 'SubmissionOutcomes' is reduced to its first character, capped at 1
      (so a timeout counts as wrong) and inverted: 1 = right, 0 = wrong.
    - 'FinishedOn' (e.g. "11/23/2020 09:40:14 +00:00") becomes integer seconds
      since ``epoch``; the trailing 7-character offset is cut off, not applied.

    Returns the same ``row`` object that was passed in.
    """
    for column, lookup in (('LearningSequenceTitle', category_to_index),
                           ('ExerciseTitle', id_to_index)):
        row[column] = lookup[row[column]]

    # Only the first outcome is used; anything above 1 is treated as 1.
    first_outcome = int(row['SubmissionOutcomes'][0])
    row['SubmissionOutcomes'] = 1 - min(first_outcome, 1)

    finished_at = pd.to_datetime(row['FinishedOn'][:-7], format='%m/%d/%Y %H:%M:%S')
    row['FinishedOn'] = int((finished_at - epoch).total_seconds())
    return row

def pr_assistments_2009(row):
    """Replace 'skill_name' and 'problem_id' with their integer indices.

    The lookups use ``category_to_index`` and ``id_to_index`` respectively.
    ``row`` is modified in place and also returned.
    """
    for column, lookup in (('skill_name', category_to_index),
                           ('problem_id', id_to_index)):
        row[column] = lookup[row[column]]
    return row

def pr_assistments_2012(row):
    """Convert one raw 'assistments_2012' row to numeric fields, in place.

    'skill' and 'problem_id' are replaced by their indices from
    ``category_to_index`` / ``id_to_index``; 'end_time'
    (e.g. "2012-10-22 18:44:03.013") becomes integer seconds since ``epoch``,
    with the fractional part truncated. Returns the same ``row`` object.
    """
    for column, lookup in (('skill', category_to_index),
                           ('problem_id', id_to_index)):
        row[column] = lookup[row[column]]

    ended_at = pd.to_datetime(row['end_time'], format='%Y-%m-%d %H:%M:%S.%f')
    row['end_time'] = int((ended_at - epoch).total_seconds())
    return row

def pr_junyi(row):
    """Convert one raw 'junyi_academy' row to numeric fields, in place.

    'ucid' is replaced by its index from ``id_to_index`` and 'level2_id' by
    its index from ``category_to_index``. 'timestamp_TW'
    (e.g. "2019-05-17 16:30:00 UTC") becomes integer seconds since ``epoch``;
    the trailing 4-character " UTC" suffix is cut off before parsing.
    Returns the same ``row`` object.
    """
    for column, lookup in (('ucid', id_to_index),
                           ('level2_id', category_to_index)):
        row[column] = lookup[row[column]]

    stamped_at = pd.to_datetime(row['timestamp_TW'][:-4], format='%Y-%m-%d %H:%M:%S')
    row['timestamp_TW'] = int((stamped_at - epoch).total_seconds())
    return row

# Dataset name -> row-wise processor. Datasets without an entry here
# (e.g. 'ednet') get no row-wise transformation.
process_row_dict = dict(
    akribian=pr_akribian,
    assistments_2009=pr_assistments_2009,
    assistments_2012=pr_assistments_2012,
    junyi_academy=pr_junyi,
)

In [22]:
# Sanity check: show the category -> index lookup table built or loaded above.
print(category_to_index)

{'PADDING': 0, 1.0: 1, 2.0: 2, 3.0: 3, 4.0: 4, 5.0: 5, 6.0: 6, 7.0: 7}


In [23]:
# Read rows
print("Reading data...")

start_time = time.time()
data = pd.read_csv(INPUT_DIR + IN_FILE_NAME, encoding = encoding, usecols=columns)
print("Data read in %.2f seconds" % (time.time() - start_time))

# usecols does not control column order, so reorder to match `columns`;
# later cells address columns by position.
data = data[columns]

sys.stdout.write("Processing data... ")

# Filter bad data from dataset (depends entirely on the dataset used)
if DATASET == 'ednet':
  # Filter the dataset so we only use the rows with content type 0 (question)
  data = data[data['content_type_id'] == 0]
  data.dropna(subset=[response_time_columns], inplace=True)
  metadata = pd.read_csv(INPUT_DIR + "questions.csv", encoding = encoding, usecols=['question_id', 'part'])
  metadata.rename(columns={"question_id": "content_id"}, inplace=True)
  # DataFrame.join matches `on` against the *index* of the other frame.
  # Index the metadata by question id so the lookup is by id rather than by
  # row position in questions.csv. join keeps the original row index of `data`.
  data = data.join(metadata.set_index('content_id'), on='content_id', how='left', rsuffix='_dupe')
  data.dropna(subset=['content_id'], inplace=True)
  data = data[['user_id', 'part' ,'answered_correctly', 'content_id', 'prior_question_elapsed_time', 'timestamp']]
  # Divide the timestamp by 1000 (presumably ms -> s; confirm against the raw data).
  data[['timestamp']] = data[['timestamp']] / 1000.
  # Shift ids and parts up by one (presumably to keep 0 free for padding; confirm).
  data[['content_id']] = data[['content_id']] + 1
  data[['part']] = data[['part']] + 1

if DATASET == 'junyi_academy':
  # Attach each exercise's level2_id from the content metadata
  metadata = pd.read_csv(INPUT_DIR + "Info_Content.csv", encoding = encoding, usecols=['ucid', 'level2_id'])
  data = data.merge(metadata, on='ucid', how='left', suffixes=('', '_dupe'))
  data.dropna(subset=['ucid'], inplace=True)
  data = data[['uuid', 'level2_id' ,'is_correct', 'ucid', 'total_sec_taken', 'timestamp_TW']]

if DATASET == 'assistments_2009':
  # Destroy rows lacking skill name.
  # Assign the result back instead of calling replace(inplace=True) on a
  # column selection: the chained in-place form is deprecated and does not
  # update the frame under pandas Copy-on-Write.
  orig_len = len(data)
  data["skill_name"] = data["skill_name"].replace('', np.nan)
  data.dropna(subset=['skill_name'], inplace=True)
  print("Filtered %i rows" % (orig_len - len(data)))

if DATASET == 'assistments_2012':
  # Destroy rows lacking skill name (same reasoning as above).
  orig_len = len(data)
  data["skill"] = data["skill"].replace('', np.nan)
  data.dropna(subset=['skill'], inplace=True)
  print("Filtered %i rows" % (orig_len - len(data)))


print("Data processed.")

# Apply row-wise transformation, if any is defined, otherwise don't
if DATASET in process_row_dict:
  process_row = process_row_dict[DATASET]
  print("Applying row-wise transformation...")
  data = data.apply(process_row, axis=1)

print("Done.")

Reading data...
Data read in 183.94 seconds
Processing data... Data processed.
Done.


In [24]:
# Positional layout at this point: column 4 is elapsed time, column 5 is timestamp.
# NOTE: this cell is not idempotent; re-running it divides the column again.
print("Applying final adjustments...")

if DATASET in ("assistments_2012", "ednet"):
    # Divide elapsed time by 1000 (presumably ms -> s; confirm against the raw data).
    elapsed_scaled = data.iloc[:, 4] / 1000.
    data.iloc[:, 4] = elapsed_scaled

print("Done.")

Applying final adjustments...
Done.


In [25]:
# Give the six positional columns dataset-independent names.
standard_names = ['user_id', 'category', 'correctness', 'exercise_id', 'elapsed_time', 'timestamp']
mapping = {data.columns[position]: name for position, name in enumerate(standard_names)}

data = data.rename(columns = mapping)

# Per-exercise statistics, broadcast back onto every row of that exercise.
# NaN results (e.g. a z-score for a group with zero spread) are replaced by 0.
print("Calculating z-score for elapsed time...")
elapsed_zscore = data.groupby(['exercise_id'])['elapsed_time'].transform(lambda values: zscore(values))
data['elapsed_zscore'] = elapsed_zscore.fillna(0)

print("Calculating mean for elapsed time...")
elapsed_mean = data.groupby(['exercise_id'])['elapsed_time'].transform(lambda values: np.mean(values))
data['elapsed_mean'] = elapsed_mean.fillna(0)

print("Calculating mean for correctness...")
correctness_mean = data.groupby(['exercise_id'])['correctness'].transform(lambda values: np.mean(values))
data['correctness_mean'] = correctness_mean.fillna(0)

print("Dataset processing done!")

Calculating z-score for elapsed time...
  return (a - mns) / sstd
Calculating mean for elapsed time...
Calculating mean for correctness...
Dataset processing done!


In [26]:
# Display the processed frame (pandas truncates the rendering for large frames).
data

Unnamed: 0,user_id,category,correctness,exercise_id,elapsed_time,timestamp,elapsed_zscore,elapsed_mean,correctness_mean
1,115,6,1,5717,37.0,56.943,0.836764,21.724882,0.735190
2,115,2,1,129,55.0,118.363,1.874930,23.471403,0.984092
3,115,2,1,7861,19.0,131.167,-0.045952,19.462747,0.954813
4,115,2,1,7923,11.0,137.965,-1.208912,20.110520,0.953215
5,115,2,1,157,5.0,157.063,-1.207056,18.810512,0.931887
...,...,...,...,...,...,...,...,...,...
101230327,2147482888,6,1,3587,18.0,428564.420,-0.306540,24.781216,0.741063
101230328,2147482888,6,1,6342,14.0,428585.000,-0.510085,24.827936,0.528547
101230329,2147482888,6,1,4213,14.0,428613.475,-0.534798,25.972512,0.616541
101230330,2147482888,6,0,6344,22.0,428649.406,-0.127090,24.741762,0.665904


In [27]:
# Persist the transformed frame (row index included) to INPUT_DIR/FILE_NAME
# and report how long the write took.
start_time = time.time()
print("Writing data to csv...")
transformed_path = INPUT_DIR + FILE_NAME
data.to_csv(transformed_path)
write_seconds = time.time() - start_time
print("Data written in %.2f seconds" % write_seconds)

Writing data to csv...
Data written in 1168.41 seconds
