# data_processor.ipynb

This notebook contains the data-processing code for the project: it splits the students into training, validation, and test sets, and extracts disjoint fixed-length subsequences from each student's interaction history.

In [39]:
# Importing the libraries and reading data stream source file

import pandas as pd
import numpy as np
# Load the raw transaction log (one row per answered question) into the notebook-wide `df`
transactions_csv = 'dataverse_files/2_DBE_KT22_datafiles_100102_csv/Transaction.csv'
df = pd.read_csv(transactions_csv)

# Report how many distinct students appear in the log
unique_student_count = df['student_id'].nunique()
print("Total number of unique students", str(unique_student_count))

1264


In [40]:
# Re-encode the boolean columns of the original data (true / false) as integers (1 / 0) for our own representation
bool_to_int = {True: 1, False: 0}
for column in ('answer_state', 'hint_used'):
    df[column] = df[column].map(bool_to_int)

# Display the first rows to check that the format is now correct
df.head()

Unnamed: 0,id,selection_change,start_time,end_time,difficulty_feedback,trust_feedback,answer_state,answer_text,student_id,hint_used,question_id,answer_choice_id,is_hidden
0,35,0,2019-08-07 17:12:08.722 -0700,2019-08-07 17:12:08.721 -0700,1,3,1,,5,0,36,121,False
1,38,0,2019-08-10 08:28:12.116 -0700,2019-08-10 08:28:12.116 -0700,3,1,0,,5,0,37,125,False
2,39,0,2019-08-10 08:33:03.479 -0700,2019-08-10 08:33:03.478 -0700,1,1,1,,5,0,2,7,False
3,40,0,2019-08-10 08:40:25.411 -0700,2019-08-10 08:40:25.411 -0700,0,2,1,,5,0,5,18,False
4,41,0,2019-08-10 08:51:39.062 -0700,2019-08-10 08:51:39.062 -0700,3,2,0,,5,0,3,11,False


In [41]:
# Load the per-question metadata (one row per question)
questions_csv = 'dataverse_files/2_DBE_KT22_datafiles_100102_csv/Questions.csv'
question_details_df = pd.read_csv(questions_csv)

# Display the first rows to check that it is loaded correctly
question_details_df.head()

Unnamed: 0,id,question_rich_text,question_title,explanation,hint_text,question_text,difficulty
0,219,"Consider two transactions <img src=""http://lat...",Q10-20,,,Consider two transactions and which are execut...,3
1,218,"Consider two transactions <img src=""http://lat...",Q10-19,,,Consider two transactions and which are execut...,3
2,217,Consider the following two transactions <img s...,Q10-18,,,"Consider the following two transactions and , ...",3
3,216,"Consider two transactions <img src=""http://lat...",Q10-17,,,"Consider two transactions and , and the follow...",3
4,214,"Consider the table Worker(name, payrate, hours...",Q10-16,,,"Consider the table Worker(name, payrate, hours...",2


In [4]:
# Collect every distinct student ID and shuffle them with a fixed seed so the split is reproducible
all_student_ids = df['student_id'].unique().tolist()
np.random.seed(42)
np.random.shuffle(all_student_ids)

# Split the students into train, validation and test sets: the last 15% of the shuffled IDs form the test set,
# and the first 85% are divided into train / validation with 3-fold cross validation.
# (In this project, we will not use folds other than the first fold, due to computational costs)
from sklearn.model_selection import KFold

holdout_start = int(0.85 * len(all_student_ids))
train_val_student_ids = all_student_ids[:holdout_start]
test_ids = all_student_ids[holdout_start:]

kf = KFold(n_splits=3)
train_val_array = np.array(train_val_student_ids)
fold_splits = list(kf.split(train_val_student_ids)) # one (train positions, val positions) pair per fold

# train_ids[k] / val_ids[k] hold the student IDs of fold k as numpy arrays
train_ids = [train_val_array[train_index] for train_index, _ in fold_splits]
val_ids = [train_val_array[val_index] for _, val_index in fold_splits]

In [42]:
# Unpack the per-fold ID arrays into plain Python lists, one train / val pair per fold
train_fold_1_ids, train_fold_2_ids, train_fold_3_ids = (list(train_ids[fold]) for fold in range(3))
val_fold_1_ids, val_fold_2_ids, val_fold_3_ids = (list(val_ids[fold]) for fold in range(3))

In [44]:
import json
from tqdm import tqdm

class DataEntry:
    """
    Class for saving the data entries (subsequences) extracted from the raw data.

    All attributes are set per instance in `__init__`; there are no class-level defaults, so no
    mutable list is ever shared between instances.

    Args:
        student_id: ID of the student
        question_ids: list of question IDs the student has answered
        answers: list of answers the student has given to the questions
        difficulties: list of difficulties of the questions (integer values from 1 to 3, set by the instructor)
        hint_used: list of whether the student has used a hint for the question (we did not use this information in our project, as we did not have enough computational budget)
    """

    def __init__(self, student_id, question_ids, answers, difficulties, hint_used):
        self.student_id = student_id
        self.question_ids = question_ids
        self.answers = answers
        self.difficulties = difficulties
        self.hint_used = hint_used

    def __str__(self): # for printing the object
        return f'student_id: {self.student_id}, question_ids: {self.question_ids}, answers: {self.answers}, difficulties: {self.difficulties}, hint_used: {self.hint_used}'

    def _comparison_key(self):
        """Return the fields that define equality; sequences are copied to lists so list or array inputs compare alike."""
        return (
            self.student_id,
            list(self.question_ids),
            list(self.answers),
            list(self.difficulties),
            list(self.hint_used),
        )

    def __eq__(self, other): # necessary to find duplicates in a list of DataEntry objects
        """
        Two entries are duplicates only if they belong to the same student and all four
        sequences match in full (same length and same values at every position).

        Returns:
            True / False for another DataEntry, NotImplemented for any other type (so `==`
            evaluates to False instead of raising).
        """
        if not isinstance(other, DataEntry):
            return NotImplemented
        # Comparing whole sequences (not just the first len(self.question_ids) positions) means a
        # shorter entry is never equal to a longer one that merely starts with the same values,
        # and comparing against a shorter entry cannot run past its end.
        # Two similar subsequences but with different student IDs are not considered duplicates.
        return self._comparison_key() == other._comparison_key()

    # Note: defining __eq__ without __hash__ leaves instances unhashable, which is appropriate
    # because the entries are mutable.

In [45]:
def get_question_difficulty(question_id):
    """
    Look up the difficulty of a question in the question metadata (`question_details_df`).

    Args:
        question_id: ID of the question

    Returns:
        difficulty: difficulty of the question (integer value from 1 to 3); if several metadata rows
        share the ID, the first one is used

    Raises:
        IndexError: if no metadata row has the given ID
    """

    id_matches = question_details_df['id'] == question_id # boolean mask over all question metadata rows
    matching_difficulties = question_details_df.loc[id_matches, 'difficulty'].tolist()
    return matching_difficulties[0]

In [46]:
def get_all_data_entries(source_ids, n):
    """
    Function to extract all disjoint subsequences of length n from the raw data.

    For every student in `source_ids`, the student's rows of `df` are sorted by `start_time` and cut
    into consecutive, non-overlapping windows of exactly n rows (a trailing partial window is dropped).
    Windows whose answers are all 0 or all 1 are skipped, and exact duplicates (same student and same
    question IDs, answers, difficulties and hint flags) are kept only once.

    Args:
        source_ids: list of student IDs to extract subsequences from
        n: length of each subsequence

    Returns:
        all_data_entries: list of all disjoint subsequences of length n from the raw data which are extracted from the students with the given IDs in the `source_ids` input
    """

    all_data_entries = []
    seen_keys = set() # fingerprints of the entries kept so far, so the duplicate check is O(1) instead of a scan of the whole list
    for student_id in source_ids:
        # Only keep the rows of the student with the given ID, sorted chronologically.
        # NOTE(review): `start_time` is sorted as loaded; `df` is read without date parsing, so this is
        # presumably a string sort - confirm it matches chronological order (e.g. with mixed UTC offsets).
        student_df = df[df['student_id'] == student_id].sort_values(by='start_time', ascending=True)
        student_df_len = len(student_df)
        # Jump n rows at a time to make sure the subsequences are disjoint. The upper bound admits every
        # start index i with i + n <= student_df_len, so the last complete window is kept even when the
        # number of rows is an exact multiple of n (previously that window was dropped).
        for i in range(0, student_df_len - n + 1, n):
            next_n_rows = student_df.iloc[i:i+n]
            question_ids = next_n_rows['question_id'].tolist()
            answers = next_n_rows['answer_state'].tolist()
            # If all of answers are 1 or 0, then skip this data entry
            if sum(answers) == 0 or sum(answers) == len(answers):
                continue
            difficulties = [get_question_difficulty(q_id) for q_id in question_ids]
            hint_used = next_n_rows['hint_used'].tolist()
            # Check if this has not been appended before
            entry_key = (student_id, tuple(question_ids), tuple(answers), tuple(difficulties), tuple(hint_used))
            if entry_key in seen_keys:
                continue
            seen_keys.add(entry_key)
            all_data_entries.append(DataEntry(student_id, question_ids, answers, difficulties, hint_used))
    return all_data_entries
    
# Subsequence length (window size)
N = 20 # change to 5, 10, and 20

# Student-ID groups, listed in the order in which they are processed
id_groups = [
    train_fold_1_ids,
    train_fold_2_ids,
    train_fold_3_ids,
    val_fold_1_ids,
    val_fold_2_ids,
    val_fold_3_ids,
    test_ids,
]

# Extract all disjoint subsequences from the raw data, advancing the progress bar once per group
extracted_groups = []
with tqdm(total=7) as pbar:
    for group_ids in id_groups:
        extracted_groups.append(get_all_data_entries(group_ids, N))
        pbar.update(1)

# Bind each result to its own name (same order as `id_groups`)
(
    all_train_fold1_entries,
    all_train_fold2_entries,
    all_train_fold3_entries,
    all_val_fold1_entries,
    all_val_fold2_entries,
    all_val_fold3_entries,
    all_test_data_entries,
) = extracted_groups

100%|██████████| 7/7 [00:25<00:00,  3.60s/it]


In [49]:
import json
import numpy as np
class NpEncoder(json.JSONEncoder): # to fix the "Object of type 'int64' is not JSON serializable" error: Source: https://stackoverflow.com/a/57915246
    """
    JSON encoder that converts NumPy values to their plain Python equivalents.

    Pass it as `cls=NpEncoder` to `json.dump` / `json.dumps`.
    """

    def default(self, obj):
        """
        Convert an object the standard encoder cannot serialize.

        Args:
            obj: the value that the standard JSON encoder rejected

        Returns:
            bool for NumPy booleans, int for NumPy integers, float for NumPy floats,
            and a (nested) list for NumPy arrays

        Raises:
            TypeError: (from the base class) for any other unsupported type
        """
        if isinstance(obj, np.bool_): # NumPy booleans are not np.integer, so they need their own case
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [50]:
# Save to JSON files: one file per split, each holding a list of entry dicts (the attributes of every DataEntry)
import os

# Create the output directory up front so a fresh checkout does not fail with FileNotFoundError
os.makedirs('data_outputs', exist_ok=True)

# (output path, entries) pairs; the window size N is part of every file name
output_files = [
    (f'data_outputs/train_fold_1_n_{N}.json', all_train_fold1_entries),
    (f'data_outputs/train_fold_2_n_{N}.json', all_train_fold2_entries),
    (f'data_outputs/train_fold_3_n_{N}.json', all_train_fold3_entries),
    (f'data_outputs/val_fold_1_n_{N}.json', all_val_fold1_entries),
    (f'data_outputs/val_fold_2_n_{N}.json', all_val_fold2_entries),
    (f'data_outputs/val_fold_3_n_{N}.json', all_val_fold3_entries),
    (f'data_outputs/test_n_{N}.json', all_test_data_entries),
]

for output_path, entries in output_files:
    with open(output_path, 'w') as f:
        # NpEncoder converts the NumPy values inside the entries to plain JSON types
        json.dump([entry.__dict__ for entry in entries], f, cls=NpEncoder)