<a href="https://colab.research.google.com/github/rithikkulkarni/simstudent-data-preprocessing/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [43]:
import pandas as pd

file_path = 'https://github.com/rithikkulkarni/simstudent-data-preprocessing/raw/main/datasets/Algebra2017May(YesPrep)-APLUS-SUBSET.xlsx'

df = pd.read_excel(file_path, sheet_name="Algebra2017May(YesPrep)-APLUS", engine="openpyxl")
print(df.shape)

KeyboardInterrupt: 

In [44]:
### Loading the already filtered dataset from github
file_path = 'https://github.com/rithikkulkarni/simstudent-data-preprocessing/raw/main/datasets/SimStudent_StudyVII_May2017_Preprocessed_Subset.xlsx'

df = pd.read_excel(file_path)
print(df.shape)

(140943, 19)


In [45]:
unique_students = df['Anon Student Id'].nunique()
print(f'# of Unique Students: {unique_students}')

KeyError: 'Anon Student Id'

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140943 entries, 0 to 140942
Data columns (total 62 columns):
 #   Column                       Non-Null Count   Dtype         
---  ------                       --------------   -----         
 0   Row                          140943 non-null  int64         
 1   Sample Name                  140943 non-null  object        
 2   Transaction Id               140943 non-null  object        
 3   Anon Student Id              140943 non-null  object        
 4   Session Id                   140943 non-null  object        
 5   Time                         140943 non-null  datetime64[ns]
 6   Time Zone                    140943 non-null  object        
 7   Duration (sec)               140943 non-null  object        
 8   Student Response Type        140943 non-null  object        
 9   Student Response Subtype     63907 non-null   object        
 10  Tutor Response Type          136474 non-null  object        
 11  Tutor Response Subtype    

In [11]:
### REMOVAL OF UNWANTED COLUMNS (RUN ONLY ONCE)
df = df.drop([
  'Feedback Classification',
  'Total Num Hints',
  'Condition Type',
  'KC Category (Single-KC)',
  'KC Category (Unique-step)',
  'Unnamed: 60',
  'Unnamed: 61',
  'Row',
  'Sample Name',
  'Transaction Id',
  'School',
  'Time Zone',
  'Time',
  'Tutor Response Type',
  'Tutor Response Subtype',
  'Level (Unit)',
  'Problem Name',
  'Step Name',
  'Selection',
  'Action',
  'KC (Problem)',
  'KC (Step)',
  'KC Category (Step)',
  'KC (ActualSkill)',
  'KC Category (ActualSkill)',
  'KC (Default)',
  'KC Category (Default)',
  'KC (ProblemSubmit)',
  'KC Category (ProblemSubmit)',
  'KC (Single-KC)',
  'KC (Unique-step)',
  'Class',
  'CF (HINT_FOLLOWED)',
  'CF (HINT_SUBJECT)',
  'CF (HINT_TYPE)',
  'CF (INFO)',
  'CF (RESULT)',
  'CF (USERID)',
  'CF (STUDENT_LOG_COUNT)',
  'CF (date)',
  'Participation Day Key',
  'Student Response Subtype',
  'Problem Start Time'
], axis=1)

In [12]:
# Renaming columns in preparation for dataset export
df = df.rename(columns={
  'Anon Student Id': 'student_id',
  'Session Id': 'session_id',
  'Duration (sec)': 'duration_sec',
  'Student Response Type': 'student_response_type',
  'Problem View': 'problem_view',
  'Attempt At Step': 'attempt_at_step',
  'Is Last Attempt': 'is_last_attempt',
  'Outcome': 'outcome',
  'Input': 'input',
  'Feedback Text': 'feedback_text',
  'Help Level': 'help_level',
  'Condition Name': 'condition_name',
  'KC Category (Problem)': 'problem_category',
  'CF (ACTION)': 'action_explanation',
  'CF (ACTION_TYPE)': 'action',
  'CF (DURATION)': 'duration',
  'CF (STATUS)': 'status',
  'Participation Day': 'participation_day',
  'CF (tool_event_time)': 'tool_event_time'
})

In [13]:
# Get a visual of the amount of unique values in each column

for col in df.columns:
  print(f'{col}: {df[col].nunique()}')

student_id: 266
session_id: 856
duration_sec: 1100
student_response_type: 2
problem_view: 25
attempt_at_step: 385
is_last_attempt: 2
outcome: 4
input: 1944
feedback_text: 8253
help_level: 4
condition_name: 2
problem_category: 8
action_explanation: 58
action: 12
duration: 2242
status: 3
participation_day: 4
tool_event_time: 140041


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140943 entries, 0 to 140942
Data columns (total 19 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   student_id             140943 non-null  object 
 1   session_id             140943 non-null  object 
 2   duration_sec           140943 non-null  object 
 3   student_response_type  140943 non-null  object 
 4   problem_view           140943 non-null  int64  
 5   attempt_at_step        136474 non-null  float64
 6   is_last_attempt        136474 non-null  float64
 7   outcome                136474 non-null  object 
 8   input                  68954 non-null   object 
 9   feedback_text          20420 non-null   object 
 10  help_level             5806 non-null    float64
 11  condition_name         140943 non-null  object 
 12  problem_category       86862 non-null   object 
 13  action_explanation     138286 non-null  object 
 14  action                 138286 non-nu

In [15]:
df.to_excel('SimStudent_StudyVII_May2017_Preprocessed_Subset.xlsx', index=False)

# Filtering done. Now for sorting and visualizing

In [46]:
### Convert UTC times to EST

import pytz
from datetime import datetime

utc = pytz.UTC
eastern = pytz.timezone("US/Eastern")

def convert_time(value):
    try:
        if pd.isna(value):  # check for NaT or None
            return pd.NaT

        value_str = str(value)

        if "UTC" in value_str:
            dt = datetime.strptime(value_str.replace(" UTC", ""), "%Y-%m-%d %H:%M:%S.%f")
            dt = utc.localize(dt).astimezone(eastern)
        else:
            dt = datetime.strptime(value_str, "%Y-%m-%d-%H:%M:%S.%f")
            dt = eastern.localize(dt)

        return dt

    except Exception as e:
        print(f"Failed to parse: {value} → {e}")
        return pd.NaT


df['tool_event_time'] = df['tool_event_time'].apply(convert_time)
df['tool_event_time_str'] = df['tool_event_time'].dt.strftime('%Y-%m-%d %H:%M:%S.%f')

### Create sorted dataframe
df.sort_values(by=['student_id','tool_event_time'], inplace=True)


In [47]:
df.head(50)

Unnamed: 0,student_id,session_id,duration_sec,student_response_type,problem_view,attempt_at_step,is_last_attempt,outcome,input,feedback_text,help_level,condition_name,problem_category,action_explanation,action,duration,status,participation_day,tool_event_time,tool_event_time_str
101,Stu_00c8134cc1df496e45ccb744097b7896,L53ff777e-cdd2-4ffa-a23f-1b04311ff0a4,6.5,HINT_REQUEST,1,1.0,0.0,UNGRADED,,,,MetaTutorMC,OneStep,New Problem Entered,SIM_STUDENT_PROBLEM,,Tutoring,1.0,2017-05-18 12:46:43.216000-04:00,2017-05-18 12:46:43.216000
1807,Stu_00c8134cc1df496e45ccb744097b7896,L53ff777e-cdd2-4ffa-a23f-1b04311ff0a4,2.333,HINT_REQUEST,1,21.0,1.0,UNGRADED,'-1,"It's a good strategy to quiz ron, because it w...",,MetaTutorMC,,Metatutor Hint Given,SIM_STUDENT_METATUTOR,,Tutoring,1.0,2017-05-18 12:48:19.028000-04:00,2017-05-18 12:48:19.028000
1850,Stu_00c8134cc1df496e45ccb744097b7896,L53ff777e-cdd2-4ffa-a23f-1b04311ff0a4,0.333,HINT_REQUEST,1,9.0,0.0,UNGRADED,'-1,"It's a good strategy to quiz ron, because it w...",,MetaTutorMC,,Metatutor Hint Given,SIM_STUDENT_METATUTOR,,Tutoring,1.0,2017-05-18 12:49:15.338000-04:00,2017-05-18 12:49:15.338000
1873,Stu_00c8134cc1df496e45ccb744097b7896,L53ff777e-cdd2-4ffa-a23f-1b04311ff0a4,0.667,HINT_REQUEST,1,11.0,0.0,UNGRADED,'3v+2,I see that ron failed all quiz items. ron can ...,,MetaTutorMC,,Metatutor Hint Given,SIM_STUDENT_METATUTOR,,Quiz,1.0,2017-05-18 12:50:10.578000-04:00,2017-05-18 12:50:10.578000
1964,Stu_00c8134cc1df496e45ccb744097b7896,L53ff777e-cdd2-4ffa-a23f-1b04311ff0a4,0.333,HINT_REQUEST,1,1.0,0.0,UNGRADED,,,,MetaTutorMC,TwoStep,New Problem Entered,SIM_STUDENT_PROBLEM,,Tutoring,1.0,2017-05-18 12:51:36.452000-04:00,2017-05-18 12:51:36.452000
1987,Stu_00c8134cc1df496e45ccb744097b7896,L53ff777e-cdd2-4ffa-a23f-1b04311ff0a4,1,HINT_REQUEST,1,51.0,1.0,UNGRADED,'3v+2,ron failed all quiz items. ron can do better w...,,MetaTutorMC,,Metatutor Hint Given,SIM_STUDENT_METATUTOR,,Tutoring,1.0,2017-05-18 12:59:24.810000-04:00,2017-05-18 12:59:24.810000
2023,Stu_00c8134cc1df496e45ccb744097b7896,L53ff777e-cdd2-4ffa-a23f-1b04311ff0a4,0.2,HINT_REQUEST,1,50.0,0.0,UNGRADED,'3v+2,I see that ron failed all quiz items. ron can ...,,MetaTutorMC,TwoStep,Metatutor Hint Given,SIM_STUDENT_METATUTOR,,Tutoring,1.0,2017-05-18 12:59:33.820000-04:00,2017-05-18 12:59:33.820000
2090,Stu_00c8134cc1df496e45ccb744097b7896,L53ff777e-cdd2-4ffa-a23f-1b04311ff0a4,0.333,HINT_REQUEST,1,1.0,0.0,UNGRADED,,,,MetaTutorMC,OneStep,New Problem Entered,SIM_STUDENT_PROBLEM,,Tutoring,1.0,2017-05-18 13:01:39.392000-04:00,2017-05-18 13:01:39.392000
2115,Stu_00c8134cc1df496e45ccb744097b7896,L53ff777e-cdd2-4ffa-a23f-1b04311ff0a4,0.333,HINT_REQUEST,1,30.0,0.0,UNGRADED,'3v+2,I see that ron failed all quiz items. ron can ...,,MetaTutorMC,OneStep,Metatutor Hint Given,SIM_STUDENT_METATUTOR,,Tutoring,1.0,2017-05-18 13:05:31.247000-04:00,2017-05-18 13:05:31.247000
2133,Stu_00c8134cc1df496e45ccb744097b7896,L53ff777e-cdd2-4ffa-a23f-1b04311ff0a4,0.333,HINT_REQUEST,1,7.0,0.0,UNGRADED,,,,MetaTutorMC,OneStep,New Problem Entered,SIM_STUDENT_PROBLEM,,Tutoring,1.0,2017-05-18 13:06:58.955000-04:00,2017-05-18 13:06:58.955000
