# Exclude Outlier Users

This notebook filters out users who spent too little or too much time on a specified course. Outliers are identified with the interquartile range (IQR) of the per-user course duration.

# 1. Initialize paths and file names

In [1]:
# Folder holding the course's event-log exports; every *.csv file in it is loaded.
EVENT_DATA_FOLDER_PATH = 'course_data/EE101J/csv_event'
# Destination JSON file for the usernames flagged as outliers.
OUTLIERS_FILE_PATH = 'course_data/EE101J/outliers.json'

# 2. Load event data

In [2]:
import os
import pandas as pd


# Collect every CSV export in the event-data folder. os.listdir() returns
# entries in arbitrary order, so sort the names to keep the concatenated row
# order reproducible across machines and re-runs.
event_data_files = sorted(f for f in os.listdir(EVENT_DATA_FOLDER_PATH) if f.endswith('.csv'))
if not event_data_files:
    # pd.concat() on an empty list raises an opaque "No objects to concatenate";
    # fail with the same exception type but say which folder was empty.
    raise ValueError(f"No .csv event files found in {EVENT_DATA_FOLDER_PATH!r}")

event_data_file_paths = [os.path.join(EVENT_DATA_FOLDER_PATH, f) for f in event_data_files]
# Stack all files into one frame with a fresh 0..n-1 index.
event_data_df = pd.concat([pd.read_csv(file) for file in event_data_file_paths], ignore_index=True)

print(event_data_df.head(5))

  event_data_df = pd.concat([pd.read_csv(file) for file in event_data_file_paths], ignore_index=True)


   Unnamed: 0                                  username  \
0       35873  0005de4615ec64cda7e99baee116cabe660793f1   
1       35872  0005de4615ec64cda7e99baee116cabe660793f1   
2       35871  0005de4615ec64cda7e99baee116cabe660793f1   
3       35870  0005de4615ec64cda7e99baee116cabe660793f1   
4       35869  0005de4615ec64cda7e99baee116cabe660793f1   

                                          event_type  \
0                    edx.course.enrollment.activated   
1  /api/courseware/course/course-v1:TokyoTechX+EE...   
2  /courses/course-v1:TokyoTechX+EE101Jx+1T2021/x...   
3  /api/discussion/v1/courses/course-v1:TokyoTech...   
4  /courses/course-v1:TokyoTechX+EE101Jx+1T2021/x...   

                               time event_source  \
0  2023-07-22T14:38:42.341829+00:00       server   
1  2023-07-22T14:38:59.729191+00:00       server   
2  2023-07-22T14:39:00.592172+00:00       server   
3  2023-07-22T14:39:01.014682+00:00       server   
4  2023-07-22T14:39:02.475764+00:00       server

### Calculate course duration

In [3]:
# Parse the raw timestamp strings into timezone-aware UTC datetimes;
# format='mixed' lets pandas infer the layout of each value individually.
parsed_time = pd.to_datetime(event_data_df['time'], format='mixed', utc=True)
event_data_df['time'] = parsed_time

# First and last recorded event per user. The gap between the two is used as
# the user's course duration (elapsed wall-clock span, not active time).
user_course_duration = (
    event_data_df
    .groupby(['username'])['time']
    .agg(['min', 'max'])
    .assign(course_duration=lambda span: span['max'] - span['min'])
)

user_course_duration

Unnamed: 0_level_0,min,max,course_duration
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0005de4615ec64cda7e99baee116cabe660793f1,2023-07-22 14:38:42.341829+00:00,2023-07-22 14:48:11.464167+00:00,0 days 00:09:29.122338
000872107dd25789c69483a852b5e7e1b9585ccf,2022-05-31 06:51:46.751846+00:00,2022-05-31 14:25:21.134841+00:00,0 days 07:33:34.382995
0009315b4e3b01daecd17d82a74ed875a551b2b4,2022-11-09 00:24:34.445086+00:00,2022-11-09 00:24:34.445086+00:00,0 days 00:00:00
000ac6408d4ecbf008bdfb846cf43ea51d1a2f78,2022-04-10 00:02:02.188000+00:00,2022-09-06 00:42:29.811270+00:00,149 days 00:40:27.623270
000d139975b412c58ea9308aec381e290138f7e2,2023-02-12 18:01:04.516226+00:00,2023-02-12 18:02:25.990000+00:00,0 days 00:01:21.473774
...,...,...,...
ffe9b9e01e6ff3da21ae812fde35be593854e640,2022-01-22 19:56:04.365547+00:00,2022-01-22 19:59:45.782703+00:00,0 days 00:03:41.417156
ffe9d80f9491288361c480ef9feab465fc578e77,2022-06-23 09:31:38.692404+00:00,2022-06-26 21:30:23.202685+00:00,3 days 11:58:44.510281
fff2f7b2afa72cff8222120dd8cd07eab7528a75,2022-09-01 13:27:06.604277+00:00,2022-11-30 14:30:29.391242+00:00,90 days 01:03:22.786965
fffa7f234e502b8bbc968b4dac2bf950cd352f94,2023-05-25 06:51:43.397516+00:00,2023-05-26 04:57:02.513000+00:00,0 days 22:05:19.115484


In [4]:
# One row per user: pull the duration column and turn the `username` index
# back into an ordinary column.
result = user_course_duration['course_duration'].reset_index()
result

Unnamed: 0,username,course_duration
0,0005de4615ec64cda7e99baee116cabe660793f1,0 days 00:09:29.122338
1,000872107dd25789c69483a852b5e7e1b9585ccf,0 days 07:33:34.382995
2,0009315b4e3b01daecd17d82a74ed875a551b2b4,0 days 00:00:00
3,000ac6408d4ecbf008bdfb846cf43ea51d1a2f78,149 days 00:40:27.623270
4,000d139975b412c58ea9308aec381e290138f7e2,0 days 00:01:21.473774
...,...,...
9116,ffe9b9e01e6ff3da21ae812fde35be593854e640,0 days 00:03:41.417156
9117,ffe9d80f9491288361c480ef9feab465fc578e77,3 days 11:58:44.510281
9118,fff2f7b2afa72cff8222120dd8cd07eab7528a75,90 days 01:03:22.786965
9119,fffa7f234e502b8bbc968b4dac2bf950cd352f94,0 days 22:05:19.115484


In [5]:
# Number of users before any outlier filtering (one row per user)
result.shape[0]

9121

### Filter out outliers using IQR

In [6]:
# Quartiles of the per-user course duration and the spread between them
durations = result['course_duration']
Q1, Q3 = durations.quantile(0.25), durations.quantile(0.75)
IQR = Q3 - Q1
print(f"Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")

# Fences placed 1.5 * IQR below Q1 and above Q3
whisker = 1.5 * IQR
lower_bound = Q1 - whisker
upper_bound = Q3 + whisker
# NOTE(review): durations are computed as max - min per user, so they are never
# negative. Whenever Q1 < 1.5 * IQR the lower fence is negative and the `<`
# test below cannot match, i.e. only "too long" users are flagged - confirm
# this is the intended handling of users who spent too little time.

# Flag users whose duration falls outside [lower_bound, upper_bound]
is_too_short = durations.lt(lower_bound)
is_too_long = durations.gt(upper_bound)
outliers = result[is_too_short | is_too_long]


Q1: 0 days 00:01:22.994014, Q3: 7 days 08:08:25.978253, IQR: 7 days 08:07:02.984239


In [7]:
# How many users were flagged as outliers
outliers.shape[0]

1628

In [8]:
# Preview the first five flagged users
outliers.iloc[:5]

Unnamed: 0,username,course_duration
3,000ac6408d4ecbf008bdfb846cf43ea51d1a2f78,149 days 00:40:27.623270
5,0022633f297898efdbe3b8b2fb58747dc3a6f42c,390 days 18:23:18.299518
8,0038cb908eeed43f9823b37ad54ca6db5cd80c57,48 days 14:40:36.807203
22,007a990e316d8d28bfac1c93a9a0be56c19ea4a3,703 days 13:33:15.387813
28,008b9e1039af087d106d7cf6ac24acddfeb1f6c2,159 days 10:58:12.935101


# 3. Save list of outlier users

In [9]:
# Persist the flagged usernames as a plain JSON array of strings.
# 'records' is the documented Series orient for a bare list of values
# ('values' is documented only for DataFrames) and serializes to the same array.
outliers['username'].to_json(OUTLIERS_FILE_PATH, orient='records', indent=2)
