# Data analysis

In [9]:
import re
from pathlib import Path

import numpy as np
import pandas as pd

from main import get_datasets

## Dropouts per round

In [10]:
# Load the datasets analysed below; the result maps dataset name -> data instance.
# NOTE(review): four names are requested, but the recorded output lists only three
# ('wd' is absent) -- confirm whether get_datasets skips datasets it cannot load.
requested_datasets = ['cs0_web', 'cs1-b', 'cs1-s', 'wd']
data_instances = get_datasets(requested_datasets)
data_instances

{'cs1-s': <instances.SubmissionData.SubmissionData at 0x7fb381174cd0>,
 'cs1-b': <instances.BrowsingData.BrowsingData at 0x7fb38116eac0>,
 'cs0_web': <instances.SubmissionData.SubmissionData at 0x7fb381174d90>}

In [11]:
class Rows:
    """Display labels for the per-round student statistics.

    The values serve as DataFrame column names while the statistics are
    collected, and show up as row labels once the table is pivoted for
    presentation.
    """
    # Name of the dataset a row of counts belongs to.
    dataset = 'Dataset'
    # Size of the union of the "active on previous" and "active on current" sets.
    students = 'All'
    # Number of students active in the round itself.
    active = 'Active'
    # 'All' minus 'Active'.
    inactive = 'Inactive'
    # Currently active students that also appear in the
    # "previous but not immediate previous" set.
    returning = 'Returning'
    # Students from the "active on previous" set that are missing from the
    # "active on current or future" set.
    dropouts = 'Dropped'

# Upper bound on the number of rounds tabulated per dataset (rounds are numbered from 1).
MAX_ROUNDS_REPORTED = 7

# Column order of the resulting table; 'Round' is recorded explicitly from the
# loop variable rather than being reconstructed from the row index afterwards.
stat_columns = [
    Rows.dataset,
    Rows.students,
    Rows.active,
    Rows.inactive,
    Rows.returning,
    Rows.dropouts,
    'Round',
]

# One frame per dataset, concatenated once after the loop. This avoids growing a
# DataFrame with pd.concat on every iteration (and concatenating onto an empty frame).
per_dataset_frames = []
for name, data in data_instances.items():
    print(f'dataset {name}')
    print(f'total student count: {data.total_student_count}')
    print(f'number of rounds: {data.n_rounds}')
    # The assignment count is only reported for SubmissionData instances.
    if type(data).__name__ == "SubmissionData":
        print(f'number of assignments: {data.n_assignments}')

    round_records = []
    # Named `round_number` so the builtin round() is not shadowed in the kernel namespace.
    for round_number in range(1, min(MAX_ROUNDS_REPORTED, data.n_rounds) + 1):
        # NOTE(review): presumably each getter returns an array of student ids -- confirm.
        active_on_previous = data.get_active_on_previous(round_number)
        active_on_current = data.get_active_on_current(round_number)
        active_on_current_or_future = data.get_active_on_current_or_future(round_number)
        active_before_gap = data.get_active_on_previous_but_not_immediate_previous(round_number)

        n_students = len(np.union1d(active_on_previous, active_on_current))
        n_active = len(active_on_current)
        # Members of active_on_previous that do not occur in active_on_current_or_future.
        n_dropouts = len(np.setdiff1d(active_on_previous, active_on_current_or_future))
        # Members of active_on_current that also occur in active_before_gap.
        n_returning = int(np.isin(active_on_current, active_before_gap).sum())

        round_records.append({
            Rows.dataset: name,
            Rows.students: n_students,
            Rows.active: n_active,
            Rows.inactive: n_students - n_active,
            Rows.returning: n_returning,
            Rows.dropouts: n_dropouts,
            'Round': round_number,
        })

    per_dataset_frames.append(pd.DataFrame(round_records, columns=stat_columns))

# pd.concat raises on an empty list, so fall back to an empty table with the same columns.
if per_dataset_frames:
    dropout_statistics_df = pd.concat(per_dataset_frames)
else:
    dropout_statistics_df = pd.DataFrame(columns=stat_columns)

dropout_statistics_df.head(10)

dataset cs1-s
total student count: 1059
number of rounds: 13
number of assignments: 217
dataset cs1-b
total student count: 1414
number of rounds: 8
dataset cs0_web
total student count: 73
number of rounds: 8
number of assignments: 76


Unnamed: 0,Dataset,All,Active,Inactive,Returning,Dropped,Round
0,cs1-s,960,960,0,0,0,1
1,cs1-s,976,824,152,0,134,2
2,cs1-s,978,774,204,9,177,3
3,cs1-s,981,779,202,15,185,4
4,cs1-s,986,764,222,5,200,5
5,cs1-s,988,743,245,7,216,6
6,cs1-s,992,737,255,12,230,7
0,cs1-b,1196,1196,0,0,0,1
1,cs1-b,1262,966,296,0,247,2
2,cs1-b,1305,896,409,26,346,3


### Creating a summary table for the paper

In [12]:
# Reshape to one row per (dataset, statistic) and one column per round.
# Dataset/round combinations without data come out of the pivot as NaN; they are
# routed through a -1 sentinel so the counts can be cast to integers, and the
# sentinel is then blanked out for display.
presentation_df = (
    dropout_statistics_df
    .pivot(index=Rows.dataset, columns='Round')
    .stack(0)
    .fillna(-1)
    .astype('int32')
    .replace(-1, '')
)
presentation_df

Unnamed: 0_level_0,Round,1,2,3,4,5,6,7
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
cs0_web,Active,72,68,68,48,36,33,25
cs0_web,All,72,73,73,73,73,73,73
cs0_web,Dropped,0,4,5,24,36,40,47
cs0_web,Inactive,0,5,5,25,37,40,48
cs0_web,Returning,0,0,1,0,0,1,0
cs1-b,Active,1196,966,896,860,849,823,815
cs1-b,All,1196,1262,1305,1327,1360,1378,1407
cs1-b,Dropped,0,247,346,418,471,518,582
cs1-b,Inactive,0,296,409,467,511,555,592
cs1-b,Returning,0,0,26,37,26,21,30


In [13]:
save_file = 'illustrations/dataset-stats.tex'

caption = '''
Statistics of student activity and inactivity for each round in our data.
Returning students are students who became active on a given round after a break in activity. That is,
returning students include students who have been active previously and since have been inactive until the given round.
Dropped out students are students who are inactive in the given round and remain inactive until the end of the course.
'''

latex = presentation_df.to_latex(caption=caption, label='tab:data-dropout-statistics')
# Remove the \toprule, \midrule and \bottomrule commands from the generated table.
# Each command is matched exactly once (no repetition, non-capturing group).
latex = re.sub(r'\\(?:top|mid|bottom)rule', '', latex)
# Prefix \centering with \small to shrink the table font.
# NOTE(review): this only has an effect if to_latex emitted \centering -- confirm
# for the installed pandas version.
latex = latex.replace('\\centering', '\\small\\centering')

# Make sure the output directory exists so the export also works on a fresh checkout.
Path(save_file).parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding keeps the written file independent of the platform default.
with open(save_file, 'w', encoding='utf-8') as f:
    f.write(latex)
# Report success only after the file has been closed.
print(f'wrote data stats latex to {save_file}')

wrote data stats latex to illustrations/dataset-stats.tex
