# Student Data Exploration

## Set up 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from collections import Counter
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# import sys
# sys.version
# # 3.7.6
# # pd.__version__
# # # '1.1.4'
# # np.__version__
# # '1.18.5'

In [None]:
# Constants
# - analysis parameters
# -- months in chronological order, 2018-08 through 2019-07 (used to order plot axes)
ORDER_MONTH = (
    [f'2018-{month:02d}' for month in range(8, 13)]
    + [f'2019-{month:02d}' for month in range(1, 8)]
)
# -- log columns dropped during preprocessing
VARS_REDUNDANT = ['is_downgrade', 'is_upgrade']
# -- months filtered out of the by-month figures and tables
MONTH_EXCLUDED = ['2018-08', '2019-08']

# - switches controlling which parts of the notebook run
RUN_OVERALL = True
RUN_BY_GRADE_MONTH = True
RUN_BY_IND_MONTH = True
PLOT = True
PLATFORM_EFF = True
# -- set to False to read only the small development subset of the log
RUN_FULL = True

# - directories
PATH_INPUT = '/kaggle/input/learning-activity-public-dataset-by-junyi-academy/'
PATH_PREPROCESSED_INPUT = '../input/junyi-preprocessed/'
PATH_OUTPUT = '/kaggle/working/'

# - files
# -- full log with raw timestamps (preprocessed parquet)
FILE_LOG_FULL = os.path.join(PATH_PREPROCESSED_INPUT, 'Log_Problem_raw_timestamp.parquet.gzip')
# -- alternative: full log with rounded timestamps
# FILE_LOG_FULL = os.path.join(PATH_INPUT,'Log_Problem.csv')
# -- small development subset of the log
FILE_LOG_SMALL = os.path.join(PATH_OUTPUT, 'Log_Problem_n1000.csv')
# -- user information table
FILE_USER = os.path.join(PATH_INPUT, 'Info_UserData.csv')

In [None]:
# Load the problem log: the full parquet file, or the small CSV subset when RUN_FULL is off
df_log = pd.read_parquet(FILE_LOG_FULL) if RUN_FULL else pd.read_csv(FILE_LOG_SMALL)
# Load the user information table
df_user = pd.read_csv(FILE_USER)

In [None]:
# Quick sanity check: table sizes and the time span covered by the log
for frame in (df_log, df_user):
    print(frame.shape)
print(df_log['timestamp_TW'].min())
print(df_log['timestamp_TW'].max())

In [None]:
# # Create a subset of the df for dev purpose
# df_log_small = df_log.head(n=1000)
# df_log_small.to_csv(FILE_LOG_SMALL)

In [None]:
# Preprocessing
# - remove the columns listed in VARS_REDUNDANT
df_log = df_log.drop(columns=VARS_REDUNDANT)
# - derive a 'year_month' grouping key from the first 7 characters of the timestamp string
df_log['year_month'] = df_log['timestamp_TW'].str.slice(stop=7)

In [None]:
# Attach each user's grade to every log row (left join keeps all log rows)
df_log = df_log.merge(df_user[['uuid', 'user_grade']], on='uuid', how='left')

## Descriptive Statistics: Overall

In [None]:
if RUN_OVERALL:
    # Count plot: number of rows in the user table for each grade
    fig, ax = plt.subplots(figsize=(20, 5))
    sns.countplot(x='user_grade', data=df_user, ax=ax)
    ax.set_xlabel('Grade')
    ax.set_ylabel('Frequency')
    ax.set_title('The distribution of the number of unique students by grade')
    plt.show()

In [None]:
if RUN_OVERALL:
    # Bar plot: average number of logs per student, for each grade
    # - number of log rows per grade
    logs_per_grade = df_log.groupby('user_grade').agg(n_logs=('uuid', 'count')).reset_index()
    # - number of users per grade, taken from the user table
    students_per_grade = (
        df_user['user_grade']
        .value_counts()
        .to_frame(name='n_unique_students')
        .reset_index()
        # make sure the grade column is called 'user_grade' so it can serve as the merge key
        .rename(columns={"index": "user_grade"})
    )
    df_by_grade = logs_per_grade.merge(students_per_grade, on='user_grade')
    df_by_grade['logs_per_students'] = df_by_grade['n_logs'] / df_by_grade['n_unique_students']

    fig, ax = plt.subplots(figsize=(20, 5))
    sns.barplot(x='user_grade', y='logs_per_students', data=df_by_grade, ax=ax)
    ax.set_xlabel('Grade')
    ax.set_ylabel('# logs per student')
    ax.set_title('Distribution of average number of logs of each student by grade')
    plt.show()

In [None]:
if RUN_OVERALL:
    # Count plot: number of log rows per month, with months ordered as in ORDER_MONTH
    fig, ax = plt.subplots(figsize=(20, 5))
    sns.countplot(x='year_month', data=df_log, order=ORDER_MONTH, ax=ax)
    ax.set_xlabel('Month')
    ax.set_ylabel('Frequency')
    ax.set_title('The distribution of the number of log activities by month')
    plt.show()

## Descriptive Statistics: By-month & By-grade 

1. To make the graphs clearer, the months listed in `MONTH_EXCLUDED` (2018-08 and 2019-08) are excluded from the descriptive figures below.

In [None]:
if RUN_BY_GRADE_MONTH:
    # One row per (month, grade): mean correctness, number of logs, number of distinct users
    df_by_month_grade = (
        df_log
        .groupby(['year_month', 'user_grade'])
        .agg(
            accuracy=('is_correct', 'mean'),
            n_logs=('uuid', 'count'),
            n_unique_students=('uuid', 'nunique'),
        )
        .reset_index()
    )

In [None]:
if RUN_BY_GRADE_MONTH:
    # Bar plot: number of logs per month, one bar per grade
    # - leave out the months listed in MONTH_EXCLUDED
    month_kept = ~df_by_month_grade['year_month'].isin(MONTH_EXCLUDED)
    fig, ax = plt.subplots(figsize=(20, 5))
    sns.barplot(x="year_month", y="n_logs", hue="user_grade",
                data=df_by_month_grade[month_kept], ax=ax)
    ax.set_xlabel('Month')
    ax.set_ylabel('# Logs')
    plt.show()

In [None]:
if RUN_BY_GRADE_MONTH:
    # Bar plot: number of distinct users per month, one bar per grade
    # (months listed in MONTH_EXCLUDED are left out)
    plt.figure(figsize=(20,5))
    g = sns.barplot(x="year_month", y="n_unique_students", hue="user_grade",
                    data=df_by_month_grade[~df_by_month_grade['year_month'].isin(MONTH_EXCLUDED)])
    # The y axis shows n_unique_students, so label it as students rather than logs
    g.set(xlabel='Month', ylabel='# Unique Students')
    plt.show()

In [None]:
if RUN_BY_GRADE_MONTH:
    # Line plot: mean accuracy per month, one line per grade
    # - leave out the months listed in MONTH_EXCLUDED
    month_kept = ~df_by_month_grade['year_month'].isin(MONTH_EXCLUDED)
    fig, ax = plt.subplots(figsize=(20, 5))
    sns.lineplot(data=df_by_month_grade[month_kept],
                 x='year_month',
                 y='accuracy',
                 hue='user_grade',
                 ax=ax)
    ax.set_xlabel('Month')
    ax.set_ylabel('Accuracy')
    plt.show()

## Descriptive Statistics: By-month By-individual 

- Basis for "student performance prediction".

### By-month by-individual average absolute accuracy (AAA)

$$AAA = \frac{\text{# correct attempts in a month}}{\text{# attempts in a month}}$$

In [None]:
if RUN_BY_IND_MONTH:
    # One row per (month, user): number of attempts, share of correct attempts (AAA), grade
    per_user_month = (
        df_log
        .groupby(['year_month', 'uuid'])
        .agg(
            n_logs=('is_correct', 'count'),
            accuracy=('is_correct', 'mean'),
            user_grade=('user_grade', 'first'),
        )
        .reset_index()
    )
    # Drop the months listed in MONTH_EXCLUDED
    df_by_month_ind = per_user_month[~per_user_month['year_month'].isin(MONTH_EXCLUDED)]

In [None]:
if RUN_BY_IND_MONTH:
    # Minimum number of logs a user needs in a month for that user-month to be kept
    MIN_LOGS_MONTH = 15
    # - keep only the user-month rows that reach the threshold
    has_enough_logs = df_by_month_ind['n_logs'] >= MIN_LOGS_MONTH
    df_by_month_ind = df_by_month_ind[has_enough_logs]

# - Before removal
# df_by_month_ind.uuid.nunique()
# > 70683
# - After removal
# df_by_month_ind.uuid.nunique()
# > 53995

In [None]:
if RUN_BY_IND_MONTH:
    # Number of distinct months in which each user has a retained user-month row
    df_user_active_months = (
        df_by_month_ind
        .groupby('uuid')
        .agg(n_active_months=('year_month', 'nunique'))
    )
    # Attach that count to every user-month row
    df_by_month_ind = pd.merge(df_by_month_ind, df_user_active_months, on='uuid')

In [None]:
if RUN_BY_IND_MONTH:
    # Print (n_active_months, number of users) pairs in ascending order of n_active_months.
    # Note from the original analysis: only 317 users are active across all 11 months.
    active_month_counts = Counter(df_user_active_months['n_active_months'])
    # keys are unique, so plain tuple ordering sorts by the number of active months
    print(sorted(active_month_counts.items()))

In [None]:
if RUN_BY_IND_MONTH and PLOT:
    # Count plot: how many users have each number of active months
    fig, ax = plt.subplots(figsize=(20, 5))
    sns.countplot(x='n_active_months', data=df_user_active_months, ax=ax)
    ax.set_xlabel('Number of Active Months')
    ax.set_ylabel('Frequency')
    ax.set_title('The distribution of number of active months of students')
    plt.show()

In [None]:
if RUN_BY_IND_MONTH and PLOT:
    # Line plot: monthly accuracy (AAA), one line per user, for users with 11 active months
    always_active = df_by_month_ind[df_by_month_ind['n_active_months'] == 11]
    fig, ax = plt.subplots(figsize=(20, 5))
    sns.lineplot(data=always_active,
                 x='year_month',
                 y='accuracy',
                 hue='uuid',
                 legend=False,
                 ax=ax)
    ax.set_xlabel('Month')
    ax.set_ylabel('Accuracy')
    plt.show()

### By-month by-individual relative average accuracy (RAA)
$$ RAA = z(AAA) = \frac{\text{AAA - mean AAA  of the same grade}}{\text{SD AAA of the same grade}} $$

In [None]:
if RUN_BY_IND_MONTH:
    # Reference statistics per (month, grade), used to standardise individual accuracy
    # NOTE(review): mean and SD are computed over individual log rows ('is_correct'),
    # not over per-student AAA values as the RAA formula reads - confirm this is intended.
    df_by_month_accurcy = (
        df_log
        .groupby(['year_month', 'user_grade'])
        .agg(
            month_accuracy_mean=('is_correct', 'mean'),
            month_accuracy_sd=('is_correct', 'std'),
        )
    )

In [None]:
if RUN_BY_IND_MONTH:
    # Attach the (month, grade) reference mean and SD to every user-month row
    df_by_month_ind = pd.merge(df_by_month_ind, df_by_month_accurcy, on=['year_month', 'user_grade'])
    # RAA: distance from the reference mean, in units of the reference SD
    deviation = df_by_month_ind['accuracy'] - df_by_month_ind['month_accuracy_mean']
    df_by_month_ind['relative_accuracy'] = deviation / df_by_month_ind['month_accuracy_sd']

In [None]:
if RUN_BY_IND_MONTH and PLOT:
    # Line plot: monthly relative accuracy (RAA), one line per user, for users with 11 active months
    always_active = df_by_month_ind[df_by_month_ind['n_active_months'] == 11]
    fig, ax = plt.subplots(figsize=(20, 5))
    sns.lineplot(data=always_active,
                 x='year_month',
                 y='relative_accuracy',
                 hue='uuid',
                 legend=False,
                 ax=ax)
    ax.set_xlabel('Month')
    ax.set_ylabel('Relative Accuracy')
    plt.show()

# Platform Effectiveness Evaluation


## Get the accuracy of each user's last and first month activities

In [None]:
if PLATFORM_EFF:
    # Create integer 'year' and 'month' columns for grouping.
    # 'timestamp_TW' is sliced as a string: characters 0-3 are the year, 5-6 the month.
    # Vectorised .str slicing avoids a Python-level lambda call per log row.
    df_log['year'] = df_log['timestamp_TW'].str[:4].astype(int)
    df_log['month'] = df_log['timestamp_TW'].str[5:7].astype(int)
    # DataFrame.drop returns a new frame; assign it back so the column is actually removed
    # (the original call discarded the result and therefore did nothing).
    df_log = df_log.drop(columns='timestamp_TW')

In [None]:
if PLATFORM_EFF:
    # Total number of log rows per user ...
    df_log_count_by_uuid = (
        df_log
        .groupby('uuid')
        .agg(n_logs=('uuid', 'count'))
        .reset_index()
    )
    # ... attached to every log row as 'n_logs'
    df_log = df_log.merge(df_log_count_by_uuid, on='uuid', how='left')

In [None]:
if PLATFORM_EFF:
    # For every user, find the (year, month) of the latest and of the earliest log.
    month_key_cols = ['uuid', 'year', 'month']
    # - latest: sort ascending in time and take each user's final row
    df_last_by_uuid = (
        df_log
        .sort_values(['year', 'month'], ascending=[True, True])
        .groupby('uuid')
        .tail(1)[month_key_cols]
        .rename(columns={'year':'last_year','month':'last_month' })
    )
    # - earliest: sort descending in time and take each user's final row
    df_first_by_uuid = (
        df_log
        .sort_values(['year', 'month'], ascending=[False, False])
        .groupby('uuid')
        .tail(1)[month_key_cols]
        .rename(columns={'year':'first_year','month':'first_month' })
    )
    # - one row per user holding both month keys
    df_last_first_by_uuid = df_last_by_uuid.merge(df_first_by_uuid, on='uuid', how='left')

In [None]:
if PLATFORM_EFF:
    # Keep users whose first and last active month differ (activity in at least two months)
    same_year = df_last_first_by_uuid['last_year'] == df_last_first_by_uuid['first_year']
    same_month = df_last_first_by_uuid['last_month'] == df_last_first_by_uuid['first_month']
    df_active_uuid = df_last_first_by_uuid[~(same_year & same_month)]

    # Attach the first/last month keys to the logs; the right join keeps only the selected users
    merged_df_log = df_log.merge(df_active_uuid, on='uuid', how='right')

In [None]:
# Users must have strictly more than this many logs in total to be analysed
N_LOG = 2000

if PLATFORM_EFF:
    # keep only the log rows of users above the threshold
    above_threshold = merged_df_log['n_logs'] > N_LOG
    merged_df_log = merged_df_log[above_threshold]

In [None]:
if PLATFORM_EFF:
    # Compute AAA and RAA of each user's last month of activity

    # - log rows that fall in the user's last active (year, month)
    df_log_last = merged_df_log[(merged_df_log['year'] == merged_df_log['last_year']) & (merged_df_log['month'] == merged_df_log['last_month'])]

    # AAA: per-user share of correct attempts in that month
    df_log_last_gb = df_log_last.groupby(['uuid']).agg(n_logs = ('is_correct', 'count'), accuracy = ('is_correct', 'mean'), user_grade = ('user_grade','first')).reset_index()

    # RAA: AAA standardised with the mean and SD of the user's grade
    # NOTE(review): the grade mean/SD are computed over individual log rows ('is_correct'),
    # not over per-user AAA values - confirm this is intended.
    df_log_last_grade_gb = df_log_last.groupby(['user_grade']).agg(grade_accuracy_mean = ('is_correct','mean'), grade_accuracy_std = ('is_correct','std'))
    df_log_last_gb = pd.merge(df_log_last_gb, df_log_last_grade_gb, on = 'user_grade', how = 'left')
    df_log_last_gb['relative_accuracy'] = (df_log_last_gb['accuracy']-df_log_last_gb['grade_accuracy_mean'])/df_log_last_gb['grade_accuracy_std']
    # - the grade statistics were only needed to compute RAA
    df_log_last_gb = df_log_last_gb.drop(columns = ['grade_accuracy_mean','grade_accuracy_std'])
    df_log_last_gb = df_log_last_gb.rename(columns={'accuracy':'last_AAA','relative_accuracy':'last_RAA' })
    # An expression nested inside `if` is not echoed by the notebook, so print the preview explicitly
    print(df_log_last_gb.head())

In [None]:
if PLATFORM_EFF:
    # Compute AAA and RAA of each user's first month of activity

    # - log rows that fall in the user's first active (year, month)
    df_log_first = merged_df_log[(merged_df_log['year'] == merged_df_log['first_year']) & (merged_df_log['month'] == merged_df_log['first_month'])]

    # AAA: per-user share of correct attempts in that month
    df_log_first_gb = df_log_first.groupby(['uuid']).agg(n_logs = ('is_correct', 'count'), accuracy = ('is_correct', 'mean'), user_grade = ('user_grade','first')).reset_index()

    # RAA: AAA standardised with the mean and SD of the user's grade
    # NOTE(review): the grade mean/SD are computed over individual log rows ('is_correct'),
    # not over per-user AAA values - confirm this is intended.
    df_log_first_grade_gb = df_log_first.groupby(['user_grade']).agg(grade_accuracy_mean = ('is_correct','mean'), grade_accuracy_std = ('is_correct','std'))
    df_log_first_gb = pd.merge(df_log_first_gb, df_log_first_grade_gb, on = 'user_grade', how = 'left')
    df_log_first_gb['relative_accuracy'] = (df_log_first_gb['accuracy']-df_log_first_gb['grade_accuracy_mean'])/df_log_first_gb['grade_accuracy_std']
    # - the grade statistics were only needed to compute RAA
    df_log_first_gb = df_log_first_gb.drop(columns = ['grade_accuracy_mean','grade_accuracy_std'])
    df_log_first_gb = df_log_first_gb.rename(columns={'accuracy':'first_AAA','relative_accuracy':'first_RAA' })
    # An expression nested inside `if` is not echoed by the notebook, so print the preview explicitly
    print(df_log_first_gb.head())

In [None]:
if PLATFORM_EFF:
    # Compute the change in AAA and RAA between each user's first and last month
    # - put the first-month measures next to the last-month measures
    df_user_accuracy = pd.merge(df_log_last_gb, df_log_first_gb[['uuid','first_AAA', 'first_RAA']], on='uuid', how='left')
    df_user_accuracy = df_user_accuracy.drop(columns = ['n_logs','user_grade'])
    # - positive values mean the last month was better than the first
    df_user_accuracy['d_AAA'] = df_user_accuracy['last_AAA'] - df_user_accuracy['first_AAA']
    df_user_accuracy['d_RAA'] = df_user_accuracy['last_RAA'] - df_user_accuracy['first_RAA']
    # An expression nested inside `if` is not echoed by the notebook, so print the preview explicitly
    print(df_user_accuracy.head())

    # - user table restricted (right join) to the users that have accuracy measures
    df_user_new = pd.merge(df_user, df_user_accuracy, on='uuid', how='right')

## Compute the correlation between energy point and user's accuracy measures

In [None]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Plot the correlation matrix of the usable numeric columns of ``df``.

    Columns containing NaN, non-numeric columns and constant columns are
    left out. If fewer than two columns remain, a message is printed and
    nothing is plotted.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to correlate. If it carries a ``dataframeName`` attribute, that
        value is used in the plot title; otherwise 'DataFrame' is used.
    graphWidth : float
        Width and height of the (square) figure, in inches.

    Returns
    -------
    None
    """
    # Read the title first: the filtering below rebinds `df` to new objects.
    # Fall back to a generic title instead of raising when the attribute is unset.
    filename = getattr(df, 'dataframeName', 'DataFrame')
    # `axis` must be passed by keyword: positional use is rejected by pandas >= 2.0
    df = df.dropna(axis='columns') # drop columns with NaN
    # Keep numeric/bool columns only: corr() on pandas >= 2.0 raises on string columns,
    # while older versions dropped them silently
    df = df.select_dtypes(include=['number', 'bool'])
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    # label both axes with the column names
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()

In [None]:
if PLATFORM_EFF and PLOT:
    # Correlation matrix of the user table joined with the accuracy measures
    # - plotCorrelationMatrix reads the plot title from this attribute
    df_user_new.dataframeName = 'Energy Points and Accuracy'
    plotCorrelationMatrix(df_user_new, 12)

In [None]:
if PLATFORM_EFF and PLOT:
    # Correlation matrix restricted to the accuracy measures plus the users' 'points'
    df_user_new2 = df_user_accuracy.merge(df_user[['uuid', 'points']], on='uuid', how='left')
    # - plotCorrelationMatrix reads the plot title from this attribute
    df_user_new2.dataframeName = 'Energy Points and Accuracy'
    plotCorrelationMatrix(df_user_new2, 8)

In [None]:
# df_user_new2 is only created when both PLATFORM_EFF and PLOT are on; guard to avoid a NameError
if PLATFORM_EFF and PLOT:
    # Correlate numeric/bool columns only: corr() on pandas >= 2.0 raises on
    # non-numeric columns (presumably the case for the 'uuid' key - verify)
    print(df_user_new2.select_dtypes(include=['number', 'bool']).corr())