# EDA: Synchrony detection for classroom activities

File: 'core_classroom_analysis.py'

Checkpoint: 1

## Load data

In [1]:
# Location of the checkpoint-1 pickle; a later cell opens this file with pickle.load.
pkl_filepath = '/home/wang159/nanoHUB/projects/online_users_ts_analysis/temp/core_classroom_analysis_cp1.pkl'

In [2]:
from pprint import pprint
import pandas as pd
import numpy as np
import datetime
from sklearn.cluster import AgglomerativeClustering, OPTICS, DBSCAN

from dask import dataframe as dd
from dask.multiprocessing import get
from dask.diagnostics import ProgressBar

# Create a Dask ProgressBar and register it.
# NOTE(review): registration is expected to make the bar show up for every
# later Dask computation in this session -- confirm against the Dask docs.
pbar = ProgressBar()
pbar.register()

In [3]:
# Load checkpoint 1: the pickled run parameters plus the intermediate DataFrames.
# NOTE: unpickling can execute arbitrary code -- only load checkpoint files
# that come from a trusted source.
import os
import pickle

with open(pkl_filepath, 'rb') as f:
    inparams = pickle.load(f)

# The DataFrame pickles live in the same directory as the parameter pickle,
# so derive that directory once instead of repeating the absolute path.
checkpoint_dir = os.path.dirname(pkl_filepath)

user_activity_blocks_df = pd.read_pickle(os.path.join(checkpoint_dir, 'cp1_user_activity_blocks_df.pkl'))
jos_users = pd.read_pickle(os.path.join(checkpoint_dir, 'cp1_jos_users.pkl'))
toolrun_df = pd.read_pickle(os.path.join(checkpoint_dir, 'cp1_toolrun_df.pkl'))
cluster_output_candidate = pd.read_pickle(os.path.join(checkpoint_dir, 'cp1_cluster_output_candidate.pkl'))
detected_clusters_df = pd.read_pickle(os.path.join(checkpoint_dir, 'cp1_detected_clusters_df.pkl'))

In [4]:
# Display the candidate cluster table loaded from the checkpoint.
cluster_output_candidate

Unnamed: 0,user,tool,start,end,ip,lon,lat,cluster,scanned_date,user_meet_class_size_min
168,samiaalam94,nanowire,2018-04-03,2018-04-07,119.148.44.254,90.375,23.7,0,2018-04-06,True
169,samratewu,nanowire,2018-04-04,2018-04-10,103.86.109.173,90.4109,23.7908,0,2018-04-06,True
170,iftakhar42,nanowire,2018-04-06,2018-04-10,103.204.244.6,90.4277,23.7418,0,2018-04-06,True
172,saikatsaha95,nanowire,2018-04-06,2018-04-10,43.230.120.201,90.4279,23.7525,0,2018-04-06,True
173,moriom.akter,nanowire,2018-04-03,2018-04-07,45.248.146.2,90.4276,23.7415,0,2018-04-06,True
...,...,...,...,...,...,...,...,...,...,...
92290,syarafifaz98,pntoy,2018-04-27,2018-05-01,175.145.180.207,101.695,3.1492,1,2018-05-01,True
92291,zayan1998,pntoy,2018-04-27,2018-05-01,103.18.0.34,101.703,3.1698,1,2018-05-01,True
92292,atiqahkhaishah,pntoy,2018-04-27,2018-05-01,103.18.0.34,101.703,3.1698,1,2018-05-01,True
92293,tokchinkuan,pntoy,2018-04-27,2018-05-01,103.18.0.34,101.703,3.1698,1,2018-05-01,True


## Intra-cluster Sync

In [5]:
def get_toolrun_vector(this_user_toolrun, cluster_date, sigma, all_tool_names, *, use_gaussian=False):
    '''
    Build one user's unit-length tool-usage vector for a candidate cluster.

    Parameters
    ----------
    this_user_toolrun : pandas.DataFrame
        Tool runs of a single user. Needs a ``toolname`` column, plus a
        ``date`` column when ``use_gaussian`` is true.
    cluster_date : datetime-like
        Date of the cluster; only used for the Gaussian weighting.
    sigma : number
        Divisor of the squared day offset in the Gaussian weight
        ``exp(-days**2 / sigma)``; only used when ``use_gaussian`` is true.
    all_tool_names : list-like
        Tool names that form the axes of the vector, in output order.
    use_gaussian : bool, keyword-only, default False
        False: each tool's component is the plain number of runs (the
        behaviour this function had so far, where the Gaussian result was
        computed and then discarded).
        True: each run is weighted by its distance in whole days from
        ``cluster_date`` before summing per tool.

    Returns
    -------
    pandas.Series or None
        Vector indexed by ``all_tool_names`` and scaled to Euclidean length 1,
        or None when every component is zero (nothing to normalise).
    '''
    if use_gaussian:
        # whole days between each run and the cluster date
        day_offsets = pd.to_timedelta(this_user_toolrun['date'] - cluster_date).dt.days
        run_weights = np.exp(-1 * day_offsets.to_numpy(dtype=float) ** 2 / sigma)
        tool_weights = this_user_toolrun.assign(_weight=run_weights) \
                                        .groupby('toolname')['_weight'].sum()
    else:
        # size() counts rows per tool without relying on a 'user' column,
        # which newer pandas may drop from the frames handed to groupby.apply
        tool_weights = this_user_toolrun.groupby('toolname').size()

    # align to the shared tool axis; tools this user never ran get weight 0
    tool_weights = tool_weights.reindex(all_tool_names, fill_value=0)

    v_length = np.linalg.norm(tool_weights.to_numpy(dtype=float))
    if v_length > 0:
        return tool_weights / v_length
    return None
    

In [6]:
def intra_cluster_synchrony(this_cluster, toolrun_df, sigma=10, eps=0.6, min_samples=2):
    '''
    Label each candidate of one cluster with a DBSCAN group based on how
    similar its tool usage is to that of the other candidates.

    Meant to be called through ``groupby([...]).apply``: the group key is read
    from ``this_cluster.name`` and its first element is used as the cluster date.

    Parameters
    ----------
    this_cluster : pandas.DataFrame
        Candidate rows of one cluster; needs a ``user`` column.
    toolrun_df : pandas.DataFrame
        Tool-run history with ``user``, ``toolname`` and ``date`` columns.
    sigma : int, default 10
        Half-width in days of the window around the cluster date; also passed
        on to ``get_toolrun_vector``.
    eps, min_samples : default 0.6 and 2
        Passed to ``sklearn.cluster.DBSCAN``.

    Returns
    -------
    pandas.DataFrame or None
        A copy of ``this_cluster`` with a ``DBSCAN`` column holding each row's
        label, matched by user. Users without a usable tool vector get -1.
        None when no candidate has any tool run inside the window.
    '''
    # groupby.apply stores the group key in .name; element 0 is the date
    cluster_date = this_cluster.name[0]

    # keep only tool runs within +/- sigma days of the cluster date
    window = datetime.timedelta(days=sigma)
    in_window = (toolrun_df.date >= cluster_date - window) & \
                (toolrun_df.date <= cluster_date + window)

    # vectorised membership test instead of a row-by-row apply
    is_cluster_user = toolrun_df.user.isin(this_cluster.user.unique())

    this_cluster_users_toolrun = toolrun_df[in_window & is_cluster_user] \
                                 .sort_values(by=['user', 'date'])
    if this_cluster_users_toolrun.empty:
        # none of the candidates ran anything in the window
        return None

    this_cluster_all_tools = this_cluster_users_toolrun.toolname.unique()

    # one normalised tool vector per user; users with an all-zero vector
    # (get_toolrun_vector returns None) are left out
    vectors = {}
    for user, user_toolrun in this_cluster_users_toolrun.groupby('user'):
        vector = get_toolrun_vector(user_toolrun,
                                    cluster_date=cluster_date, sigma=sigma,
                                    all_tool_names=this_cluster_all_tools)
        if vector is not None:
            vectors[user] = vector
    if not vectors:
        # DBSCAN cannot be fitted on an empty matrix
        return None

    # rows = users, columns = tools
    tool_vector = pd.DataFrame(vectors).T

    # clustering
    labels = DBSCAN(min_samples=min_samples, eps=eps).fit_predict(tool_vector.to_numpy())
    label_by_user = pd.Series(labels, index=tool_vector.index)

    # Attach labels by user name, not by position: tool_vector is ordered by
    # user and only covers users with activity, while this_cluster keeps its
    # own row order and may contain users with no runs in the window.
    result = this_cluster.copy()
    result['DBSCAN'] = result['user'].map(label_by_user).fillna(-1).astype(int)

    return result
    

In [7]:
def intra_cluster_synchrony_pregroup(this_cluster_group, toolrun_df):
    '''
    Thin layer between Dask and the per-cluster synchrony computation, so the
    unit of parallel work can be chosen independently of the cluster grouping.

    Splits the incoming rows by (scanned_date, tool, cluster), runs
    intra_cluster_synchrony on each piece and keeps only rows whose DBSCAN
    label is greater than -1.
    '''
    per_cluster = this_cluster_group.groupby(['scanned_date', 'tool', 'cluster'])
    refined = per_cluster.apply(intra_cluster_synchrony, toolrun_df=toolrun_df)

    # nothing to filter when no cluster produced any rows
    if refined.empty:
        return refined

    # drop rows labelled -1 (not a member of any DBSCAN group)
    is_group_member = refined.DBSCAN > -1

    # NOTE: sub-groups (cluster, DBSCAN) below the minimal size requirement
    # are not removed here yet
    return refined[is_group_member]
    

In [None]:
# Single-tool variant without Dask, kept for debugging:
#test = cluster_output_candidate[cluster_output_candidate.tool=='pntoy'] \
#            .groupby('tool') \
#            .apply(intra_cluster_synchrony_pregroup, toolrun_df=toolrun_df)

# every candidate starts with DBSCAN label -1 until the refinement assigns one
cluster_output_candidate['DBSCAN'] = -1

# split the candidates into Dask partitions, then refine each scanned date
_candidates_ddf = dd.from_pandas(cluster_output_candidate, npartitions=30)
_refined_ddf = _candidates_ddf.groupby('scanned_date').apply(
    intra_cluster_synchrony_pregroup,
    toolrun_df=toolrun_df,
    meta=cluster_output_candidate,
)

# run the graph with the scheduler named in the checkpoint parameters
ddata = _refined_ddf.compute(scheduler=inparams[0].dask_scheduler, num_workers=20)


In [14]:
# Sanity check: confirm the candidate table has a 'user' column.
'user' in cluster_output_candidate.columns

True