# EDA: Synchrony detection for classroom activities

File: 'core_classroom_analysis.py'

Checkpoint: 1

## Load data

In [2]:
# Location of the checkpoint-1 pickle that holds the saved run parameters
pkl_filepath = '/home/wang159/nanoHUB/projects/online_users_ts_analysis/temp/core_classroom_analysis_cp1.pkl'

In [3]:
from pprint import pprint
import pandas as pd
import numpy as np
import logging
import datetime

from dask import dataframe as dd
from dask.multiprocessing import get
from dask.diagnostics import ProgressBar
pbar = ProgressBar()
pbar.register()

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances

In [4]:
# Load checkpoint shelve
# NOTE: unpickling can execute arbitrary code; only load checkpoint files you trust.
import pickle
with open(pkl_filepath, 'rb') as f:
    # Saved run parameters; later cells read inparams[0].class_size_min
    # and inparams[0].dask_scheduler
    inparams = pickle.load(f)
    
# Intermediate results saved at checkpoint 1 (cp1_* files), restored with pandas.read_pickle
class_cluster_candidate = pd.read_pickle('/home/wang159/nanoHUB/projects/online_users_ts_analysis/temp/cp1_class_cluster_candidate.pkl')
user_activity_blocks_df = pd.read_pickle('/home/wang159/nanoHUB/projects/online_users_ts_analysis/temp/cp1_user_activity_blocks_df.pkl')
jos_users = pd.read_pickle('/home/wang159/nanoHUB/projects/online_users_ts_analysis/temp/cp1_jos_users.pkl')
toolrun_df = pd.read_pickle('/home/wang159/nanoHUB/projects/online_users_ts_analysis/temp/cp1_toolrun_df.pkl')
cluster_output_candidate = pd.read_pickle('/home/wang159/nanoHUB/projects/online_users_ts_analysis/temp/cp1_cluster_output_candidate.pkl')
detected_clusters_df = pd.read_pickle('/home/wang159/nanoHUB/projects/online_users_ts_analysis/temp/cp1_detected_clusters_df.pkl')


## Geospatial clustering

In [45]:
def form_cluster_blocks(tool_clusters_df):
    '''
    Given a tool's clusters from all users, join clusters on neighboring
    scanned dates that share one or more common users.

    Scanned dates are processed in sorted order. A cluster found on one date
    is merged into an existing candidate only when that candidate was last
    extended on the immediately preceding scanned date and the two have at
    least one user in common; otherwise it starts a new candidate.

    Parameters
    ----------
    tool_clusters_df : pandas.DataFrame
        Cluster rows for one tool. The columns read here are 'scanned_date',
        'cluster', 'user', 'start', 'end', 'lat' and 'lon'. Row labels are
        used to reference member rows, so they are assumed to be unique.

    Returns
    -------
    pandas.DataFrame
        One row per merged cluster with the columns 'users_row_id' (index
        labels of the member rows), 'start' (earliest member start), 'end'
        (latest member end), 'user_count' (number of distinct users),
        'mean_lat', 'mean_lon' (None when the mean cannot be computed) and
        'lat_lon' (list of (lat, lon) tuples, one per member row).
        An empty frame with these columns is returned for empty input.
    '''

    output_columns = ['users_row_id', 'start', 'end', 'user_count',
                      'mean_lat', 'mean_lon', 'lat_lon']

    all_scanned_dates = np.sort(tool_clusters_df['scanned_date'].unique())

    # list of dict [{'last_update': date, 'users_row_id': Index}, .....]
    all_clusters = list()
    # user set of each candidate, kept in step with all_clusters so the
    # overlap test does not need a fresh .loc lookup for every comparison
    all_cluster_users = list()

    for index, this_date in enumerate(all_scanned_dates):
        # for each scanned date
        clusters_in_this_date = tool_clusters_df[tool_clusters_df.scanned_date == this_date]

        for this_cluster_id in clusters_in_this_date['cluster'].unique():
            # for each cluster ID
            this_cluster_df = clusters_in_this_date[clusters_in_this_date.cluster == this_cluster_id]

            # see if it can be aggregated with one of the candidate clusters
            this_all_users = set(this_cluster_df.user)
            cluster_matched = False

            for this_candidate, this_candidate_users in zip(all_clusters, all_cluster_users):
                if index > 0 and this_candidate['last_update'] != all_scanned_dates[index-1]:
                    # only append to clusters that were active on the previous, adjacent date
                    continue

                if this_all_users & this_candidate_users:
                    # match: extend the candidate with this day's rows and users
                    this_candidate['last_update'] = this_date
                    this_candidate['users_row_id'] = this_candidate['users_row_id'].append(this_cluster_df.index)
                    this_candidate_users |= this_all_users

                    cluster_matched = True
                    break

            if not cluster_matched:
                # no match found, insert this cluster as a new candidate
                all_clusters.append({'last_update': this_date, 'users_row_id': this_cluster_df.index})
                all_cluster_users.append(this_all_users)

    if not all_clusters:
        # nothing to aggregate; the summary code below needs at least one candidate
        return pd.DataFrame(columns=output_columns)

    # turn into a dataframe
    all_clusters_df = pd.DataFrame(all_clusters)
    all_clusters_df.drop('last_update', axis=1, inplace=True)

    # member rows of every merged cluster, looked up once and reused below
    members = [tool_clusters_df.loc[this_cluster['users_row_id']] for this_cluster in all_clusters]

    # find the earliest start and latest end of all users within cluster
    all_clusters_df['start'] = [rows['start'].min() for rows in members]
    all_clusters_df['end'] = [rows['end'].max() for rows in members]

    # find number of users involved in this detected super cluster
    all_clusters_df['user_count'] = [len(rows['user'].unique()) for rows in members]

    # find the average coordinate
    try:
        mean_lat = [rows['lat'].mean() for rows in members]
        mean_lon = [rows['lon'].mean() for rows in members]
    except Exception:
        # Deliberate best effort (original note: "avoid a DASK bug"): if the
        # coordinates cannot be averaged, fall back to None for both columns.
        mean_lat = [None] * len(members)
        mean_lon = [None] * len(members)
    all_clusters_df['mean_lat'] = mean_lat
    all_clusters_df['mean_lon'] = mean_lon

    # explicit object Series so every cell holds one list of (lat, lon) tuples
    all_clusters_df['lat_lon'] = pd.Series(
        [list(zip(rows['lat'].values, rows['lon'].values)) for rows in members],
        index=all_clusters_df.index, dtype=object)

    return all_clusters_df

In [46]:
# Display the per-day cluster candidate table
cluster_output_candidate

Unnamed: 0,user,tool,start,end,ip,lon,lat,cluster,scanned_date,user_meet_class_size_min
168,samiaalam94,nanowire,2018-04-03,2018-04-07,119.148.44.254,90.375,23.7,0,2018-04-06,True
169,iftakhar42,nanowire,2018-04-06,2018-04-10,103.204.244.6,90.4277,23.7418,0,2018-04-06,True
170,samratewu,nanowire,2018-04-04,2018-04-10,103.86.109.173,90.4109,23.7908,0,2018-04-06,True
172,saikatsaha95,nanowire,2018-04-06,2018-04-10,43.230.120.201,90.4279,23.7525,0,2018-04-06,True
173,faisal.ewu,nanowire,2018-04-06,2018-04-10,103.204.244.6,90.4277,23.7418,0,2018-04-06,True
...,...,...,...,...,...,...,...,...,...,...
92290,rutbeer.991,pntoy,2018-04-27,2018-05-01,103.18.0.34,101.703,3.1698,1,2018-05-01,True
92291,zayan1998,pntoy,2018-04-27,2018-05-01,103.18.0.34,101.703,3.1698,1,2018-05-01,True
92293,tokchinkuan,pntoy,2018-04-27,2018-05-01,103.18.0.34,101.703,3.1698,1,2018-05-01,True
92297,doominous1,pntoy,2018-04-27,2018-05-01,103.18.0.34,101.703,3.1698,1,2018-05-01,True


In [47]:
# Keep one row per (scanned_date, cluster, user, tool) so each user is counted once per daily cluster
cluster_output_nodup = detected_clusters_df.drop_duplicates(subset=['scanned_date', 'cluster', 'user','tool'])

# Per (scanned_date, cluster, tool): does the user count exceed class_size_min?
# NOTE(review): the comparison is strict (>), so a cluster with exactly
# class_size_min users is dropped -- confirm this is intended.
passed_cutoff = cluster_output_nodup[['scanned_date','cluster','tool','user']]
passed_cutoff = passed_cutoff.groupby(['scanned_date','cluster','tool']).count()['user'] > inparams[0].class_size_min

# The joined Series is named 'user', so rsuffix turns it into 'user_meet_class_size_min'
cluster_output_candidate = cluster_output_nodup.join(passed_cutoff, on=['scanned_date', 'cluster', 'tool'], rsuffix='_meet_class_size_min')
# Assign the filled column back: fillna(inplace=True) on a column selection is a
# chained in-place update that pandas does not guarantee will reach the parent frame.
cluster_output_candidate['user_meet_class_size_min'] = cluster_output_candidate['user_meet_class_size_min'].fillna(False).astype(bool)
cluster_output_candidate = cluster_output_candidate[cluster_output_candidate.user_meet_class_size_min]

logging.info('Geospatially clustered candidates for classrooms on each day:')
logging.info('(cluster_output_candidate)')
logging.info(cluster_output_candidate)

# Aggregate clusters in neighboring days into one
# NOTE(review): `meta` is built here but never passed to .apply() below, so dask
# still infers the output schema itself and prints its "meta" hint. Before passing
# it, check that its column order and dtypes match what form_cluster_blocks returns
# (e.g. users_row_id holds index objects, not int16).
meta=pd.DataFrame(columns=['mean_lat', 'end', 'lat_lon', 'mean_lon', 'start', 'user_count', 'users_row_id'])
meta.mean_lat = meta.mean_lat.astype(np.float16)
meta.mean_lon = meta.mean_lon.astype(np.float16)
meta.user_count = meta.user_count.astype(np.int16)
meta.users_row_id = meta.users_row_id.astype(np.int16)
# explicit unit: newer pandas rejects the unit-less np.datetime64 dtype
meta.start = meta.start.astype('datetime64[ns]')
meta.end = meta.end.astype('datetime64[ns]')

# Run form_cluster_blocks once per tool and collect the result as a pandas object
ddata = dd.from_pandas(cluster_output_candidate, npartitions=60) \
          .groupby('tool').apply(form_cluster_blocks) \
          .compute(scheduler=inparams[0].dask_scheduler)

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result


[########################################] | 100% Completed |  2.0s


In [48]:
# Display the aggregated clusters returned by the dask groupby/apply
ddata

Unnamed: 0_level_0,Unnamed: 1_level_0,users_row_id,start,end,user_count,mean_lat,mean_lon,lat_lon
tool,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
jmoltool,0,"Int64Index([47246, 47247, 47248, 47249, 47250,...",2018-01-10,2018-04-15,37,40.432242,-86.918733,"[(40.4444, -86.9256), (40.4251, -86.9147), (40..."
nanocmos,0,"Int64Index([21343, 21346, 21347, 21348, 21350,...",2018-01-14,2018-01-21,26,18.677025,73.685394,"[(18.6187, 73.8037), (18.6187, 73.8037), (18.6..."
nanocmos,1,"Int64Index([21374, 21375, 21379, 21381, 21383,...",2018-01-14,2018-01-31,34,19.072800,72.882600,"[(19.0728, 72.8826), (19.0728, 72.8826), (19.0..."
nanocmos,2,"Int64Index([21611, 21612, 21619, 21624, 21627,...",2018-01-23,2018-02-03,33,35.988207,-78.904238,"[(35.9968, -78.8955), (35.9968, -78.8955), (36..."
nanocmos,3,"Int64Index([21969, 21971, 21973, 21974, 21975,...",2018-03-07,2018-03-16,7,37.521077,126.934190,"[(37.5112, 126.9741), (37.5112, 126.9741), (37..."
...,...,...,...,...,...,...,...,...
pcpbt,1,"Int64Index([75532, 75533, 75534, 75535, 75536,...",2018-03-12,2018-03-21,10,40.679552,-73.975732,"[(40.598, -73.9467), (40.7359, -73.9904), (40...."
uvspec,0,"Int64Index([50264, 50265, 50267, 50268, 50269,...",2018-03-24,2018-04-23,35,35.761107,-78.702163,"[(35.7633, -78.71), (35.7633, -78.71), (35.763..."
deformnanowire,0,"Int64Index([46757, 46761, 46765, 46768, 46769,...",2018-02-10,2018-02-19,6,39.965300,-83.023500,"[(39.9653, -83.0235), (39.9653, -83.0235), (39..."
deformnanowire,1,"Int64Index([46930, 46931, 46935, 46936, 46939,...",2018-02-24,2018-03-03,11,41.808000,-72.251000,"[(41.808, -72.251), (41.808, -72.251), (41.808..."


In [24]:
# Show the column dtypes of `meta`
# NOTE(review): the listing below has columns (user, tool, ip, ...) that differ from
# the `meta` frame built in the aggregation cell; it looks like output from an
# earlier run of this cell (In [24]) -- re-run to refresh.
meta.dtypes

user                         object
tool                         object
start                        object
end                          object
ip                           object
lon                         float16
lat                          object
cluster                      object
scanned_date                 object
user_meet_class_size_min     object
dtype: object