# EDA: Synchrony detection for classroom activities

File: 'core_classroom_analysis.py'

Checkpoint: 1

## Load data

In [1]:
# Checkpoint pickle that is read into `inparams` when the data is loaded.
pkl_filepath = (
    '/home/wang159/nanoHUB/projects/online_users_ts_analysis/temp/'
    'core_classroom_analysis_cp1.pkl'
)

In [2]:
from pprint import pprint
import pandas as pd
import numpy as np
import logging
import datetime

from dask import dataframe as dd
from dask.multiprocessing import get
from dask.diagnostics import ProgressBar
pbar = ProgressBar()
pbar.register()

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances

In [3]:
# Load the checkpoint: run parameters first, then the intermediate tables.
import pickle

with open(pkl_filepath, 'rb') as checkpoint_file:
    inparams = pickle.load(checkpoint_file)

# The pickled tables are read in the listed order and bound to the
# names on the left, one path per name.
(
    class_cluster_candidate,
    user_activity_blocks_df,
    jos_users,
    toolrun_df,
    cluster_output_candidate,
) = [
    pd.read_pickle(checkpoint_table_path)
    for checkpoint_table_path in (
        '/home/wang159/nanoHUB/projects/online_users_ts_analysis/temp/cp1_class_cluster_candidate.pkl',
        '/home/wang159/nanoHUB/projects/online_users_ts_analysis/temp/cp1_user_activity_blocks_df.pkl',
        '/home/wang159/nanoHUB/projects/online_users_ts_analysis/temp/cp1_jos_users.pkl',
        '/home/wang159/nanoHUB/projects/online_users_ts_analysis/temp/cp1_toolrun_df.pkl',
        '/home/wang159/nanoHUB/projects/online_users_ts_analysis/temp/cp1_cluster_output_candidate.pkl',
    )
]


## Geospatial clustering

In [4]:
# great circle distance
def haversine_metric(x, y):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    x, y : sequence of two numbers
        The two points, each given as ``(longitude, latitude)`` in
        decimal degrees.

    Returns
    -------
    float
        Great-circle distance between ``x`` and ``y`` in kilometers.
    """
    # Mean radius of the earth in kilometers (use 3956 for miles).
    earth_radius_km = 6371

    # convert decimal degrees to radians
    lon1, lat1 = np.radians(x[0]), np.radians(x[1])
    lon2, lat2 = np.radians(y[0]), np.radians(y[1])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make arcsin return NaN and poison
    # the whole distance matrix; clamp it before taking the square root.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return c * earth_radius_km

def haversine_affinity(X):
    """
    Return the pairwise distance matrix of the rows of ``X``, with every
    pair of rows compared by ``haversine_metric``.
    """
    distance_matrix = pairwise_distances(X, metric=haversine_metric)
    return distance_matrix

In [5]:
def geospatial_cluster(cluster_input, cluster_size_cutoff, class_distance_threshold):
    """
    Given individual user's activity blocks for all users, all days, use geospatial clustering
    to form clusters with a certain intra-cluster distance limit.

    For every calendar day between the earliest ``start`` and the latest ``end``, and for
    every tool that has activity blocks overlapping that day, the overlapping blocks are
    clustered by location (average-linkage agglomerative clustering on the haversine
    distance), provided there are at least ``cluster_size_cutoff`` of them.

    Parameters
    ----------
    cluster_input : pandas.DataFrame
        Activity blocks. The columns ``start``, ``end``, ``tool``, ``lon`` and ``lat``
        are read. The frame itself is left untouched; all work happens on a copy.
    cluster_size_cutoff : int
        Minimum number of blocks a (day, tool) pair must have to be clustered.
    class_distance_threshold : float
        Forwarded as ``distance_threshold`` to ``AgglomerativeClustering``; expressed in
        the unit returned by ``haversine_metric``.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per clustered block per scanned day. The columns are those
        of ``cluster_input`` with ``cluster`` and ``scanned_date`` filled in (appended at
        the end when the input does not already have them). A block that overlaps several
        days appears once for each day on which it was clustered. The array has zero rows
        when nothing was clustered.
    """
    # Work on a copy so the caller's frame (or a groupby slice of it) is not mutated.
    cluster_input = cluster_input.copy()
    cluster_input['cluster'] = None
    cluster_input['scanned_date'] = datetime.datetime(1900, 1, 1)

    date_earliest = cluster_input.start.min()
    date_latest = cluster_input.end.max()
    logging.info('Date range: %s - %s', date_earliest, date_latest)

    # Zero-row seed: fixes the column count and is what gets returned when no
    # (day, tool) pair is large enough to be clustered.
    clustered_chunks = [np.empty((0, len(cluster_input.columns)))]

    # Empty input (or all-missing dates) has no date range to scan.
    if pd.isna(date_earliest) or pd.isna(date_latest):
        return clustered_chunks[0]

    n_days = int((date_latest - date_earliest).days + 1)
    for day_offset in range(n_days):
        # for each date spanned by cluster_input
        this_date = date_earliest + datetime.timedelta(days=day_offset)

        # activity blocks whose [start, end] interval covers this date
        this_date_all_blocks = cluster_input[
            (cluster_input.start <= this_date) & (cluster_input.end >= this_date)
        ]

        for this_tool in this_date_all_blocks.tool.unique():
            this_date_cluster_input = this_date_all_blocks[this_date_all_blocks.tool == this_tool]

            # if sample too small, skip
            if len(this_date_cluster_input.index) < cluster_size_cutoff:
                continue

            # number of users for this tool is large enough, globally, to warrant a geospatial clustering
            # NOTE: scikit-learn 1.2 renamed the `affinity` argument to `metric` and
            # 1.4 dropped the old name; switch the keyword when upgrading.
            this_clustering = AgglomerativeClustering(
                affinity=haversine_affinity,
                linkage='average',
                n_clusters=None,
                distance_threshold=class_distance_threshold,
            ).fit(this_date_cluster_input[['lon', 'lat']].values)

            block_index = this_date_cluster_input.index
            cluster_input.loc[block_index, 'cluster'] = this_clustering.labels_
            cluster_input.loc[block_index, 'scanned_date'] = this_date

            # snapshot the freshly labelled rows for this (date, tool) pair
            clustered_chunks.append(cluster_input.loc[block_index].to_numpy())

    # Concatenate once at the end instead of re-allocating the array on every append.
    return np.concatenate(clustered_chunks, axis=0)


In [6]:
# Display the activity-block table that is passed to the clustering step.
user_activity_blocks_df

Unnamed: 0,user,tool,start,end,ip,lon,lat,cluster,scanned_date
0,a.belapurkar,sugar,2018-02-12,2018-02-16,27.251.227.210,77.0000,20.0000,0,2018-02-16
1,abhijaypandey,sugar,2018-01-16,2018-01-20,115.111.136.245,72.9635,19.1970,0,2018-01-20
2,abrana,crystal_viewer,2018-04-24,2018-04-28,45.116.232.0,67.0817,24.9043,0,2018-04-28
3,abrana,crystal_viewer,2018-04-17,2018-04-22,45.116.232.14,67.0817,24.9043,0,2018-04-22
4,adpatel8,dda,2018-02-24,2018-03-01,128.174.44.68,-88.2062,40.1047,0,2018-03-01
...,...,...,...,...,...,...,...,...,...
23085,tejan,sugarchevron,2018-01-21,2018-01-25,115.111.136.245,72.9635,19.1970,0,2018-01-25
23086,tony.russo,photonicsdb,2018-02-17,2018-02-21,129.59.122.11,-86.7920,36.1515,0,2018-02-21
23087,vitany,nsoptics,2018-01-17,2018-01-21,150.135.165.59,-110.9488,32.2346,2,2018-01-21
23088,xiaomengyi,comphydrogennb,2018-01-13,2018-01-18,109.123.82.198,-0.1224,51.4964,1,2018-01-14


In [7]:
# Run the clustering per tool in parallel: the frame is split into 200 dask
# partitions, grouped by tool, and geospatial_cluster is applied to each group.
# (Serial alternative: call geospatial_cluster directly on the whole frame with
# inparams[0].class_size_min and inparams[0].class_distance_threshold; that
# returns a single array instead of one array per tool.)
ddata = (
    dd.from_pandas(user_activity_blocks_df, npartitions=200)
    .groupby('tool')
    .apply(
        geospatial_cluster,
        cluster_size_cutoff=inparams[0].class_size_min,
        class_distance_threshold=inparams[0].class_distance_threshold,
        # Each group yields one NumPy array, so the result is an object Series.
        # Declaring it stops dask from inferring the output by running the
        # function on dummy data (and silences the "provide meta" warning).
        meta=(None, 'object'),
    )
    .compute(scheduler=inparams[0].dask_scheduler)
)

  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  


[########################################] | 100% Completed | 17.6s


In [8]:
# One element per tool group, each element being that group's result array.
ddata_np = ddata.to_numpy()

# Stack the per-tool arrays row-wise into a single table-shaped array.
final_np = np.concatenate(
    [np.atleast_2d(tool_rows) for tool_rows in ddata_np],
    axis=0,
)

In [9]:
# Show the stacked clustering rows as a table, labelled with the column names
# of the input activity-block frame.
pd.DataFrame(final_np, columns=user_activity_blocks_df.columns)

Unnamed: 0,user,tool,start,end,ip,lon,lat,cluster,scanned_date
0,raidiouf,mif,2018-04-08,2018-04-12,131.128.51.82,-71.5292,41.4803,4,2018-04-11
1,juan.escrig,mif,2018-04-10,2018-04-14,158.170.132.204,-70.6653,-33.4513,5,2018-04-11
2,sagaraelch,mif,2018-04-07,2018-04-11,138.75.214.232,103.849,1.2929,2,2018-04-11
3,kcii,mif,2018-04-11,2018-04-16,167.99.173.13,-121.975,37.3417,3,2018-04-11
4,ashlesha,mif,2018-04-11,2018-04-15,103.21.127.81,72.8826,19.0728,1,2018-04-11
...,...,...,...,...,...,...,...,...,...
92582,huisun,stretchfcc,2018-03-07,2018-03-11,18.111.26.31,-97.822,37.751,2,2018-03-10
92583,walaaj3,stretchfcc,2018-03-06,2018-03-10,193.54.246.77,-4.4839,48.3877,3,2018-03-10
92584,laris,stretchfcc,2018-03-07,2018-03-11,18.189.58.240,-83.0235,39.9653,0,2018-03-10
92585,mkitcher,stretchfcc,2018-03-07,2018-03-11,18.189.23.176,-83.0235,39.9653,0,2018-03-10
