In [1]:
## Build researcher events, student events, and self-study events.
## Researchers are simply those users with nanoHUB (NH) citations.

In [7]:
import sys

import pandas as pd
import numpy as np
import time
import datetime
import os
from nanoHUB.application import Application
from nanoHUB.configuration import ClusteringConfiguration
from nanoHUB.pipeline.geddes.data import get_default_s3_client
from nanoHUB.dataaccess.lake import S3FileMapper

from nanoHUB.clustering.infra import get_mike_only_clustered_users, get_xufeng_only_clustered_users, get_mike_xufeng_clustered_users
from nanoHUB.clustering.infra import create_clusters_repository, add_cluster_info, get_all_clustered_users, get_all_self_study_users, get_all_self_identified_users

In [4]:
# Notebook-wide settings: remember the launch directory (cache and plot paths
# below are built from it) and let pandas show every column when displaying.
cwd = os.getcwd()
pd.set_option('display.max_columns', None)

In [5]:
# One engine per database, all created from the shared Application instance
# (created in the same order as before: nanohub, nanohub_metrics, rfm_data).
application = Application.get_instance()
nanohub_db, nanohub_metrics_db, wang159_myrmekes_db = (
    application.new_db_engine(db_name)
    for db_name in ('nanohub', 'nanohub_metrics', 'rfm_data')
)

# Reader over the bucket named by ClusteringConfiguration().bucket_name_processed;
# the derived per-user CSV is read through it in the cells below.
derived_data_file_path = 'derived_data_for_users.csv'
s3_client = get_default_s3_client(application)
processed_mapper = S3FileMapper(s3_client, ClusteringConfiguration().bucket_name_processed)

## Cumulative classroom/research/self-study

In [6]:
# All clustered users; expose `id` under the `nanoHUB_user_ID__c` column name
# and keep that column as the ID series used by later cells.
clustered_df = get_all_clustered_users(processed_mapper, derived_data_file_path)
clustered_df = clustered_df.assign(nanoHUB_user_ID__c=clustered_df['id'])
clustered_NH_ids = clustered_df['nanoHUB_user_ID__c']
display(clustered_NH_ids.shape[0])

78167

In [None]:
# Users returned by get_mike_only_clustered_users, with `id` mirrored into
# `nanoHUB_user_ID__c`.
mike_clustered_users_df = get_mike_only_clustered_users(processed_mapper, derived_data_file_path)
mike_clustered_users_df = mike_clustered_users_df.assign(
    nanoHUB_user_ID__c=mike_clustered_users_df['id']
)
mike_clustered_NH_ids = mike_clustered_users_df['nanoHUB_user_ID__c']
display(mike_clustered_users_df.shape[0])

In [None]:
# Users returned by get_xufeng_only_clustered_users, with `id` mirrored into
# `nanoHUB_user_ID__c`.
xufeng_clustered_users_df = get_xufeng_only_clustered_users(processed_mapper, derived_data_file_path)
xufeng_clustered_users_df = xufeng_clustered_users_df.assign(
    nanoHUB_user_ID__c=xufeng_clustered_users_df['id']
)
xufeng_clustered_NH_ids = xufeng_clustered_users_df['nanoHUB_user_ID__c']
display(xufeng_clustered_users_df.shape[0])

In [None]:
# Users returned by get_mike_xufeng_clustered_users, with `id` mirrored into
# `nanoHUB_user_ID__c`.
mike_xufeng_clustered_users_df = get_mike_xufeng_clustered_users(processed_mapper, derived_data_file_path)
mike_xufeng_clustered_users_df = mike_xufeng_clustered_users_df.assign(
    nanoHUB_user_ID__c=mike_xufeng_clustered_users_df['id']
)
mike_xufeng_clustered_NH_ids = mike_xufeng_clustered_users_df['nanoHUB_user_ID__c']
display(mike_xufeng_clustered_users_df.shape[0])

In [None]:
# raise

In [None]:
# Self-study users: keep both the ID series and a plain list of usernames
# (the username list is what the later filtering and SQL cells work with).
self_study_df = get_all_self_study_users(processed_mapper, derived_data_file_path)
self_study_df = self_study_df.assign(nanoHUB_user_ID__c=self_study_df['id'])
self_study_NH_ids = self_study_df['nanoHUB_user_ID__c']
self_study_usernames = self_study_df['username'].to_list()

for _message, _value in (
    ('self_study_NH_ids length = ', len(self_study_NH_ids)),
    ('self_study_usernames length = ', len(self_study_usernames)),
    ('self_study_usernames samples = ', self_study_usernames[0:10]),
):
    print(_message, _value)

In [None]:
# Self-identified users: ID series plus a plain list of usernames.
self_identified_df = get_all_self_identified_users(processed_mapper, derived_data_file_path)
self_identified_df = self_identified_df.assign(nanoHUB_user_ID__c=self_identified_df['id'])
self_identified_NH_ids = self_identified_df['nanoHUB_user_ID__c']
self_identified_usernames = self_identified_df['username'].to_list()

print('length of self_identified_NH_ids', self_identified_NH_ids.shape[0])
display(self_identified_df.head())



In [None]:
# Every row of the derived per-user file.
allusers_df = processed_mapper.read(derived_data_file_path)
display(allusers_df.head(5))

# Show the rows without a username, drop them, then show that none are left.
_username_missing = allusers_df['username'].isna()
display(allusers_df[_username_missing])
allusers_df = allusers_df[~_username_missing]
display(allusers_df[allusers_df['username'].isna()])

allusers_usernames = allusers_df['username'].to_list()


In [None]:
# Quick size report for each user group loaded above.
for _message, _size in (
    ('clustered users', clustered_NH_ids.shape),
    ('selfstudy users', self_study_NH_ids.shape),
    ('self_study_usernames', len(self_study_usernames)),
    ('self identified researchers', self_identified_NH_ids.shape),
    ('all users', allusers_df.shape),
):
    print(_message, _size)

In [None]:
# Stack the clustered and self-study frames row-wise; the original row labels
# of each frame are kept (no index reset).
frames = [clustered_df, self_study_df]
contacts_df = pd.concat(objs=frames, axis=0)

In [None]:
## need to filter self-study-nh-ids with the researchers

In [None]:
## This researcher determination is based entirely on the results of research authors on nanohub
## i.e., we use the researcher information on db2

In [None]:
# Pull the citation rows; only the `author` column is used further down.
sql_query = 'select id, uid, author from jos_citations'
jos_citations = pd.read_sql_query(sql=sql_query, con=nanohub_db)
display(jos_citations.head(2))
print(jos_citations.shape)

In [None]:
# Raw `author` strings, one per citation row, and the list that will receive
# the per-citation author fragments.
t1_authors = jos_citations['author'].tolist()
t1_authors2 = list()

In [None]:
# Split every citation's `author` string on ';' into individual author entries.
#
# The previous version wrapped the loop body in a bare `except:`. That hid two
# different situations: a non-string author value (no `.split`) and -- by
# accident -- an empty fragment such as the one produced by a trailing ';',
# where `l[0]` raised IndexError and the *whole* citation row was silently
# dropped. Non-string values are still skipped, but explicitly; rows with empty
# fragments are now kept.
#
# The list is rebuilt from scratch so re-running this cell does not append the
# same rows twice.
t1_authors2 = []
for raw_authors in t1_authors:
    if not isinstance(raw_authors, str):
        # e.g. a NULL author column; nothing to split
        continue
    fragments = raw_authors.split(';')
    if len(fragments) > 1:
        # Drop the single space that follows each ';' separator. Single-author
        # strings are left untouched, exactly as before.
        fragments = [
            fragment[1:] if fragment.startswith(' ') else fragment
            for fragment in fragments
        ]
    t1_authors2.append(fragments)

In [None]:
# Flatten the per-citation fragment lists, then de-duplicate the author entries.
t1_authors32 = []
for _fragments in t1_authors2:
    t1_authors32.extend(_fragments)
t1_authors3 = set(t1_authors32)
print(len(t1_authors3))

In [None]:
## Partition author entries by whether they carry an embedded '{{...}}' tag.
# Only the *positions* (in the set's iteration order) are stored here; later
# cells index list(t1_authors3) with them, so the set must not be modified
# in between.
rw_ids, need_search = [], []
for _position, _author in enumerate(t1_authors3):
    _bucket = rw_ids if '{{' in _author else need_search
    _bucket.append(_position)

In [None]:
## From the tagged author entries, extract the text between '{{' and '}}'
## (the nanoHUB user id; usernames are looked up from the database afterwards).
#
# Changes from the previous version:
#  * the closing '}}' is located explicitly instead of assuming it is the last
#    two characters, so trailing text/whitespace after the tag no longer ends
#    up inside the extracted id;
#  * the result is a list of plain `str` rather than a NumPy string array, so
#    building SQL from it does not depend on how NumPy prints its scalars.
_author_pool = list(t1_authors3)  # same iteration order that produced rw_ids
rw_ids2 = []
for _position in rw_ids:
    _tagged = _author_pool[_position]
    _begin = _tagged.index('{{') + 2
    _end = _tagged.find('}}', _begin)
    # Unclosed tag: fall back to the old "drop the last two characters" slice.
    _user_id = _tagged[_begin:_end] if _end != -1 else _tagged[_begin:-2]
    rw_ids2.append(_user_id.strip())
print(len(rw_ids2))

In [None]:
# Look up name/username for the user ids extracted from the citation tags.
#
# The IN list used to be produced with str(tuple(rw_ids2)), which is not valid
# SQL for an empty or single-element collection ("()" / "('1',)") and, for
# NumPy string scalars under NumPy >= 2, renders each value as np.str_('...').
# The ids are now validated as decimal integers and rendered explicitly.
# NOTE(review): assumes jos_users.id is an integer column -- confirm.
_rw_id_values = sorted({
    int(str(_value).strip())
    for _value in rw_ids2
    if str(_value).strip().isdecimal()
})
# "in (NULL)" is valid SQL and matches no rows, which is what an empty id list means.
_rw_id_list_sql = ", ".join(str(_value) for _value in _rw_id_values) or "NULL"
sql_query = "select id,name,username from jos_users where id in (" + _rw_id_list_sql + ")"
rw_researchers = pd.read_sql_query(sql_query,nanohub_db)
display(rw_researchers.head(2))

In [None]:
## Author entries without an explicit '{{id}}' tag: these have to be matched
## against jos_users by name.
#
# The previous code dropped "the first" element with ns2[1:]. The entries come
# from a set, so which element is first is an implementation detail; when the
# blank author string is not first, a real author was discarded and the blank
# one was searched for. Blank entries are now removed explicitly instead.
# NOTE(review): assumes the slice was meant to discard the empty author string -- confirm.
_author_pool = list(t1_authors3)  # same iteration order that produced need_search
ns2 = [
    _author_pool[_position]
    for _position in need_search
    if _author_pool[_position].strip()
]

In [None]:
# Peek at the first five names that will be searched for.
tuple(ns2[:5])

In [None]:
# Match untagged citation authors to jos_users rows by exact name.
#
# str(tuple(ns2)) is not a safe way to build the IN list: it yields invalid SQL
# for zero or one names, depends on Python/NumPy repr quoting, and does not
# escape quotes inside names. Each name is now rendered as a single-quoted SQL
# literal with quotes and backslashes escaped.
# NOTE(review): backslash escaping assumes MySQL-style string literals -- confirm.
_quoted_names = [
    "'" + str(_name).replace("\\", "\\\\").replace("'", "''") + "'"
    for _name in ns2
]
# "in (NULL)" is valid SQL and matches no rows (empty search list).
_name_list_sql = ", ".join(_quoted_names) or "NULL"
sql_query = "select id,name,username from jos_users where name in (" + _name_list_sql + ")"
ns2_researchers = pd.read_sql_query(sql_query,nanohub_db)
display(ns2_researchers.head(2))

In [None]:
## One list of researcher usernames: id-matched authors first, then name-matched.
researcher_usernames = [
    *rw_researchers['username'].to_list(),
    *ns2_researchers['username'].to_list(),
]
print('list of researchers who have cited nanohub  researcher_usernames=', len(researcher_usernames))

In [None]:
# Remove citation-based researchers from the self-identified list, so a user
# is counted in only one of the two groups.
print('self_identified_usernames can have overlap with researchers',len(self_identified_usernames))
print('self_identified_usernames patial list', self_identified_usernames[0:10])
print('researcher_usernames partial list',researcher_usernames[0:10])

si_usernames = self_identified_usernames
r_usernames = researcher_usernames

print(len(si_usernames))
# Membership is tested against a set: `x not in <list>` inside the
# comprehension rescanned the whole researcher list for every username.
# Order and duplicates of si_usernames are preserved.
_researcher_lookup = set(r_usernames)
si_usernames = [x for x in si_usernames if x not in _researcher_lookup]
print(len(si_usernames))
print(si_usernames[:10])

In [None]:
# From here on the self-identified cohort is the list with researchers removed.
self_identified_usernames = si_usernames

for _message, _value in (
    ('researcher_usernames', len(researcher_usernames)),
    ('self_identified_usernames reduced by overlap with researchers', len(self_identified_usernames)),
    ('self_identified_usernames some samples', self_identified_usernames[0:10]),
):
    print(_message, _value)

In [None]:
# Take the citation-based researchers and the self-identified users out of the
# self-study list, in that order, reporting the size after each step.
print('number of self-study users self_study_usernames',len(self_study_usernames))
print('take out len(researcher_usernames)',len(researcher_usernames))
print('take out len(self_identified_usernames)',len(self_identified_usernames))

# Sets make each membership test constant-time; with plain lists every
# self-study username was compared against every researcher / self-identified
# username. Order and duplicates of self_study_usernames are preserved.
_researcher_lookup = set(researcher_usernames)
self_study_usernames = [x for x in self_study_usernames if x not in _researcher_lookup]
print('number of self-study users self_study_usernames',len(self_study_usernames))

_self_identified_lookup = set(self_identified_usernames)
self_study_usernames = [x for x in self_study_usernames if x not in _self_identified_lookup]
print('number of self-study users self_study_usernames',len(self_study_usernames))

In [None]:
# researcher_ids = rw_researchers['id'].to_list() + ns2_researchers['id'].to_list()
# print('researchers/citation',len(researcher_ids))



# bool_search2 = np.in1d(np.array(self_study_NH_ids.tolist()),np.array(researcher_ids))


# # Note, researchers can possibly be doubled counted within clusters!!
# # sf_researchers = np.where(bool_search2)[0].tolist() 
# sf_self_study = np.where(~bool_search2)[0].tolist()
# print('self-study users',len(sf_self_study))

# bool_search2_2 = np.in1d(np.array(sf_self_study),np.array(self_identified_NH_ids.tolist()))
# sf_self_study = np.where(~bool_search2_2)[0].tolist()

# print('sf_self_study',len(sf_self_study))
# print('self_study_NH_ids',len(self_study_NH_ids.to_list()))
# print('self_identified_NH_ids',len(self_identified_NH_ids.to_list()))


# # self study user usernames
# ts_query = 'select id,username from jos_users where id in ' + str(tuple(self_study_NH_ids.dropna().to_list())) 
# #sf_contact_ids['nanoHUB_user_ID__c']))
# ss_NH_username = pd.read_sql_query(ts_query,nanohub_db)
# ss_NH_username_usernames = ss_NH_username['username'].tolist()

In [None]:
#restart analysis here 
#restart analysis here 
#restart analysis here 
#restart analysis here 




In [None]:
# Naive local timestamp of this run; its year/month bound the loops below.
today = datetime.datetime.now()
print(today)

In [None]:
## caching check
# cache_flag records whether a previously written breakdown CSV could be read.
# Only "cache missing / unreadable / malformed" counts as "no cache"; the old
# bare `except:` also swallowed unrelated failures (including interrupts).
_breakdown_cache_path = cwd+'/cached_data/nh_user_breakdown.csv'
try:
    user_breakdown_df = pd.read_csv(_breakdown_cache_path)
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as _cache_error:
    print('no usable cache:', _cache_error)
    cache_flag = False
else:
    print(user_breakdown_df)
    cache_flag = True

In [None]:
## Starting year
# The loops below begin at start_year_base + 1, so the first trailing
# 12-month window opens on January 1st of this year. (Deriving the start year
# from the cached breakdown was tried here and is currently disabled.)
start_year_base = 1999
    
    

In [None]:
#print(user_breakdown_df['year_month'].to_list()[-1][:4])

In [None]:
from copy import deepcopy

In [None]:
# First-of-month timestamps for every year from start_year_base+1 through
# next year -- one per entry that the toolstart loops append to each track
# list (they always run 12 months for each of those years).
#
# The list used to be cut to months 1..today.month of *every* year whenever a
# cached breakdown existed, which made it shorter than the tracks (and
# mislabelled the months) in any month other than December.
all_months_list = [
    datetime.datetime(this_year, this_month, 1)
    for this_year in range(start_year_base+1, today.year+2)
    for this_month in range(1, 12+1)
]

# Persist the month axis twice: a fixed name and one prefixed with the base year.
cache_all_month_list = pd.DataFrame({'Date': all_months_list})
cache_all_month_list.to_csv(cwd+'/cached_data/all_months_list.csv')
cache_all_month_list.to_csv(cwd+'/cached_data/'+ str(start_year_base)+'_all_months_list.csv')
cache_all_month_list = []
print('allmonths partial list',all_months_list[0:36])


In [None]:
# Eyeball the first 50 usernames of each list that feeds the queries below.
for _usernames in (self_identified_usernames, self_study_usernames, allusers_usernames):
    print(_usernames[0:50])


In [None]:
# Monthly trailing-12-month counts of distinct `toolstart` users for four
# groups: citation-based researchers, self-identified users, self-study users
# and everyone. For each year start_year_base+1 .. next year and each month m,
# the window is [ (year-1)-m-01 , year-m-01 ] (both bounds inclusive, as
# before) and one count per group is appended to the matching *_track list.
#
# Changes from the previous version (the emitted SQL is unchanged for normal,
# multi-element username lists):
#  * the large "user in (...)" lists are rendered once instead of for every
#    month and group;
#  * each query result is de-duplicated once, not once for the track and again
#    for the progress print;
#  * the four copy-pasted query/cache stanzas are driven by one table;
#  * empty or single-element username lists no longer produce invalid SQL
#    ("()" / "('x',)"), and non-string entries (e.g. NaN) are skipped instead of
#    being written into the statement unquoted.
# NOTE(review): backslash escaping assumes MySQL-style string literals -- confirm.
self_identified_track = []
researcher_track = []
self2_track = []
simtotal_track = []

# (track list, cache-file stem, usernames or None for "all users", extra predicate)
_cohorts = [
    (researcher_track, 'researcher_track', researcher_usernames, ""),
    (self_identified_track, 'self_identified_track', self_identified_usernames, ""),
    (self2_track, 'self2_track', self_study_usernames,
     "user != 'instanton' and user != 'gridstat' and "),
    (simtotal_track, 'simtotal_track', None, ""),
]

# Everything of each statement except the date window, built once.
_cohort_queries = []
for _track, _stem, _usernames, _extra_predicate in _cohorts:
    if _usernames is None:
        _user_predicate = ""
    else:
        _quoted = [
            "'" + _name.replace("\\", "\\\\").replace("'", "''") + "'"
            for _name in _usernames
            if isinstance(_name, str)
        ]
        # "in (NULL)" is valid SQL and matches no rows (empty cohort).
        _user_predicate = "user in (" + (", ".join(_quoted) or "NULL") + ") and "
    _cohort_queries.append(
        (_track, _stem, "select user from toolstart where " + _user_predicate + _extra_predicate)
    )

## computer has memory limits, so toolstart is queried one monthly window at a time
for start_year in range(start_year_base + 1, today.year + 2):
    print('start year: '+ str(start_year))

    for _month in range(1, 13):
        start_date = "'%d-%02d-01'" % (start_year - 1, _month)
        end_date = "'%d-%02d-01'" % (start_year, _month)
        _window = "datetime <= " + end_date + " and datetime >= " + start_date

        _month_counts = {}
        for _track, _stem, _query_prefix in _cohort_queries:
            _users = pd.read_sql_query(_query_prefix + _window, nanohub_metrics_db)
            _month_counts[_stem] = _users.drop_duplicates().shape[0]
            _track.append(_month_counts[_stem])

        print('date'+str(start_date)+ 'total =', _month_counts['simtotal_track'],
              'self2_users =', _month_counts['self2_track'],
              'researcher_users =', _month_counts['researcher_track'],
              'self_identified_users =', _month_counts['self_identified_track'])

    print('start year: '+ str(start_year))

    # Checkpoint after every year: the running track under its fixed name and
    # under a name prefixed with the year just completed.
    for _track, _stem, _ in _cohort_queries:
        _checkpoint = pd.DataFrame({'track': _track})
        _checkpoint.to_csv(cwd+'/cached_data/'+_stem+'.csv')
        _checkpoint.to_csv(cwd+'/cached_data/'+ str(start_year)+'_'+_stem+'.csv')
    
    

In [None]:
## filter the self study folks against those of researchers
# need to pull the researcher sf ids

In [None]:
# `sf_self_study` (the index this cell used to select with) is only assigned
# in commented-out code, so on a fresh kernel this cell raised NameError.
# Select the IDs of the self-study users that survived the username-based
# filtering instead.
# NOTE(review): assumes the intent is "self-study users minus researchers and
# self-identified users" -- confirm against the original ID-based logic.
sf_contact_ids = self_study_df.loc[
    self_study_df['username'].isin(self_study_usernames), 'nanoHUB_user_ID__c'
]

In [None]:
# Renumber the rows from 0: reset_index() moves the old labels into an
# 'index' column, which is then discarded.
sf_contact_ids = sf_contact_ids.reset_index().drop(columns='index')
display(sf_contact_ids)

In [None]:
# First five contact IDs.
tuple(sf_contact_ids['nanoHUB_user_ID__c'])[0:5]

In [None]:
# First five non-null self-study IDs.
tuple(self_study_NH_ids.dropna().to_list()[:5])

In [None]:
## loading in toolstart

In [None]:
# Resolve each clustered cohort's nanoHUB IDs to (id, username) rows in
# jos_users and show how many rows came back for each.
#
# The four lookups were copy-pasted and built their IN list with
# str(tuple(ids)), which is invalid SQL for zero or one id and writes a bare
# `nan` into the statement when an id is missing. IDs are now cleaned (nulls
# dropped, cast to integer, de-duplicated) and rendered explicitly.
# NOTE(review): assumes the `id` values are integral -- confirm.
_cohort_frames = []
for _nh_ids in (
    clustered_NH_ids,
    mike_clustered_NH_ids,
    xufeng_clustered_NH_ids,
    mike_xufeng_clustered_NH_ids,
):
    _clean_ids = pd.Series(_nh_ids).dropna().astype('int64').unique()
    # "in (NULL)" is valid SQL and matches no rows (empty cohort).
    _id_list_sql = ', '.join(str(_value) for _value in _clean_ids) or 'NULL'
    tc_query = 'select id,username from jos_users where id in (' + _id_list_sql + ')'
    _frame = pd.read_sql_query(tc_query,nanohub_db)
    display(len(_frame))
    _cohort_frames.append(_frame)

(clustered_NH_username,
 mike_clustered_NH_username,
 xufeng_clustered_NH_username,
 mike_xufeng_clustered_NH_username) = _cohort_frames



In [None]:
## self study user usernames
#ts_query = 'select id,username from jos_users where id in ' + str(tuple(self_study_NH_ids.dropna().to_list())) 
##sf_contact_ids['nanoHUB_user_ID__c']))
#ss_NH_username = pd.read_sql_query(ts_query,nanohub_db)

In [None]:
## cache the researchers, self-study, and clustered users
# `researcher_ids` and `ss_NH_username` are only assigned in commented-out
# cells, so on a fresh kernel this cell raised NameError. They are rebuilt here:
#  * researcher_ids -- same expression as the commented-out definition;
#  * ss_NH_username -- id/username pairs taken from self_study_df.
# NOTE(review): the commented-out code fetched the self-study usernames from
# jos_users by id; using the pairs already present in self_study_df assumes
# both sources agree -- confirm.
researcher_ids = rw_researchers['id'].to_list() + ns2_researchers['id'].to_list()
ss_NH_username = (
    self_study_df[['id', 'username']]
    .dropna(subset=['id'])
    .reset_index(drop=True)
)

# ID-only caches.
cache_r_nhid = pd.DataFrame({'nhid': researcher_ids})
cache_si_nhid = pd.DataFrame({'nhid': self_identified_NH_ids.to_list()})

# ID + username caches: same rows as the source frame, with `id` stored as `nhid`.
cache_ss_nhid = ss_NH_username[['id', 'username']].rename(columns={'id': 'nhid'})
cache_c_nhid = clustered_NH_username[['id', 'username']].rename(columns={'id': 'nhid'})
cache_cm_nhid = mike_clustered_NH_username[['id', 'username']].rename(columns={'id': 'nhid'})
cache_cx_nhid = xufeng_clustered_NH_username[['id', 'username']].rename(columns={'id': 'nhid'})
cache_cmx_nhid = mike_xufeng_clustered_NH_username[['id', 'username']].rename(columns={'id': 'nhid'})

In [None]:
# Two-row preview of every cache frame, in the order they will be written.
for _cache_frame in (
    cache_r_nhid,
    cache_si_nhid,
    cache_ss_nhid,
    cache_c_nhid,
    cache_cm_nhid,
    cache_cx_nhid,
    cache_cmx_nhid,
):
    print(_cache_frame.head(2))

In [None]:
# Write each cache frame to its CSV under <cwd>/cached_data/.
for _file_name, _cache_frame in (
    ('researcher_ids.csv', cache_r_nhid),
    ('self_identified_ids.csv', cache_si_nhid),
    ('ss_ids.csv', cache_ss_nhid),
    ('class_ids.csv', cache_c_nhid),
    ('class_m_ids.csv', cache_cm_nhid),
    ('class_x_ids.csv', cache_cx_nhid),
    ('class_mx_ids.csv', cache_cmx_nhid),
):
    _cache_frame.to_csv(cwd+'/cached_data/'+_file_name)

In [None]:
# clustered_NH_username2;researcher_NH_username2;self_NH_username2
# One independent, initially empty list per series filled by the loop below.
clustered_track, clustered_m_track, clustered_x_track, clustered_mx_track, self_track = (
    [], [], [], [], []
)

# clustered_track_per = []
# self_track_per = []
# researcher_track_per = []

In [None]:
# Size of the self-study cohort queried in the next cell. This cell used to
# show ss_NH_username.shape, a name that is only assigned in commented-out
# code and therefore undefined on a fresh kernel.
len(self_study_usernames)

In [None]:
## computer has memory limits, so toolstart is queried one monthly window at a time
# Monthly trailing-12-month counts of distinct `toolstart` users for the four
# clustered cohorts and the self-study cohort. For each year
# start_year_base+1 .. next year and each month m the window is
# [ (year-1)-m-01 , year-m-01 ] (both bounds inclusive, as before).
#
# Changes from the previous version (the emitted SQL is unchanged for normal,
# multi-element username lists):
#  * the "user in (...)" lists are rendered once, not for every month;
#  * the five copy-pasted query/cache stanzas are driven by one table;
#  * empty or single-element cohorts no longer produce invalid SQL, and
#    non-string usernames (e.g. None/NaN) are skipped rather than written into
#    the statement unquoted;
#  * the track lists are emptied first, so re-running the cell does not append
#    a second copy of every month.
# NOTE(review): backslash escaping assumes MySQL-style string literals -- confirm.

# (track list, cache-file stem, usernames, extra predicate)
_cohorts = [
    (clustered_track, 'clustered_track', clustered_NH_username['username'], ""),
    (clustered_m_track, 'clustered_m_track', mike_clustered_NH_username['username'], ""),
    (clustered_x_track, 'clustered_x_track', xufeng_clustered_NH_username['username'], ""),
    (clustered_mx_track, 'clustered_mx_track', mike_xufeng_clustered_NH_username['username'], ""),
    (self_track, 'self_track', self_study_usernames,
     " and user != 'instanton' and user != 'gridstat'"),
]

# Everything of each statement except the date window, built once.
_cohort_queries = []
for _track, _stem, _usernames, _extra_predicate in _cohorts:
    _track.clear()
    _quoted = [
        "'" + _name.replace("\\", "\\\\").replace("'", "''") + "'"
        for _name in _usernames
        if isinstance(_name, str)
    ]
    # "in (NULL)" is valid SQL and matches no rows (empty cohort).
    _in_list_sql = "(" + (", ".join(_quoted) or "NULL") + ")"
    _cohort_queries.append(
        (_track, _stem, "select user from toolstart where user in " + _in_list_sql + _extra_predicate)
    )

for start_year in range(start_year_base + 1, today.year + 2):
    for _month in range(1, 13):
        start_date = "'%d-%02d-01'" % (start_year - 1, _month)
        end_date = "'%d-%02d-01'" % (start_year, _month)
        _window = " and datetime <= " + end_date + " and datetime >= " + start_date

        for _track, _stem, _query_prefix in _cohort_queries:
            _users = pd.read_sql_query(_query_prefix + _window, nanohub_metrics_db)
            _track.append(_users.drop_duplicates().shape[0])

    print('start year: '+str(start_year))

    # Checkpoint every running track after each completed year.
    for _track, _stem, _ in _cohort_queries:
        pd.DataFrame({'track': _track}).to_csv(cwd+'/cached_data/'+_stem+'.csv')

    

In [None]:
# all_months_list = list()
# for this_year in range(start_year_base+1,today.year+2):
#     if cache_flag == True:
#         for this_month in range(1,today.month+1):
#             all_months_list.append(datetime.datetime(this_year, this_month, 1))
#     else:
#         for this_month in range(1,12+1):
#             all_months_list.append(datetime.datetime(this_year, this_month, 1))

In [None]:
# Lengths of the series that are about to be combined into one frame.
for _series in (
    clustered_track,
    clustered_m_track,
    clustered_x_track,
    clustered_mx_track,
    all_months_list,
    researcher_track,
):
    print(len(_series))

In [None]:
#print(user_breakdown_df2.head())

In [None]:

# Assemble the month-by-month breakdown from the freshly computed tracks.
#
# The previous version had three branches:
#  * with a cached CSV present it appended the new frame to the cached one --
#    but the tracks are always recomputed from start_year_base, so this
#    duplicated every month (and mixed in the cached file's extra columns);
#  * the two no-cache branches (December / other months) were identical.
# The frame is now always built from the tracks alone.
_track_columns = {
    'clustered_track': clustered_track,
    'clustered_m_track': clustered_m_track,
    'clustered_x_track': clustered_x_track,
    'clustered_mx_track': clustered_mx_track,
    'self_identified_track': self_identified_track,
    'self_track': self_track,
    'researcher_track': researcher_track,
}

# Fail with a readable message if any series disagrees with the month axis.
_bad_lengths = {
    _name: len(_values)
    for _name, _values in _track_columns.items()
    if len(_values) != len(all_months_list)
}
if _bad_lengths:
    raise ValueError(
        'track lengths do not match the %d entries of all_months_list: %r'
        % (len(all_months_list), _bad_lengths)
    )

user_breakdown_df = pd.DataFrame({'year_month': all_months_list, **_track_columns})
display(user_breakdown_df.head(2))

In [None]:
# user_breakdown_df.to_csv(cwd+'/cached_data/temp.csv')

In [None]:
import re

In [None]:
## Read dev_flags.txt to decide where plots are saved.
# Every "= <word>" occurrence in the file is collected in order; the first is
# taken as the dev flag and the second as the prod flag (compared against the
# string 'True' further down).
with open(cwd+'/dev_flags.txt','r') as f:
    raw_flags = f.read()
flag_pattern = re.compile(r'= \w+')
flags1 = flag_pattern.findall(raw_flags)
flags_result = [_match[len('= '):] for _match in flags1]
dev_flag = flags_result[0]
prod_flag = flags_result[1]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Seaborn defaults first, then the white-grid style on top.
sns.set()
sns.set_style("whitegrid")

plt.figure(figsize=(9,6))

# Trim the trailing rows: 12 plus the number of months left in the current year.
_rows_to_trim = 12 + (12 - today.month)
user_breakdown_df2 = user_breakdown_df.iloc[:-_rows_to_trim, :]

# Stack order, legend labels and colours, bottom to top.
_stack_columns = ['clustered_m_track', 'clustered_mx_track', 'clustered_x_track',
                  'researcher_track', 'self_identified_track', 'self_track']
_stack_labels = ['Classroom M', 'Classroom MX', 'Classroom X', 'Research', 'Researcher', 'Unclassified']
_stack_colors = ['lawngreen', 'green', 'yellowgreen', 'red', 'fuchsia', 'orange']

plt.stackplot(
    user_breakdown_df2.year_month,
    user_breakdown_df2[_stack_columns].to_numpy().T,
    labels=_stack_labels,
    colors=_stack_colors,
);

plt.legend(loc='upper left')
plt.ylabel('12-month Trailing Total')
# NOTE(review): the right-hand limit is a fixed date (2021-07-01), not derived from `today`.
plt.xlim([datetime.datetime(2001,1,1), datetime.datetime(2021,7,1)])

# Save as eps, pdf and png into the folder selected by the flags (dev wins over prod).
if dev_flag == 'True':
    _plot_dir = cwd+'/plots_local/'
elif prod_flag == 'True':
    _plot_dir = cwd+'/plots_production/'
else:
    _plot_dir = None
    print('no printing flag is active')

if _plot_dir is not None:
    for _extension in ('.eps', '.pdf', '.png'):
        plt.savefig(_plot_dir+'300_simusers_class_res_unclass_12months_GK'+_extension,
                    dpi=300, bbox_inches='tight')

In [None]:
# Share of each group in the per-month total of the four groups
# (clustered + researcher + self-study + self-identified).
total = (
    user_breakdown_df2['clustered_track'].to_numpy()
    + user_breakdown_df2['researcher_track'].to_numpy()
    + user_breakdown_df2['self_track'].to_numpy()
    + user_breakdown_df2['self_identified_track'].to_numpy()
)
display(len(total))

# user_breakdown_df2 is an iloc slice of user_breakdown_df; adding columns to
# it with `df[col] = ...` is chained assignment (SettingWithCopyWarning, and
# ambiguous as to which frame is modified). assign() builds a new frame
# instead, which also makes this cell safe to re-run.
# Months whose total is 0 yield NaN percentages.
user_breakdown_df2 = user_breakdown_df2.assign(
    clustered_track_per=100*np.divide(user_breakdown_df2['clustered_track'], total),
    self_identified_track_per=100*np.divide(user_breakdown_df2['self_identified_track'], total),
    researcher_track_per=100*np.divide(user_breakdown_df2['researcher_track'], total),
    self_track_per=100*np.divide(user_breakdown_df2['self_track'], total),
)

In [None]:
plt.figure(figsize=(9,6))

# One line per group: (percentage column, legend label, colour).
for _column, _label, _color in (
    ('clustered_track_per', 'classroom', 'green'),
    ('researcher_track_per', 'research', 'red'),
    ('self_identified_track_per', 'self-identified', 'magenta'),
    ('self_track_per', 'self-study', 'orange'),
):
    plt.plot(user_breakdown_df2.year_month, user_breakdown_df2[_column],
             label=_label, color=_color);

plt.ylim([0, 100])
plt.legend(loc='upper right');
plt.ylabel('Percentage (%)');
# NOTE(review): the right-hand limit is a fixed date (2021-07-01), not derived from `today`.
plt.xlim([datetime.datetime(2001,1,1), datetime.datetime(2021,7,1)])

# Save as eps, pdf and png into the folder selected by the flags (dev wins over prod).
if dev_flag == 'True':
    _plot_dir = cwd+'/plots_local/'
elif prod_flag == 'True':
    _plot_dir = cwd+'/plots_production/'
else:
    _plot_dir = None
    print('no printing flag is active')

if _plot_dir is not None:
    for _extension in ('.eps', '.pdf', '.png'):
        plt.savefig(_plot_dir+'300_simusers_class_res_unclass_12months_percent_copied'+_extension,
                    dpi=300, bbox_inches='tight')

In [None]:
# Persist the trimmed breakdown (with the percentage columns) at the path the
# caching check at the top of the analysis reads from.
_breakdown_cache_path = cwd+'/cached_data/nh_user_breakdown.csv'
user_breakdown_df2.to_csv(_breakdown_cache_path)