In [1]:
## build researcher events, student events, and self-study events
## researcher are simply those with NH citations

In [2]:
import sys

import pandas as pd
import time
import datetime

In [3]:
import warnings
warnings.filterwarnings('ignore')
# warnings.filterwarnings(action='once')
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
from nanoHUB.application import Application

application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')
nanohub_metrics_db = application.new_db_engine('nanohub_metrics')
wang159_myrmekes_db = application.new_db_engine('rfm_data')

salesforce = application.new_salesforce_engine()
db_s = salesforce

[1mnanoHUB - Serving Students, Researchers & Instructors[0m
Obtained Salesforce access token ...... True


## Cumulative classroom/research/self-study

In [5]:
###
# 1. pull all unique classroom users from SF
# 2. pull all research users from DB2
# 3. pull all contacts from SF
# 4. iteratively pull toolstart df again by year starting from 2000

In [6]:
## pull the list of contacts within each cluster
contacts_in_cluster_df = db_s.query_data('Select Contact__c, Tool_Usage_Cluster__c from ContactToolClusterAssociation__c')

[Success] Bulk job creation successful. Job ID = 7505w00000cyXFBAA2
{"id":"7505w00000cyXFBAA2","operation":"query","object":"ContactToolClusterAssociation__c","createdById":"0055w00000DM5bOAAT","createdDate":"2022-01-31T04:15:52.000+0000","systemModstamp":"2022-01-31T04:15:52.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","retries":0,"totalProcessingTime":0}
{"id":"7505w00000cyXFBAA2","operation":"query","object":"ContactToolClusterAssociation__c","createdById":"0055w00000DM5bOAAT","createdDate":"2022-01-31T04:15:52.000+0000","systemModstamp":"2022-01-31T04:15:53.000+0000","state":"JobComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":1900,"retries":0,"totalProcessingTime":270}
[Success] Bulk job completed successfully.


In [7]:
display(contacts_in_cluster_df.head(2))
print(contacts_in_cluster_df.shape)

Unnamed: 0,Contact__c,Tool_Usage_Cluster__c
0,0035w000034JLEEAA4,a0w5w00000A8OFoAAN
1,0035w000034K074AAC,a0w5w000009Q7m8AAC


(1900, 2)


In [8]:
import os
cwd = os.getcwd()
print(cwd)

/home/saxenap/nanoHUB/nanoHUB/pipeline/plotting


In [9]:
print(contacts_in_cluster_df.shape[0])

1900


In [10]:
#if contacts_in_cluster_df.shape[0] > 9000:
contacts_in_cluster_df.to_csv(cwd+"/cached_data/backup_contacts_in_clusters.csv")
#else:
#    raise ValueError("salesforce connection issue with clusters, please reload or setup some limits")

In [11]:
unique_contacts_in_clusters = set(contacts_in_cluster_df['Contact__c'].to_list())
print(len(unique_contacts_in_clusters))

1680


In [12]:
contacts_df = db_s.query_data('Select Id, nanoHUB_user_ID__c from Contact')

[Success] Bulk job creation successful. Job ID = 7505w00000cyXFGAA2
{"id":"7505w00000cyXFGAA2","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2022-01-31T04:16:04.000+0000","systemModstamp":"2022-01-31T04:16:04.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","retries":0,"totalProcessingTime":0}
{"id":"7505w00000cyXFGAA2","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2022-01-31T04:16:04.000+0000","systemModstamp":"2022-01-31T04:16:04.000+0000","state":"InProgress","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":19815,"retries":0,"totalProcessingTime":1006}
{"id":"7505w00000cyXFGAA2","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2022-01-31T0

In [13]:
display(contacts_df.head(2))
print(contacts_df.shape)

Unnamed: 0,Id,nanoHUB_user_ID__c
0,0035w000031Vsp1AAC,998.0
1,0035w000031Vsp2AAC,1683.0


(252095, 2)


In [14]:
## determine self-study users, i.e., unclassified users
# if sf id is in contacts_in_cluster_df['Contact__c'], then it is self-study

self_study = []
clustered = []

sf_contact_ids = contacts_df['Id'].to_list()

import numpy as np
# bool_search = np.in1d(np.array(list(unique_contacts_in_clusters)),np.array(sf_contact_ids))
bool_search = np.in1d(np.array(sf_contact_ids),np.array(list(unique_contacts_in_clusters)))

clustered = np.where(bool_search)[0].tolist()
self_study = np.where(~bool_search)[0].tolist()

In [15]:
clustered_NH_ids = contacts_df['nanoHUB_user_ID__c'][clustered]
self_study_NH_ids = contacts_df['nanoHUB_user_ID__c'][self_study]

In [16]:
print(clustered_NH_ids.shape)
print(self_study_NH_ids.shape)

(1680,)
(250415,)


In [17]:
## need to filter self-study-nh-ids with the researchers

In [18]:
## This researcher determination is based entirely on the results of research authors on nanohub
## i.e., we use the researcher information on db2

In [19]:
sql_query = 'select id, uid, author from jos_citations'
jos_citations = pd.read_sql_query(sql_query, nanohub_db)
display(jos_citations.head(2))
print(jos_citations.shape)

Unnamed: 0,id,uid,author
0,10000001,5568,"Weber, Bent; Mahapatra, Suddhasatta; Ryu, Hoon..."
1,10000002,5568,"Andrawis, Robert; Bermeo, Jose; Charles, James..."


(4193, 3)


In [20]:
t1_authors = jos_citations['author'].to_list()
t1_authors2 = [] #[j.split(';') for i,j in enumerate(t1_authors)]

In [21]:
for i,j in enumerate(t1_authors):
    try:
        holder = j.split(';')
        if len(holder) > 1: 
            for k,l in enumerate(holder):
                if l[0] == ' ':
                    holder[k] = l[1:]
        t1_authors2.append(holder)
    except:
        garb = True

In [22]:
t1_authors3 = [item for sublist in t1_authors2 for item in sublist]
t1_authors3 = set(t1_authors3)
print(len(t1_authors3))

6829


In [23]:
## extract all researchers with nh ids
rw_ids = []
need_search = []
for i,j in enumerate(t1_authors3):
    if '{{' in j:
        rw_ids.append(i)
    else:
        need_search.append(i)

In [24]:
## from the rw_ids, extract the nanohub userids
# can obtain usernames from the DB2 table
rw_ids2 = np.array(list(t1_authors3))[rw_ids]
for i,j in enumerate(rw_ids2):
    begin = j.index('{{')
    rw_ids2[i] = j[begin+2:-2]
print(len(rw_ids2))

2079


In [25]:
# need to obtain usernames
sql_query = "select id,name,username from jos_users where id in "+str(tuple(rw_ids2))
rw_researchers = pd.read_sql_query(sql_query,nanohub_db)
display(rw_researchers.head(2))

Unnamed: 0,id,name,username
0,1742,H.-S. Philip Wong,hspwong
1,1767,Osama Munir Nayfeh,onayfeh


In [26]:
## need to obtain the username/ids from those people without nh ids explicitly spec'd in jos_citations
ns2 = np.array(list(t1_authors3))[need_search]
ns2 = ns2[1:]

In [27]:
tuple(ns2)[:5]

('Sunhee Lee',
 'M.S. Wartak',
 'Yudith Cardinale',
 'A. Moholkar',
 'Michael J. McLennan')

In [28]:
sql_query = "select id,name,username from jos_users where name in "+str(tuple(ns2))
ns2_researchers = pd.read_sql_query(sql_query,nanohub_db)
display(ns2_researchers.head(2))

Unnamed: 0,id,name,username
0,282167,Aarthi Narayanan,aarthin0102
1,280280,Abani Patra,apatra01


In [29]:
## combine the usernames into one data structure
researcher_usernames = rw_researchers['username'].to_list() + ns2_researchers['username'].to_list()
print(len(researcher_usernames))

2988


In [30]:
researcher_track = []

In [31]:
today = datetime.datetime.today()
print(today)

2022-01-31 04:16:30.783588


In [32]:
## caching check 
try:
    user_breakdown_df = pd.read_csv(cwd+'/cached_data/nh_user_breakdown.csv')
    print(user_breakdown_df)
    cache_flag = True
except:
    cache_flag = False

     Unnamed: 0  year_month  clustered_track  self_track  researcher_track  \
0             0  2000-01-01                0           0                 0   
1             1  2000-02-01                0           0                 0   
2             2  2000-03-01               19          38                 8   
3             3  2000-04-01               28          43                11   
4             4  2000-05-01               35          52                13   
..          ...         ...              ...         ...               ...   
255         255  2021-04-01             8965       14793               270   
256         256  2021-05-01             8522       14856               259   
257         257  2021-06-01             7837       15230               253   
258         258  2021-07-01             7512       15069               255   
259         259  2021-08-01             7381       15225               254   

     clustered_track_per  researcher_track_per  self_track_per 

In [60]:
## determine starting year
#if cache_flag == False:
start_year_base = 1999 #2000
#else:
#     print('check the saved file')
#    start_year_base = int(user_breakdown_df['year_month'].to_list()[-1][:4])
    
    

In [61]:
print(user_breakdown_df['year_month'].to_list()[-1][:4])

2021


In [62]:
from copy import deepcopy

In [63]:
## computer has memory limits, so split toolstart into branches
# this filters and finds the research usage in nanohub
start_year = deepcopy(start_year_base)
while start_year < today.year+1:
    start_year += 1
    end_date = r"'"+str(start_year)+r"-01-01'"
    start_date = r"'"+str(start_year-2)+r"-01-01'"
    
    for i in range(1,13):
        if i < 10:
            start_date = r"'"+str(start_year-1)+r"-0"+str(i)+r"-01'"
            end_date = r"'"+str(start_year)+r"-0"+str(i)+r"-01'"
        else:
            start_date = r"'"+str(start_year-1)+r"-"+str(i)+r"-01'"
            end_date = r"'"+str(start_year)+r"-"+str(i)+r"-01'"

        sql_query_researcher = "select user from toolstart where user in " + str(tuple(researcher_usernames))\
            +" and datetime <= "+end_date+" and datetime >= "+start_date
        researcher_users = pd.read_sql_query(sql_query_researcher, nanohub_metrics_db)

        researcher_track.append(researcher_users.drop_duplicates().shape[0])

    print('start year: '+ str(start_year))

start year: 2000
start year: 2001
start year: 2002
start year: 2003
start year: 2004
start year: 2005
start year: 2006
start year: 2007
start year: 2008
start year: 2009
start year: 2010
start year: 2011
start year: 2012
start year: 2013
start year: 2014
start year: 2015
start year: 2016
start year: 2017
start year: 2018
start year: 2019
start year: 2020
start year: 2021
start year: 2022
start year: 2023


In [64]:
## filter the self study folks against those of researchers
# need to pull the researcher sf ids

In [65]:
researcher_ids = rw_researchers['id'].to_list() + ns2_researchers['id'].to_list()
print(len(researcher_ids))

2988


In [66]:
bool_search2 = np.in1d(np.array(self_study_NH_ids.to_list()),np.array(researcher_ids))

In [67]:
# Note, researchers can possibly be doubled counted within clusters!!
# sf_researchers = np.where(bool_search2)[0].tolist() 
sf_self_study = np.where(~bool_search2)[0].tolist()

In [68]:
print(len(sf_self_study))
print(len(self_study_NH_ids.to_list()))

247645
250415


In [69]:
sf_contact_ids = contacts_df['nanoHUB_user_ID__c'][sf_self_study]

In [70]:
sf_contact_ids = sf_contact_ids.reset_index()
sf_contact_ids = sf_contact_ids.drop(columns='index')
display(sf_contact_ids)

Unnamed: 0,nanoHUB_user_ID__c
0,998.0
1,1683.0
2,1684.0
3,1685.0
4,1686.0
...,...
247640,343655.0
247641,343656.0
247642,343657.0
247643,343658.0


In [71]:
tuple(sf_contact_ids['nanoHUB_user_ID__c'])[:5]

(998.0, 1683.0, 1684.0, 1685.0, 1686.0)

In [72]:
tuple(self_study_NH_ids.dropna().to_list())[:5]

(998.0, 1683.0, 1684.0, 1685.0, 1686.0)

In [73]:
## loading in toolstart

In [74]:
# clustered user usernames
tc_query = 'select id,username from jos_users where id in ' +str(tuple(clustered_NH_ids))
clustered_NH_username = pd.read_sql_query(tc_query,nanohub_db)

In [75]:
# self study user usernames
ts_query = 'select id,username from jos_users where id in ' + str(tuple(self_study_NH_ids.dropna().to_list())) 
#sf_contact_ids['nanoHUB_user_ID__c']))
ss_NH_username = pd.read_sql_query(ts_query,nanohub_db)

In [76]:
## cache the researchers, self-study, and clustered users
cache_r_nhid = pd.DataFrame()
cache_r_nhid['nhid'] = researcher_ids

cache_ss_nhid = pd.DataFrame()
cache_ss_nhid['nhid'] = ss_NH_username['id']
cache_ss_nhid['username'] = ss_NH_username['username']

cache_c_nhid = pd.DataFrame()
cache_c_nhid['nhid'] = clustered_NH_username['id']
cache_c_nhid['username'] = clustered_NH_username['username']

In [77]:
print(cache_r_nhid.head(2))
print(cache_ss_nhid.head(2))
print(cache_c_nhid.head(2))

   nhid
0  1742
1  1767
     nhid username
0  211547  -100005
1  211574  -100024
   nhid username
0  3013      mmc
1  3482    gekco


In [78]:
cache_r_nhid.to_csv(cwd+'/cached_data/researcher_ids.csv')
cache_ss_nhid.to_csv(cwd+'/cached_data/ss_ids.csv')
cache_c_nhid.to_csv(cwd+'/cached_data/class_ids.csv')

In [79]:
# clustered_NH_username2;researcher_NH_username2;self_NH_username2
clustered_track = []
self_track = []

# clustered_track_per = []
# self_track_per = []
# researcher_track_per = []

In [80]:
ss_NH_username.shape #old

(249541, 2)

In [None]:
## computer has memory limits, so split toolstart into branches
start_year = deepcopy(start_year_base)

while start_year < today.year+1:
    start_year += 1
    
    for i in range(1,13):
        if i < 10:
            start_date = r"'"+str(start_year-1)+r"-0"+str(i)+r"-01'"
            end_date = r"'"+str(start_year)+r"-0"+str(i)+r"-01'"
        else:
            start_date = r"'"+str(start_year-1)+r"-"+str(i)+r"-01'"
            end_date = r"'"+str(start_year)+r"-"+str(i)+r"-01'"
        
#         sql_query_clustered = "select user from toolstart where user in " + str(tuple(clustered_NH_username2['username']))\
#             +" and datetime <= "+end_date+" and datetime >= "+start_date
        sql_query_clustered = "select user from toolstart where user in " + str(tuple(clustered_NH_username['username']))\
            +" and datetime <= "+end_date+" and datetime >= "+start_date
        clustered_users = pd.read_sql_query(sql_query_clustered, nanohub_metrics_db)
        
#         sql_query_self = "select user from toolstart where user in " + str(tuple(self_NH_username2['username']))\
#             +" and user != 'instanton' and user != 'gridstat' and datetime <= "+end_date+" and datetime >= "+start_date
        sql_query_self = "select user from toolstart where user in " + str(tuple(ss_NH_username['username']))\
            +" and user != 'instanton' and user != 'gridstat' and datetime <= "+end_date+" and datetime >= "+start_date
        self_users = pd.read_sql_query(sql_query_self, nanohub_metrics_db) #engine_metrics #.drop_duplicates()
        
        clustered_track.append(clustered_users.drop_duplicates().shape[0])
        self_track.append(self_users.drop_duplicates().shape[0])
#         print('start_date: '+start_date)
#         print('end_date: '+end_date)  
    print('start year: '+str(start_year))

start year: 2000
start year: 2001
start year: 2002


In [None]:
all_months_list = list()
for this_year in range(start_year_base+1,today.year+2):
    if cache_flag == True:
        for this_month in range(1,today.month+1):
            all_months_list.append(datetime.datetime(this_year, this_month, 1))
    else:
        for this_month in range(1,12+1):
            all_months_list.append(datetime.datetime(this_year, this_month, 1))

In [None]:
print(len(clustered_track))
print(len(all_months_list))
print(len(researcher_track))

In [None]:
#print(user_breakdown_df2.head())

In [None]:

if cache_flag == True:
    user_breakdown_df2 = pd.DataFrame()
    user_breakdown_df2['year_month'] = all_months_list
    user_breakdown_df2['clustered_track'] = clustered_track
    user_breakdown_df2['self_track'] = self_track
    user_breakdown_df2['researcher_track'] = researcher_track
    
    user_breakdown_df = pd.concat([user_breakdown_df, user_breakdown_df2], ignore_index=True)
    display(user_breakdown_df.head(2))
else:
    user_breakdown_df = pd.DataFrame()
    if today.month == 12:
        user_breakdown_df['year_month'] = all_months_list #months_bin['year_month'].to_list()[13:]
        user_breakdown_df['clustered_track'] = clustered_track#[:len(all_months_list)]
        user_breakdown_df['self_track'] = self_track#[:len(all_months_list)]
        user_breakdown_df['researcher_track'] = researcher_track#[:len(all_months_list)]        
    else:
        user_breakdown_df['year_month'] = all_months_list #-(12-today.month)
        user_breakdown_df['clustered_track'] = clustered_track#[:len(all_months_list[:])]
        user_breakdown_df['self_track'] = self_track#[:len(all_months_list[:])]
        user_breakdown_df['researcher_track'] = researcher_track#[:len(all_months_list[:])]
    display(user_breakdown_df.head(2))

In [None]:
# user_breakdown_df.to_csv(cwd+'/cached_data/temp.csv')

In [None]:
import re

In [None]:
## read the dev_flags.txt to determine where to save
with open(cwd+'/dev_flags.txt','r') as f:
    raw_flags = f.read()
flag_pattern = re.compile(r'= \w+')
flags1 = re.findall(flag_pattern,raw_flags)    
flags_result = [i[2:] for i in flags1]
dev_flag = flags_result[0]
prod_flag = flags_result[1]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
sns.set_style("whitegrid")

plt.figure(figsize=(9,6))
# user_breakdown_df2 = user_breakdown_df.iloc[:-(12-today.month),:]
user_breakdown_df2 = user_breakdown_df.iloc[:-12-(12-today.month),:]
plt.stackplot(user_breakdown_df2.year_month, \
#               user_breakdown_df2[['clustered_track','self_track']].to_numpy().T, \
#              labels=['Classroom', 'Unclassified'], \
#              colors=['green', 'orange']); #Self-study              
              user_breakdown_df2[['clustered_track', 'researcher_track','self_track']].to_numpy().T, \
             labels=['Classroom','Research', 'Unclassified'], \
             colors=['green', 'red', 'orange']); #Self-study

plt.legend(loc='upper left')#,fontsize=14);
plt.ylabel('12-month Trailing Total')#,fontsize=15)
plt.xlim([datetime.datetime(2001,1,1), datetime.datetime(2021,7,1)])#,fontsize=15)
# plt.show()

if dev_flag == 'True':
    plt.savefig(cwd+'/plots_local/simusers_class_res_unclass_12months.eps', dpi=1000, bbox_inches='tight')
    plt.savefig(cwd+'/plots_local/simusers_class_res_unclass_12months.pdf', dpi=1000, bbox_inches='tight')
    plt.savefig(cwd+'/plots_local/simusers_class_res_unclass_12months.png', dpi=1000, bbox_inches='tight')
elif prod_flag == 'True':
    plt.savefig(cwd+'/plots_production/simusers_class_res_unclass_12months.eps', dpi=1000, bbox_inches='tight')
    plt.savefig(cwd+'/plots_production/simusers_class_res_unclass_12months.pdf', dpi=1000, bbox_inches='tight')
    plt.savefig(cwd+'/plots_production/simusers_class_res_unclass_12months.png', dpi=1000, bbox_inches='tight')
else:
    print('no printing flag is active')

In [None]:
user_breakdown_df2['clustered_track_per'] = 100*np.divide(user_breakdown_df2['clustered_track'],\
            user_breakdown_df2['clustered_track'].to_numpy()+user_breakdown_df2['researcher_track'].to_numpy()+\
                        user_breakdown_df2['self_track'].to_numpy())
user_breakdown_df2['researcher_track_per'] = 100*np.divide(user_breakdown_df2['researcher_track'],\
            user_breakdown_df2['clustered_track'].to_numpy()+user_breakdown_df2['researcher_track'].to_numpy()+\
                        user_breakdown_df2['self_track'].to_numpy())
user_breakdown_df2['self_track_per'] = 100*np.divide(user_breakdown_df2['self_track'],\
            user_breakdown_df2['clustered_track'].to_numpy()+user_breakdown_df2['researcher_track'].to_numpy()+\
                        user_breakdown_df2['self_track'].to_numpy())

In [None]:
plt.figure(figsize=(9,6))
plt.plot(user_breakdown_df2.year_month, user_breakdown_df2.clustered_track_per, label='classroom', \
             color='green');
plt.plot(user_breakdown_df2.year_month, user_breakdown_df2.researcher_track_per, label='research', \
             color='red');
plt.plot(user_breakdown_df2.year_month, user_breakdown_df2.self_track_per, label='self-study', \
             color='orange');

plt.ylim([0, 100])
plt.legend(loc='upper right');
plt.ylabel('Percentage (%)');
plt.xlim([datetime.datetime(2001,1,1), datetime.datetime(2021,7,1)])
# plt.show()

if dev_flag == 'True':
    plt.savefig(cwd+'/plots_local/simusers_class_res_unclass_12months_percent.eps', dpi=1000, bbox_inches='tight')
    plt.savefig(cwd+'/plots_local/simusers_class_res_unclass_12months_percent.pdf', dpi=1000, bbox_inches='tight')
    plt.savefig(cwd+'/plots_local/simusers_class_res_unclass_12months_percent.png', dpi=1000, bbox_inches='tight')
elif prod_flag == 'True':
    plt.savefig(cwd+'/plots_production/simusers_class_res_unclass_12months_percent.eps', dpi=1000, bbox_inches='tight')
    plt.savefig(cwd+'/plots_production/simusers_class_res_unclass_12months_percent.pdf', dpi=1000, bbox_inches='tight')
    plt.savefig(cwd+'/plots_production/simusers_class_res_unclass_12months_percent.png', dpi=1000, bbox_inches='tight')
else:
    print('no printing flag is active')

In [None]:
user_breakdown_df2.to_csv(cwd+'/cached_data/nh_user_breakdown.csv')