In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from nanoHUB.application import Application
from nanoHUB.configuration import ClusteringConfiguration
from nanoHUB.pipeline.geddes.data import get_default_s3_client
from nanoHUB.dataaccess.lake import S3FileMapper


# Shared application object: it provides the database engine used for the SQL
# query in a later cell and is what the S3 client factory is given.
application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')

# No Salesforce engine is created in this notebook
# (application.new_salesforce_engine() was left disabled).

# Name handed to processed_mapper.read / save_as_csv in later cells.
derived_data_file_path = 'derived_data_for_users.csv'

# One S3 client shared by two mappers, one per configured bucket name.
s3_client = get_default_s3_client(application)
raw_bucket = ClusteringConfiguration().bucket_name_raw
raw_mapper = S3FileMapper(s3_client, raw_bucket)
processed_bucket = ClusteringConfiguration().bucket_name_processed
processed_mapper = S3FileMapper(s3_client, processed_bucket)

nanoHUB - Serving Students, Researchers & Instructors


In [2]:
# Load the derived per-user table and make sure `id` is an integer column,
# since it is the left-hand key of the merge with `uid` in a later cell.
all_users_df = processed_mapper.read(derived_data_file_path).astype({'id': int})

In [3]:
# Tool starts per user: toolstart rows are matched to jos_users by username and
# counted per `starts.user`. The result carries the count (`number_simulations`),
# the matching jos_users id (`uid`) and the username as recorded in toolstart
# (`user`).
# NOTE(review): the join predicate sits in WHERE rather than ON, and `users.id`
# is selected without appearing in GROUP BY. This presumably relies on usernames
# being unique and on the server's sql_mode allowing it - confirm before
# rewriting the query.
sql_query = '''
SELECT COUNT(starts.user) AS number_simulations, users.id AS uid, starts.user
    FROM nanohub_metrics.toolstart AS starts
JOIN nanohub.jos_users AS users
    WHERE users.username = starts.user
GROUP BY starts.user
'''

user_count_df = pd.read_sql_query(sql=sql_query, con=nanohub_db)

In [4]:
# Quick size check: every user returned by the query, then only those with more
# than one recorded tool start.
ran_more_than_once = user_count_df['number_simulations'] > 1

display(len(user_count_df))
display(len(user_count_df.loc[ran_more_than_once]))

163559

151751

In [5]:
# Columns this cell brings in from `user_count_df`. The user table is written
# back to `derived_data_file_path` by a later cell, and `all_users_df` is
# reassigned here, so on a second run (of this cell or of the whole notebook)
# the table already carries `number_simulations` / `uid`. Merging on top of them
# would produce `number_simulations_x` / `number_simulations_y` suffixes and the
# `number_simulations` lookup below would raise KeyError. Dropping stale copies
# first makes the cell safe to re-run; on a first run nothing is dropped.
simulation_columns = ['number_simulations', 'uid', 'user']
all_users_df = all_users_df.drop(columns=simulation_columns, errors='ignore')

# Left join keeps every row of the user table; users without a match in
# `user_count_df` get NaN in the merged-in columns.
all_users_df = all_users_df.merge(user_count_df, how='left', left_on='id', right_on='uid')
display(all_users_df)

# `user` is the toolstart username the SQL query matched against
# jos_users.username, so it is dropped after the merge.
all_users_df = all_users_df.drop(columns=['user'])
# Users with no recorded tool start count as zero simulations instead of NaN.
all_users_df['number_simulations'] = all_users_df['number_simulations'].fillna(0)

display(all_users_df.head())

Unnamed: 0,id,name,username,email,registerDate,lastvisitDate,number_simulations,uid,user
0,998,hubrepo hubrepo,hubrepo,nkissebe@gmail.com,2014-11-13 21:09:09,,,,
1,1000,CMS Admin Manager,admin,support@nanohub.org,,,,,
2,1683,nanoHUB support,support,support@nanohub.org,2008-11-19 22:51:04,2008-11-19 23:55:30,2.0,1683.0,support
3,1684,Grid Statistics,gridstat,gridstat@nanohub.org,2008-11-18 17:29:56,2020-02-14 18:50:14,,,
4,1685,NCN NCN,ncn,ncn@nanohub.org,2008-11-11 19:17:04,,,,
...,...,...,...,...,...,...,...,...,...
272686,356675,Ra Abd,eyad2020,rabab.r3ab@gmail.com,2022-02-12 00:28:49,2022-02-12 00:29:39,,,
272687,356676,Pede John Villadares Garzon,john1973,pedejohngarzon@gmail.com,2022-02-12 00:45:10,2022-02-12 00:45:11,,,
272688,356677,Busra Ozdemir,-193184,-193184@invalid,2022-02-12 00:58:08,2022-02-12 00:58:08,,,
272689,356679,Mahdi Salari,mahdisalari82020,mahdisalari82020@gmail.com,2022-02-12 02:18:29,2022-02-12 02:18:30,,,


Unnamed: 0,id,name,username,email,registerDate,lastvisitDate,number_simulations,uid
0,998,hubrepo hubrepo,hubrepo,nkissebe@gmail.com,2014-11-13 21:09:09,,0.0,
1,1000,CMS Admin Manager,admin,support@nanohub.org,,,0.0,
2,1683,nanoHUB support,support,support@nanohub.org,2008-11-19 22:51:04,2008-11-19 23:55:30,2.0,1683.0
3,1684,Grid Statistics,gridstat,gridstat@nanohub.org,2008-11-18 17:29:56,2020-02-14 18:50:14,0.0,
4,1685,NCN NCN,ncn,ncn@nanohub.org,2008-11-11 19:17:04,,0.0,


In [6]:
# Persist the enriched user table under the same name it was loaded from.
processed_mapper.save_as_csv(all_users_df, derived_data_file_path, index=None)

# Read the file back and repeat the "more than one simulation" count on the
# stored copy, so it can be compared with the figure from the query result.
test_df = processed_mapper.read(derived_data_file_path)
multi_run_count = len(test_df[test_df['number_simulations'] > 1])
display("# of users with simulations > 1 = %d" % multi_run_count)
display(test_df)

'# of users with simulations > 1 = 151751'

Unnamed: 0,id,name,username,email,registerDate,lastvisitDate,number_simulations,uid
0,998,hubrepo hubrepo,hubrepo,nkissebe@gmail.com,2014-11-13 21:09:09,,0.0,
1,1000,CMS Admin Manager,admin,support@nanohub.org,,,0.0,
2,1683,nanoHUB support,support,support@nanohub.org,2008-11-19 22:51:04,2008-11-19 23:55:30,2.0,1683.0
3,1684,Grid Statistics,gridstat,gridstat@nanohub.org,2008-11-18 17:29:56,2020-02-14 18:50:14,0.0,
4,1685,NCN NCN,ncn,ncn@nanohub.org,2008-11-11 19:17:04,,0.0,
...,...,...,...,...,...,...,...,...
272686,356675,Ra Abd,eyad2020,rabab.r3ab@gmail.com,2022-02-12 00:28:49,2022-02-12 00:29:39,0.0,
272687,356676,Pede John Villadares Garzon,john1973,pedejohngarzon@gmail.com,2022-02-12 00:45:10,2022-02-12 00:45:11,0.0,
272688,356677,Busra Ozdemir,-193184,-193184@invalid,2022-02-12 00:58:08,2022-02-12 00:58:08,0.0,
272689,356679,Mahdi Salari,mahdisalari82020,mahdisalari82020@gmail.com,2022-02-12 02:18:29,2022-02-12 02:18:30,0.0,


In [8]:
# Users with more than 100 recorded simulations: filter once, then show the
# count followed by the matching rows.
heavy_users_df = test_df[test_df['number_simulations'] > 100]
display("# of users with simulations > 100 = %d" % len(heavy_users_df))
display(heavy_users_df)

'# of users with simulations > 100 = 16827'

Unnamed: 0,id,name,username,email,registerDate,lastvisitDate,number_simulations,uid
8,1689,Pedro G. Mireles,mireles,pmireles@gte.net,2000-01-11 17:10:00,,1727.0,1689.0
9,1690,Khaled M. Dadesh,k_dadesh,k_dadesh@yahoo.com,2000-07-05 07:07:09,,292.0,1690.0
15,1696,Haibo Li,haibo,haibo@purdue.edu,2000-09-19 15:06:58,,451.0,1696.0
30,1711,Stephanie Mullins,mullinss,mullinss@shay.ecn.purdue.edu,2001-01-11 00:20:23,,339.0,1711.0
31,1712,Fu-Siong Choo,saiyan,choof@purdue.edu,2001-01-11 04:05:14,,377.0,1712.0
...,...,...,...,...,...,...,...,...
271303,354900,Balu Puthenparampil Ratheesh,bpr5384,bpr5384@psu.edu,2022-01-28 23:42:11,2022-02-01 00:55:30,109.0,354900.0
271362,354989,Fabio Nierhoff,fn2221,fn2221@columbia.edu,2022-01-29 23:01:04,2022-02-04 19:23:55,289.0,354989.0
271379,355010,RIDA ZAINAB,zainab4107741,zainab4107741@cloud.neduet.edu.pk,2022-01-30 10:41:58,2022-01-30 10:41:58,114.0,355010.0
271484,355147,Xinran Liu,xl3063,xl3063@columbia.edu,2022-02-01 01:13:15,2022-02-03 22:28:42,105.0,355147.0
