In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from nanoHUB.application import Application
from nanoHUB.configuration import ClusteringConfiguration
from nanoHUB.pipeline.geddes.data import get_default_s3_client
from nanoHUB.dataaccess.lake import S3FileMapper
import os
cwd = os.getcwd()

# logger.debug('Testing')
application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')

s3_client = get_default_s3_client(application)
raw_mapper = S3FileMapper(s3_client, ClusteringConfiguration().bucket_name_raw)
processed_mapper = S3FileMapper(s3_client, ClusteringConfiguration().bucket_name_processed)

derived_data_file_path = 'derived_data_for_users.csv'



[1mnanoHUB - Serving Students, Researchers & Instructors[0m


In [2]:
all_users_df = processed_mapper.read(derived_data_file_path)
all_users_df['id'] = all_users_df.id.astype(int)
#all_users_df['is_researcher'] = False
display(all_users_df.tail())

Unnamed: 0,id,name,username,email,registerDate,lastvisitDate,number_simulations,uid
272686,356675,Ra Abd,eyad2020,rabab.r3ab@gmail.com,2022-02-12 00:28:49,2022-02-12 00:29:39,0.0,
272687,356676,Pede John Villadares Garzon,john1973,pedejohngarzon@gmail.com,2022-02-12 00:45:10,2022-02-12 00:45:11,0.0,
272688,356677,Busra Ozdemir,-193184,-193184@invalid,2022-02-12 00:58:08,2022-02-12 00:58:08,0.0,
272689,356679,Mahdi Salari,mahdisalari82020,mahdisalari82020@gmail.com,2022-02-12 02:18:29,2022-02-12 02:18:30,0.0,
272690,356680,王鸣雁,-193187,-193187@invalid,2022-02-12 02:27:35,2022-02-12 02:27:36,0.0,


In [3]:
# get user profile details
sql_query = '''
SELECT user_id, profile_key FROM jos_user_profiles
WHERE profile_key in ('researcherid', 'googlescholar', 'orcid', 'researchgateid', 'scopusid')
'''
profile_df = pd.read_sql_query(sql_query, nanohub_db)



display(len(profile_df))
display(profile_df.head())
display(profile_df.tail())

5082

Unnamed: 0,user_id,profile_key
0,4994,orcid
1,9239,orcid
2,22539,orcid
3,25392,orcid
4,14811,orcid


Unnamed: 0,user_id,profile_key
5077,356599,googlescholar
5078,356604,googlescholar
5079,356657,scopusid
5080,356657,googlescholar
5081,356657,orcid


In [4]:
display(profile_df.profile_key.unique())

array(['orcid', 'googlescholar', 'researcherid', 'researchgateid',
       'scopusid'], dtype=object)

In [5]:
profile_df = profile_df.groupby(['user_id'], as_index=False)['profile_key'].agg(list)

In [6]:
display(profile_df)

Unnamed: 0,user_id,profile_key
0,2579,[orcid]
1,2862,"[googlescholar, researchgateid]"
2,3159,[orcid]
3,3482,"[orcid, googlescholar, researcherid, researchg..."
4,3583,"[orcid, googlescholar]"
...,...,...
4032,356574,[googlescholar]
4033,356596,"[googlescholar, researcherid, scopusid, resear..."
4034,356599,[googlescholar]
4035,356604,[googlescholar]


In [7]:
profile_df['has_researcher_profile'] = True
profile_df['user_id'] = profile_df['user_id'].astype(int)
display(profile_df[profile_df["user_id"] == 2862])

all_users_df = all_users_df.merge(profile_df, how='left', left_on='id', right_on='user_id')
display(all_users_df[all_users_df["id"] == 2862])

all_users_df['has_researcher_profile'].fillna(False,inplace=True)
all_users_df['profile_key'] = all_users_df['profile_key'].apply(lambda d: d if isinstance(d, list) else [])
all_users_df = all_users_df.drop(columns='user_id')

Unnamed: 0,user_id,profile_key,has_researcher_profile
1,2862,"[googlescholar, researchgateid]",True


Unnamed: 0,id,name,username,email,registerDate,lastvisitDate,number_simulations,uid,user_id,profile_key,has_researcher_profile
1176,2862,Mark Lundstrom,lundstro,lundstro@purdue.edu,2004-12-06 22:01:25,2022-02-08 16:28:36,2345.0,2862.0,2862.0,"[googlescholar, researchgateid]",True


In [8]:
display(len(all_users_df[all_users_df["profile_key"].str.len() == 0]))
display(len(all_users_df[all_users_df["profile_key"].str.len() == 1]))
display(all_users_df[all_users_df["id"] == 2579])
display(all_users_df[all_users_df['id'] == 998])

268654

3445

Unnamed: 0,id,name,username,email,registerDate,lastvisitDate,number_simulations,uid,profile_key,has_researcher_profile
895,2579,Stefan Birner,birner,stefan.birner@nextnano.com,2006-05-05 07:36:09,2020-01-13 15:58:33,52.0,2579.0,[orcid],True


Unnamed: 0,id,name,username,email,registerDate,lastvisitDate,number_simulations,uid,profile_key,has_researcher_profile
0,998,hubrepo hubrepo,hubrepo,nkissebe@gmail.com,2014-11-13 21:09:09,,0.0,,[],False


In [9]:
processed_mapper.save_as_csv(all_users_df, derived_data_file_path, index=None)

test_df = processed_mapper.read(derived_data_file_path)
display(test_df)

Unnamed: 0,id,name,username,email,registerDate,lastvisitDate,number_simulations,uid,profile_key,has_researcher_profile
0,998,hubrepo hubrepo,hubrepo,nkissebe@gmail.com,2014-11-13 21:09:09,,0.0,,[],False
1,1000,CMS Admin Manager,admin,support@nanohub.org,,,0.0,,[],False
2,1683,nanoHUB support,support,support@nanohub.org,2008-11-19 22:51:04,2008-11-19 23:55:30,2.0,1683.0,[],False
3,1684,Grid Statistics,gridstat,gridstat@nanohub.org,2008-11-18 17:29:56,2020-02-14 18:50:14,0.0,,[],False
4,1685,NCN NCN,ncn,ncn@nanohub.org,2008-11-11 19:17:04,,0.0,,[],False
...,...,...,...,...,...,...,...,...,...,...
272686,356675,Ra Abd,eyad2020,rabab.r3ab@gmail.com,2022-02-12 00:28:49,2022-02-12 00:29:39,0.0,,[],False
272687,356676,Pede John Villadares Garzon,john1973,pedejohngarzon@gmail.com,2022-02-12 00:45:10,2022-02-12 00:45:11,0.0,,[],False
272688,356677,Busra Ozdemir,-193184,-193184@invalid,2022-02-12 00:58:08,2022-02-12 00:58:08,0.0,,[],False
272689,356679,Mahdi Salari,mahdisalari82020,mahdisalari82020@gmail.com,2022-02-12 02:18:29,2022-02-12 02:18:30,0.0,,[],False


In [10]:
display("# of users with citations > 1 = %d" % len(test_df[test_df['number_citations'] > 1]))

KeyError: 'number_citations'