In [1]:
import pandas as pd
from pprint import pprint
from nanoHUB.application import Application
from nanoHUB.configuration import ClusteringConfiguration
from nanoHUB.pipeline.geddes.data import get_default_s3_client
from nanoHUB.dataaccess.lake import S3FileMapper
from ast import literal_eval

# logger.debug('Testing')
application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')

s3_client = get_default_s3_client(application)
raw_mapper = S3FileMapper(s3_client, ClusteringConfiguration().bucket_name_raw)
processed_mapper = S3FileMapper(s3_client, ClusteringConfiguration().bucket_name_processed)
# file_path = 'tool_users_map.csv'
file_path = 'user_tools.csv'

[1mnanoHUB - Serving Students, Researchers & Instructors[0m


In [2]:
sql_query = ''' 
SELECT
  toolstart.user,
  GROUP_CONCAT(DISTINCT tool_versions.toolname SEPARATOR ',') as names_tools
FROM nanohub_metrics.toolstart toolstart
       INNER JOIN nanohub.jos_tool_version tool_versions
                  ON tool_versions.instance = toolstart.tool
WHERE
      toolstart.user IS NOT NULL
#       AND TRIM(toolstart.user) != ''
#       AND (DATE(datetime) BETWEEN '2021-02-22' AND '2022-02-21')
GROUP BY toolstart.user;
'''

tool_users_df = pd.read_sql_query(sql_query, nanohub_db)
display(tool_users_df.head())
display(tool_users_df.tail())

Unnamed: 0,user,names_tools
0,,
1,0.yao.yuan,"mosfetsat,mosfet"
2,007sribabu,deviceelectron
3,008dilip,abacus
4,00ff,bmcsuite


Unnamed: 0,user,names_tools
165879,zzzwmhq,s4sim
165880,z_depth,"nanomos,moscv"
165881,Z_E_U_S,"cenems,bandstrlab"
165882,z_henry35,"nsoptics,cndo"
165883,z_sath,"fermi,nanomos"


In [3]:
processed_mapper.save_as_csv(tool_users_df, file_path, index=None)

In [4]:
df = processed_mapper.read(file_path, converters={"clusters": literal_eval}, low_memory=False)
df = df[df['user'].notna()]
df = df[df['names_tools'].notna()]
display(df)

Unnamed: 0,user,names_tools
1,0.yao.yuan,"mosfetsat,mosfet"
2,007sribabu,deviceelectron
3,008dilip,abacus
4,00ff,bmcsuite
5,00mhking00,padre
...,...,...
165879,zzzwmhq,s4sim
165880,z_depth,"nanomos,moscv"
165881,Z_E_U_S,"cenems,bandstrlab"
165882,z_henry35,"nsoptics,cndo"


In [5]:
df['names_tools'] = df['names_tools'].str.split(',')
display(df)

Unnamed: 0,user,names_tools
1,0.yao.yuan,"[mosfetsat, mosfet]"
2,007sribabu,[deviceelectron]
3,008dilip,[abacus]
4,00ff,[bmcsuite]
5,00mhking00,[padre]
...,...,...
165879,zzzwmhq,[s4sim]
165880,z_depth,"[nanomos, moscv]"
165881,Z_E_U_S,"[cenems, bandstrlab]"
165882,z_henry35,"[nsoptics, cndo]"


In [6]:
df_exploded = df.explode('names_tools')
display(df_exploded)

Unnamed: 0,user,names_tools
1,0.yao.yuan,mosfetsat
1,0.yao.yuan,mosfet
2,007sribabu,deviceelectron
3,008dilip,abacus
4,00ff,bmcsuite
...,...,...
165881,Z_E_U_S,bandstrlab
165882,z_henry35,nsoptics
165882,z_henry35,cndo
165883,z_sath,fermi


In [7]:
df1 = df_exploded.groupby('names_tools')['user'].apply(list).reset_index(name='users')
display(df1)

Unnamed: 0,names_tools,users
0,,"[antoniocumbreraconde123, bghill, dkearney, ka..."
1,1dchainmd,"[3511alofi, 7and7var7, a.schleife, aa469635295..."
2,1dfdmht,"[1alejandrolopez11, a.abid, aadarwis, aadhitya..."
3,1dfs,[gw014425]
4,1dhetero,"[16.nt.gopi.kundia, 18navid, 1vineetkumarsingh..."
...,...,...
1182,zeno,"[ab.kairy09, ajaygollapudi1951, badrelarhrib19..."
1183,zooleyipnb1,"[jerinannie1996, rajeshkrajan14, zooley]"
1184,zooleyjupyter,[zooley]
1185,zooleylinux,[zooley]
