In [1]:
import pandas as pd
from pprint import pprint
from nanoHUB.application import Application
from nanoHUB.configuration import ClusteringConfiguration
from nanoHUB.pipeline.geddes.data import get_default_s3_client
from nanoHUB.dataaccess.lake import S3FileMapper


# Shared application context: it supplies both the database engine and the
# S3 client used throughout this notebook.
application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')

s3_client = get_default_s3_client(application)

# Build the clustering configuration once and reuse it for both bucket names
# instead of constructing a fresh instance per lookup.
clustering_config = ClusteringConfiguration()
raw_mapper = S3FileMapper(s3_client, clustering_config.bucket_name_raw)
processed_mapper = S3FileMapper(s3_client, clustering_config.bucket_name_processed)

# Name of the tool -> users CSV that this notebook writes and reads back
# through `processed_mapper`.
# (Alternative file name kept for reference: 'user_tools_map.csv'.)
file_path = 'tool_users_map.csv'



[1mnanoHUB - Serving Students, Researchers & Instructors[0m


In [2]:
# One row per tool name, with every distinct user who started any version of
# that tool concatenated into a single ', '-separated string.
# NOTE(review): GROUP_CONCAT ... SEPARATOR is MySQL/MariaDB syntax; on those
# servers the result is truncated at `group_concat_max_len` (default 1024
# bytes). Confirm the session raises that limit, otherwise long user lists
# are silently cut off.
sql_query = ''' 
SELECT
  tool_versions.toolname,
  GROUP_CONCAT(DISTINCT toolstart.user SEPARATOR ', ') as names_users
FROM nanohub.jos_tool_version tool_versions
       INNER JOIN nanohub_metrics.toolstart toolstart
                  ON tool_versions.instance = toolstart.tool
GROUP BY tool_versions.toolname;
'''

tool_users_df = pd.read_sql_query(sql_query, nanohub_db)

# Preview both ends of the result; each object is rendered as its own output.
display(tool_users_df.head(), tool_users_df.tail())

Unnamed: 0,toolname,names_users
0,,"kapadia,"
1,1dchainmd,"thomas0915, khaan, baratunde, brayanfdv, saj02..."
2,1dfdmht,"cheraghchi, suphatk, bgmdiffe, mahna, gysun, s..."
3,1dfs,"gw014425, dkearney"
4,1dhetero,"samarthagarwal, Vasileska, dkearney, gekco, jo..."


Unnamed: 0,toolname,names_users
1174,zeno,"taaseenkhanelah, nzuckman, daudus, clarksm, de..."
1175,zooleyipnb1,"zooley, clarksm, jerinannie1996, rajeshkrajan14"
1176,zooleyjupyter,zooley
1177,zooleylinux,zooley
1178,zooleylinux1,"zooley, clarksm"


In [3]:
# Persist the query result as CSV through the processed-bucket mapper.
# NOTE(review): `index=None` presumably suppresses the row index in the CSV
# (as a falsy `index` does in pandas' `to_csv`) — confirm against
# `S3FileMapper.save_as_csv`.
processed_mapper.save_as_csv(
    tool_users_df,
    file_path,
    index=None,
)

In [4]:
# Reload the saved CSV and keep only rows where both the tool name and the
# user list are present.
df = processed_mapper.read(file_path)
has_tool_and_users = df['toolname'].notna() & df['names_users'].notna()
df = df[has_tool_and_users]
display(df)

Unnamed: 0,toolname,names_users
1,1dchainmd,"thomas0915, khaan, baratunde, brayanfdv, saj02..."
2,1dfdmht,"cheraghchi, suphatk, bgmdiffe, mahna, gysun, s..."
3,1dfs,"gw014425, dkearney"
4,1dhetero,"samarthagarwal, Vasileska, dkearney, gekco, jo..."
5,1dmd,"deshpan5, strachan, clarksm"
...,...,...
1174,zeno,"taaseenkhanelah, nzuckman, daudus, clarksm, de..."
1175,zooleyipnb1,"zooley, clarksm, jerinannie1996, rajeshkrajan14"
1176,zooleyjupyter,zooley
1177,zooleylinux,zooley


In [5]:
def _split_user_names(value):
    """Turn a comma-separated user string into a list of clean user names.

    The upstream query joins names with ', ' (comma + space), so splitting on
    a bare ',' would leave a leading space on every name after the first.
    Each piece is therefore stripped, and empty pieces (e.g. from a trailing
    comma) are dropped.

    Non-string values are returned unchanged. This keeps the cell safe to
    re-run: a column that already holds lists is left as-is instead of being
    turned into NaN by a second string split.
    """
    if not isinstance(value, str):
        return value
    return [name.strip() for name in value.split(',') if name.strip()]


# `.assign` builds a new frame rather than writing a column into the filtered
# slice, which avoids pandas' SettingWithCopyWarning.
df = df.assign(names_users=df['names_users'].map(_split_user_names))
display(df)

Unnamed: 0,toolname,names_users
1,1dchainmd,"[thomas0915, khaan, baratunde, brayanfdv, ..."
2,1dfdmht,"[cheraghchi, suphatk, bgmdiffe, mahna, gys..."
3,1dfs,"[gw014425, dkearney]"
4,1dhetero,"[samarthagarwal, Vasileska, dkearney, gekco..."
5,1dmd,"[deshpan5, strachan, clarksm]"
...,...,...
1174,zeno,"[taaseenkhanelah, nzuckman, daudus, clarksm..."
1175,zooleyipnb1,"[zooley, clarksm, jerinannie1996, rajeshkra..."
1176,zooleyjupyter,[zooley]
1177,zooleylinux,[zooley]
