# DB2-Salesforce connector: Basic tool information updates

In [1]:
# Parameters
api_url = '/services/data/v43.0/sobjects'
external_id = 'Tool_name__c'
object_id = 'nanoHUB_tools__c'

from nanoHUB.application import Application

application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')
nanohub_metrics_db = application.new_db_engine('nanohub_metrics')

salesforce = application.new_salesforce_engine()
db_s = salesforce

In [2]:
import pandas as pd
import datetime

## Obtain tool information from DB2 

In [3]:
# Basic tool information
sql_query = "select toolname, title, published, registered from jos_tool"

# display
print(sql_query)

select toolname, title, published, registered from jos_tool


In [4]:
# connect with DB2
# import sqlalchemy as sql

df = pd.read_sql_query(sql_query, nanohub_db, parse_dates=['registered'])
# display
df.head(2)

Unnamed: 0,toolname,title,published,registered
0,ellipsom,title-ellipsom,0,2005-03-05 03:02:47
1,hydrolab,Hydrophobicity Lab,1,2006-02-24 03:38:23


In [5]:
df = df.sort_values(by='registered')
display(df.head(5))
display(df.tail(50))
print(df.shape)

Unnamed: 0,toolname,title,published,registered
348,matlab,Matlab,0,2000-10-02 02:34:37
350,octave,Octave,0,2001-04-04 19:03:01
347,gamess,GAMESS,0,2003-01-15 15:41:22
352,rebo,REBO-1,0,2003-05-08 00:41:33
0,ellipsom,title-ellipsom,0,2005-03-05 03:02:47


Unnamed: 0,toolname,title,published,registered
1704,graphenetools1,Graphene Substrate,0,2020-12-29 23:42:44
1705,random,random tool,0,2021-01-01 00:05:55
1706,htq1,Heat Transfer Quiz Trial 1,0,2021-01-02 15:45:16
1707,gitlocaltool,My nanoHUB simulation tool using gitLocal.,0,2021-01-10 17:49:31
1708,gitlocalrapptur,Example nanoHUB Rappture tool using git local,0,2021-01-12 15:43:00
1709,githubexample,Example nanoHUB Jupyter tool using GitHub,0,2021-01-12 16:02:56
1710,inverter,cmos Inverter,0,2021-01-13 02:10:28
1711,pc4fury,PhysiCell-Fury 3D tumor,0,2021-01-13 11:26:35
1712,qucsngspice,Qucs with SPICE,0,2021-01-15 23:25:03
1713,bsvirdcovidmodo,Modeling a COVID-19 Pandemic Using a BSVIRD Model,0,2021-01-18 19:40:46


(1716, 4)


In [6]:
# debugging
print(df['toolname'].to_list().index('citrinednn'))
print(df.iloc[df['toolname'].to_list().index('citrinednn'),:])

1578
toolname                                citrinednn
title         Hands-on Deep Learning for Materials
published                                        1
registered                     2020-05-27 14:49:06
Name: 1617, dtype: object


In [7]:
'mldefect' in df['toolname'].to_list()

True

In [8]:
# Basic tool information
sql_query = "select * from jos_tool_version_alias"

df_jtva = pd.read_sql_query(sql_query, nanohub_db)

# display
df_jtva.head(2)

Unnamed: 0,tool_version_id,alias
0,977,2ds
1,978,abinit


In [9]:
'mldefect' in df_jtva['alias'].to_list()

False

## Building new SQL queries that roll up instances

In [10]:
# pull all tools in DB2
sql_tools = 'select toolname, instance from nanohub.jos_tool_version;'
tools_df = pd.read_sql_query(sql_tools,nanohub_db)
display(tools_df.head(5))
print(tools_df.shape)

Unnamed: 0,toolname,instance
0,,
1,112016a,112016a_dev
2,1dbtetransient,1dbtetransient_dev
3,1dchainmd,1dchainmd_dev
4,1dchainmd,1dchainmd_r13


(4896, 2)


In [11]:
'mldefect' in tools_df['toolname'].to_list()

True

In [12]:
## debugging for Tanya
# jos_resource_stats_clusters
ql = 'select toolname from jos_resource_stats_clusters;'
df_ql = pd.read_sql_query(ql,nanohub_db)
'mldefect' in df_ql['toolname'].to_list()


False

In [13]:
df_ql.head(5)

Unnamed: 0,toolname
0,1dchainmd
1,1dchainmd
2,1dchainmd
3,1dchainmd
4,1dchainmd


In [14]:
# jos_tool_authors
ql = 'select toolname from jos_tool_authors;'
df_ql = pd.read_sql_query(ql,nanohub_db)
'mldefect' in df_ql['toolname'].to_list()



True

In [15]:
df_ql.head(5)

Unnamed: 0,toolname
0,1dchainmd
1,1dchainmd
2,1dchainmd
3,1dchainmd
4,1dfdmht


In [16]:
## nanohub_metrics trials
# metrics_tool_version
ql_metrics = 'select toolname from nanohub_metrics.metrics_tool_version;'
mdf = pd.read_sql_query(ql_metrics,nanohub_metrics_db)

'mldefect' in mdf['toolname'].to_list()

False

In [17]:
display(mdf.head(5))
display(mdf.tail(5))
print(mdf.shape)

Unnamed: 0,toolname
0,1dchainmd
1,1dchainmd
2,1dfdmht
3,1dfs
4,1dhetero


Unnamed: 0,toolname
1867,xfig
1868,xhub
1869,xterm
1870,xterm
1871,yambo


(1872, 1)


In [18]:
mdf['toolname'].to_list()[730:750]

['mimi',
 'minimol',
 'minimol',
 'minimol',
 'minimol',
 'minimol',
 'minimos',
 'minimos',
 'minimos',
 'mmfl',
 'mmst',
 'mmst',
 'mmst',
 'mmst',
 'mmst',
 'mmsttf',
 'mmsttf',
 'mmsttf',
 'mmsttf',
 'moca']

In [19]:
# create a dict strictly of toolnames and their instances
grouped_tools = tools_df.groupby(['toolname']).groups #this is a dict already; need to change the value pairs
import numpy as np
tn_instances = tools_df['instance'].to_numpy()
for i,j in grouped_tools.items():
    grouped_tools[i] = tuple(tn_instances[j])

In [20]:
print(grouped_tools)

{'': ('',), '112016a': ('112016a_dev',), '1dbtetransient': ('1dbtetransient_dev',), '1dchainmd': ('1dchainmd_dev', '1dchainmd_r13', '1dchainmd_r15'), '1dfdmht': ('1dfdmht_dev', '1dfdmht_r24', '1dfdmht_r26', '1dfdmht_r28'), '1dfs': ('1dfs_dev',), '1dhetero': ('1dhetero_dev', '1dhetero_r10', '1dhetero_r13', '1dhetero_r14', '1dhetero_r298', '1dhetero_r299', '1dhetero_r30', '1dhetero_r301', '1dhetero_r303', '1dhetero_r309', '1dhetero_r531', '1dhetero_r532', '1dhetero_r540', '1dhetero_r600', '1dhetero_r689', '1dhetero_r694', '1dhetero_r733', '1dhetero_r739', '1dhetero_r740', '1dhetero_r742'), '1dmd': ('1dmd_dev',), '1dnegf': ('1dnegf_dev',), '1doscillationla': ('1doscillationla_dev',), '1dphononbte': ('1dphononbte_dev', '1dphononbte_r5', '1dphononbte_r9'), '1sl': ('1sl_dev',), '2degpot': ('2degpot_dev',), '2dmatstacks': ('2dmatstacks_dev', '2dmatstacks_r33', '2dmatstacks_r36'), '2dreflect': ('2dreflect_dev', '2dreflect_r29', '2dreflect_r44'), '2ds': ('2ds', '2ds_dev'), '2wth': ('2wth_dev',)

In [21]:
print(list(grouped_tools.keys())[2])
print(grouped_tools[list(grouped_tools.keys())[2]])

1dbtetransient
('1dbtetransient_dev',)


In [22]:
## Run a separate query for each grouped tool
for key in list(grouped_tools.keys())[1:2]:
    print(key)
    print(grouped_tools[key])
    t_sql_tspec = "select count(user) from nanohub_metrics.toolstart where tool in " +"('1dphononbte_dev', '1dphononbte_r5', '1dphononbte_r9')"+";" #temp sql tool specific
    t_r_df = pd.read_sql_query(t_sql_tspec,nanohub_metrics_db)
    display(t_r_df.head(5))

112016a
('112016a_dev',)


Unnamed: 0,count(user)
0,1743


In [23]:
t_r_df.shape

(1, 1)

In [24]:
# Lifetime usage count
sql_query = "select v.toolname, count(*) as total, count(distinct t.user) as user_total from nanohub_metrics.toolstart as t \
inner join nanohub.jos_tool_version as v \
on v.instance = t.tool \
where t.user not in ('instanton', 'gridstat')\
group by v.toolname"

toolstart_lifetime_df = pd.read_sql_query(sql_query, nanohub_metrics_db)

# last year usage count
sql_query = "select v.toolname, count(*) as total, count(distinct t.user) as user_total from nanohub_metrics.toolstart as t \
inner join nanohub.jos_tool_version as v \
on v.instance = t.tool \
where t.user not in ('instanton', 'gridstat') and t.datetime >= '%s'\
group by v.toolname" % (datetime.datetime.now()-datetime.timedelta(days=365)).strftime('%Y-%m-%d')

toolstart_last_year_df = pd.read_sql_query(sql_query, nanohub_metrics_db)

# last 3-month usage count
sql_query = "select v.toolname, count(*) as total, count(distinct t.user) as user_total from nanohub_metrics.toolstart as t \
inner join nanohub.jos_tool_version as v \
on v.instance = t.tool \
where t.user not in ('instanton', 'gridstat') and t.datetime >= '%s'\
group by v.toolname" % (datetime.datetime.now()-datetime.timedelta(days=90)).strftime('%Y-%m-%d')

toolstart_last_3_month_df = pd.read_sql_query(sql_query, nanohub_metrics_db)

# last month usage count
sql_query = "select v.toolname, count(*) as total, count(distinct t.user) as user_total from nanohub_metrics.toolstart as t \
inner join nanohub.jos_tool_version as v \
on v.instance = t.tool \
where t.user not in ('instanton', 'gridstat') and t.datetime >= '%s'\
group by v.toolname" % (datetime.datetime.now()-datetime.timedelta(days=30)).strftime('%Y-%m-%d')

toolstart_last_month_df = pd.read_sql_query(sql_query, nanohub_metrics_db)

# last week usage count
sql_query = "select v.toolname, count(*) as total, count(distinct t.user) as user_total from nanohub_metrics.toolstart as t \
inner join nanohub.jos_tool_version as v \
on v.instance = t.tool \
where t.user not in ('instanton', 'gridstat') and t.datetime >= '%s'\
group by v.toolname" % (datetime.datetime.now()-datetime.timedelta(days=7)).strftime('%Y-%m-%d')

toolstart_last_week_df = pd.read_sql_query(sql_query, nanohub_metrics_db)

# last 24 hours usage count
sql_query = "select v.toolname, count(*) as total, count(distinct t.user) as user_total from nanohub_metrics.toolstart as t \
inner join nanohub.jos_tool_version as v \
on v.instance = t.tool \
where t.user not in ('instanton', 'gridstat') and t.datetime >= '%s'\
group by v.toolname" % (datetime.datetime.now()-datetime.timedelta(days=1)).strftime('%Y-%m-%d')

toolstart_last_24_hours_df = pd.read_sql_query(sql_query, nanohub_metrics_db)

In [25]:
# display
toolstart_last_year_df.head(2)

Unnamed: 0,toolname,total,user_total
0,1dchainmd,180,70
1,1dfdmht,210,88


In [26]:
# insert user and run stats into df
all_df = df.copy()

all_df = pd.merge(all_df, toolstart_lifetime_df.rename(columns={\
                                                                'total':        'Total_runs_over_lifetime',
                                                                'user_total':'Total_users_over_lifetime'}\
                                                      ), how='left', left_on='toolname', right_on='toolname')
all_df = pd.merge(all_df, toolstart_last_year_df.rename(columns={\
                                                                'total':        'Total_runs_last_year',
                                                                'user_total':'Total_users_last_year'}\
                                                      ), how='left', left_on='toolname', right_on='toolname')
all_df = pd.merge(all_df, toolstart_last_3_month_df.rename(columns={\
                                                                'total':        'Total_runs_last_3_month',
                                                                'user_total':'Total_users_last_3_month'}\
                                                      ), how='left', left_on='toolname', right_on='toolname')
all_df = pd.merge(all_df, toolstart_last_month_df.rename(columns={\
                                                                'total':        'Total_runs_last_month',
                                                                'user_total':'Total_users_last_month'}\
                                                      ), how='left', left_on='toolname', right_on='toolname')
all_df = pd.merge(all_df, toolstart_last_week_df.rename(columns={\
                                                                'total':        'Total_runs_last_week',
                                                                'user_total':'Total_users_last_week'}\
                                                      ), how='left', left_on='toolname', right_on='toolname')
all_df = pd.merge(all_df, toolstart_last_24_hours_df.rename(columns={\
                                                                'total':        'Total_runs_last_24_hours',
                                                                'user_total':'Total_users_last_24_hours'}\
                                                      ), how='left', left_on='toolname', right_on='toolname')

# display
all_df.head(5).T

Unnamed: 0,0,1,2,3,4
toolname,matlab,octave,gamess,rebo,ellipsom
title,Matlab,Octave,GAMESS,REBO-1,title-ellipsom
published,0,0,0,0,0
registered,2000-10-02 02:34:37,2001-04-04 19:03:01,2003-01-15 15:41:22,2003-05-08 00:41:33,2005-03-05 03:02:47
Total_runs_over_lifetime,3470,59,2415,291,
Total_users_over_lifetime,162,30,101,42,
Total_runs_last_year,,,,,
Total_users_last_year,,,,,
Total_runs_last_3_month,,,,,
Total_users_last_3_month,,,,,


## Match data with Salesforce format

In [27]:
df_sf = pd.DataFrame()

# Make sure NaN and NaT values are taken care of here

df_sf['Name']              = all_df['title']
df_sf['Tool_name__c']  = all_df['toolname']
df_sf['Tool_URL__c']     = all_df['toolname'].apply(lambda x: 'https://nanohub.org/tools/'+x)
df_sf['Published__c']    = all_df['published']

# stats
df_sf['Total_runs_over_lifetime__c']  = all_df['Total_runs_over_lifetime']
df_sf['Total_runs_last_year__c']        = all_df['Total_runs_last_year']
df_sf['Total_runs_last_3_month__c'] = all_df['Total_runs_last_3_month']
df_sf['Total_runs_last_month__c']     = all_df['Total_runs_last_month']
df_sf['Total_runs_last_week__c']       = all_df['Total_runs_last_week']
df_sf['Total_runs_last_24_hours__c'] = all_df['Total_runs_last_24_hours']

df_sf['Total_users_over_lifetime__c']  = all_df['Total_users_over_lifetime']
df_sf['Total_users_last_year__c']        = all_df['Total_users_last_year']
df_sf['Total_users_last_3_month__c'] = all_df['Total_users_last_3_month']
df_sf['Total_users_last_month__c']     = all_df['Total_users_last_month']
df_sf['Total_users_last_week__c']       = all_df['Total_users_last_week']
df_sf['Total_users_last_24_hours__c'] = all_df['Total_users_last_24_hours']

# solidify time-related columns from datetime to string
df_sf['Creation_date__c']  = all_df['registered'].dt.date.fillna('').astype('str')

df_sf.fillna(0, inplace=True)

sf_original_fields = df_sf.columns

# display
df_sf.head(2).T

Unnamed: 0,0,1
Name,Matlab,Octave
Tool_name__c,matlab,octave
Tool_URL__c,https://nanohub.org/tools/matlab,https://nanohub.org/tools/octave
Published__c,0,0
Total_runs_over_lifetime__c,3470,59
Total_runs_last_year__c,0,0
Total_runs_last_3_month__c,0,0
Total_runs_last_month__c,0,0
Total_runs_last_week__c,0,0
Total_runs_last_24_hours__c,0,0


## To Salesforce Sales Cloud CRM

In [28]:
db_s.object_id = object_id
db_s.external_id = external_id

Obtained Salesforce access token ...... True


In [29]:
# send data to Salesforce
db_s.send_data(df_sf)

[Success] Bulk job creation successful. Job ID = 7505w00000TmJJTAA3
hello
[Success] CSV upload successful. Job ID = 7505w00000TmJJTAA3
[Success] Closing job successful. Job ID = 7505w00000TmJJTAA3


In [30]:
# check status
db_s.check_bulk_status()

{'id': '7505w00000TmJJTAA3',
 'operation': 'upsert',
 'object': 'nanoHUB_tools__c',
 'createdById': '0055w00000DM5bOAAT',
 'createdDate': '2021-01-21T06:40:32.000+0000',
 'systemModstamp': '2021-01-21T06:40:34.000+0000',
 'state': 'UploadComplete',
 'externalIdFieldName': 'Tool_name__c',
 'concurrencyMode': 'Parallel',
 'contentType': 'CSV',
 'apiVersion': 47.0,
 'jobType': 'V2Ingest',
 'lineEnding': 'LF',
 'columnDelimiter': 'COMMA',
 'numberRecordsProcessed': 0,
 'numberRecordsFailed': 0,
 'retries': 0,
 'totalProcessingTime': 0,
 'apiActiveProcessingTime': 0,
 'apexProcessingTime': 0}