# Mapping Tools to Authors The Right Way

##### Author: Praveen Saxena
##### Email: saxep01@gmail.com
##### Create Date: 10/27/2021
##### Purpose: Map nanoHUB tools to their authors and upsert the tool-author associations into Salesforce

## 1. Preliminaries
[top](#Contents)

In [1]:
# API settings
# Looks like a REST sObjects endpoint path; it is not referenced again in the
# cells visible in this notebook.
api_url = '/services/data/v43.0/sobjects'
# Field used as the external ID for the upload (assigned to db_s.external_id
# before send_data is called).
external_id = 'Name'
# Salesforce object that receives the tool/author association records
# (assigned to db_s.object_id before send_data is called).
object_id = 'AuthorToolAssociation__c'

import pandas as pd
import os
import time
import datetime
from pathlib import Path

from nanoHUB.application import Application
from nanoHUB.salesforce import CachedRepository, ContactsRepository, ToolsRepository

# Obtain the nanoHUB Application instance and create a database engine for
# 'nanohub'; the engine is later passed to pd.read_sql_query as the connection.
application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')

[1mnanoHUB - Serving Students, Researchers & Instructors[0m


## 2. Variables and Functions
[top](#Contents)

In [2]:
def display_for(df: pd.DataFrame, author_name: str) -> None:
    """Render the rows of ``df`` that belong to a single author.

    Args:
        df: Frame with a ``name`` column holding author names.
        author_name: Exact author name to match (case-sensitive equality).

    Returns:
        None. The matching rows are shown through the notebook's ``display``.
    """
    is_author = df['name'] == author_name
    display(df[is_author])

## 3. Base Query - Obtain tool info with authors
[top](#Contents)

In [3]:
# Base query: one row per (tool, author) pair for published, standalone tool
# resources (res.type = '7').
#   * First SELECT names the tool via jos_tool, matched to the resource by
#     case-insensitive title.
#   * Second SELECT names the tool via the resource alias. It selects no column
#     from jos_tool, so it does not join that table: under SELECT DISTINCT such
#     a join cannot change the result set and only adds work.
# The LEFT JOINs keep resources without a matching tool or author; those rows
# carry NULLs in the corresponding columns.
sql_string = '''
SELECT DISTINCT
       tool.toolname AS toolname, tool.title AS title,
       author.authorid, author.name
FROM nanohub.jos_resources res
LEFT JOIN nanohub.jos_author_assoc author
  ON author.subid = res.id
LEFT JOIN nanohub.jos_tool tool
  ON LOWER(tool.title) = LOWER(res.title)
WHERE
    res.title != '' AND
    res.published = 1 AND
    res.type = '7' AND
    res.access IN ('0','3','1') AND
    res.standalone = '1'

UNION DISTINCT
SELECT DISTINCT
       res.alias AS toolname, res.title AS title,
       author.authorid, author.name
FROM nanohub.jos_resources res
LEFT JOIN nanohub.jos_author_assoc author
  ON author.subid = res.id
WHERE
    res.alias != '' AND
    res.published = 1 AND
    res.type = '7' AND
    res.access IN ('0','3','1') AND
    res.standalone = '1'
;
'''

In [4]:
# Run the base query against the nanoHUB database engine.
toolauthors_df = pd.read_sql_query(sql_string, nanohub_db)

# DataFrame.info() prints its summary and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
toolauthors_df.info()
display(toolauthors_df.head())
display(toolauthors_df.tail())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2716 entries, 0 to 2715
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   toolname  2239 non-null   object 
 1   title     2239 non-null   object 
 2   authorid  2706 non-null   float64
 3   name      2706 non-null   object 
dtypes: float64(1), object(3)
memory usage: 85.0+ KB


None

Unnamed: 0,toolname,title,authorid,name
0,hydrolab,Hydrophobicity Lab,4713.0,Eric Darve
1,hydrolab,Hydrophobicity Lab,12486.0,Artit Wangperawong
2,hydrolab,Hydrophobicity Lab,12590.0,Kazutora Hayashida
3,nanomos,NanoMOS,-39.0,
4,nanomos,NanoMOS,4323.0,


Unnamed: 0,toolname,title,authorid,name
2711,workspace,Workspace,,
2712,nrr,Non-Rigid Registration for STEM,,
2713,diffanalyzer,Particle Trajectory Diffusion Analysis,,
2714,cc3dwf,CompuCell3D - 2D wet foam coarsening,,
2715,cc3dwfdrain,CompuCell3D - 2D wet foam coarsening with drai...,,


In [5]:
# Keep only rows that resolved to a tool name; the LEFT JOIN on jos_tool leaves
# toolname null for resources without a matching tool.
toolauthors_df = toolauthors_df.dropna(subset=['toolname'])

### 3.a. Base Query - Sanity Check
[top](#Contents)

#### - _Tanya's tools_

In [6]:
# Sanity check: show the base-query rows for one known author.
display_for(toolauthors_df, 'Tanya Faltens')

Unnamed: 0,toolname,title,authorid,name
730,mosfetsat,MOSFET Simulation,29294.0,Tanya Faltens
840,mif,MIF generator for OOMMF,29294.0,Tanya Faltens


#### - _Stephen M. Goodnick's tools_

In [7]:
# Sanity check: show the base-query rows for a second known author.
display_for(toolauthors_df, 'Stephen M. Goodnick')

Unnamed: 0,toolname,title,authorid,name
1993,bulkmc,Bulk Monte Carlo Lab,29476.0,Stephen M. Goodnick
2024,acute,ACUTE,29476.0,Stephen M. Goodnick


In [23]:
# The repository cache lives in '<APP_DIR>/.cache'. os.getenv returns None when
# the variable is unset, which would make Path() raise an opaque TypeError, so
# fail fast with an explicit message instead.
app_dir = os.getenv('APP_DIR')
if app_dir is None:
    raise EnvironmentError(
        "APP_DIR environment variable is not set; it is required to locate the '.cache' folder"
    )
cache_folder = Path(app_dir, '.cache')

# Engine used both to read Contacts/Tools from Salesforce and to upload results.
salesforce_engine = application.new_salesforce_engine()

Obtained Salesforce access token ...... True


#### _Contact IDs_

In [9]:
# Load all Salesforce contacts through the repository wrapper that is given
# cache_folder; the captured output shows the columns Id and nanoHUB_user_ID__c.
contacts = CachedRepository(
    ContactsRepository(salesforce_engine),
    cache_folder
)
sf_contacts_df = contacts.get_all()

display(sf_contacts_df.head())
display(sf_contacts_df.tail())
# DataFrame.info() prints its summary and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
sf_contacts_df.info()

[Success] Bulk job creation successful. Job ID = 7505w00000b2dCwAAI
{"id":"7505w00000b2dCwAAI","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2021-10-28T03:47:40.000+0000","systemModstamp":"2021-10-28T03:47:40.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","retries":0,"totalProcessingTime":0}
{"id":"7505w00000b2dCwAAI","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2021-10-28T03:47:40.000+0000","systemModstamp":"2021-10-28T03:47:41.000+0000","state":"InProgress","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":15032,"retries":0,"totalProcessingTime":898}
{"id":"7505w00000b2dCwAAI","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2021-10-28T03

Unnamed: 0,Id,nanoHUB_user_ID__c
0,0035w000031Vsp1AAC,998
1,0035w000031Vsp2AAC,1683
2,0035w000031Vsp3AAC,1684
3,0035w000031Vsp4AAC,1685
4,0035w000031Vsp5AAC,1686


Unnamed: 0,Id,nanoHUB_user_ID__c
247719,0035w00003WNV0sAAH,339872
247720,0035w00003WNVIsAAP,20159
247721,0035w00003WNVllAAH,1819
247722,0035w00003WNVlmAAH,8106
247723,0035w00003WNW5TAAX,40187


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 247724 entries, 0 to 247723
Data columns (total 2 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Id                  247724 non-null  object
 1   nanoHUB_user_ID__c  247724 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.8+ MB


None

In [10]:
# Spot check: the Salesforce contact row for nanoHUB user 29476.
display(sf_contacts_df[sf_contacts_df['nanoHUB_user_ID__c'].eq(29476)])

Unnamed: 0,Id,nanoHUB_user_ID__c
105293,0035w000034JKc2AAG,29476


#### _Tool IDs_

In [11]:
# Load all Salesforce tool records through the repository wrapper that is given
# cache_folder; the captured output shows the columns Id and Tool_name__c.
tools = CachedRepository(
    ToolsRepository(salesforce_engine),
    cache_folder
)
sf_tool_df = tools.get_all()
# Lower-case the Salesforce tool names; they are the right-hand merge key
# against the nanoHUB 'toolname' column.
sf_tool_df['Tool_name__c'] = sf_tool_df['Tool_name__c'].str.lower()

# DataFrame.info() prints its summary and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
sf_tool_df.info()
display(sf_tool_df.head())
display(sf_tool_df.tail())

[Success] Bulk job creation successful. Job ID = 7505w00000b2dD1AAI
{"id":"7505w00000b2dD1AAI","operation":"query","object":"nanoHUB_tools__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-10-28T03:48:07.000+0000","systemModstamp":"2021-10-28T03:48:08.000+0000","state":"JobComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":1827,"retries":0,"totalProcessingTime":213}
[Success] Bulk job completed successfully.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1827 entries, 0 to 1826
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Id            1827 non-null   object
 1   Tool_name__c  1827 non-null   object
dtypes: object(2)
memory usage: 28.7+ KB


None

Unnamed: 0,Id,Tool_name__c
0,a0s5w00000k5MOwAAM,ellipsom
1,a0s5w00000k5MOxAAM,hydrolab
2,a0s5w00000k5MOyAAM,huckel
3,a0s5w00000k5MOzAAM,nanomos
4,a0s5w00000k5MP0AAM,sete


Unnamed: 0,Id,Tool_name__c
1822,a0s5w00000mi2zOAAQ,myemailverifier
1823,a0s5w00000mi3laAAA,dislocateqtb
1824,a0s5w00000mi3zpAAA,optprop
1825,a0s5w00000mi4WfAAI,selfies
1826,a0s5w00000mi4zBAAQ,kssolv


In [12]:
# Resolve each (tool, author) pair to Salesforce Ids:
#   1. inner-join on tool name      -> SF_ID_tool
#   2. inner-join on nanoHUB user id -> SF_ID_contact
# Inner joins drop pairs whose tool or author has no Salesforce record.
ct_tolink_df = (
    toolauthors_df
    .merge(sf_tool_df, how='inner', left_on='toolname', right_on='Tool_name__c')
    .rename(columns={'Id': 'SF_ID_tool'})
    .merge(sf_contacts_df, how='inner', left_on='authorid', right_on='nanoHUB_user_ID__c')
    .rename(columns={'Id': 'SF_ID_contact'})
)

# External-ID key '<nanoHUB user id>_<tool name>'. Built with vectorised string
# concatenation instead of a row-wise apply: same values, also works when the
# merge result is empty, and avoids a Python-level loop over every row.
ct_tolink_df['Name'] = (
    ct_tolink_df['nanoHUB_user_ID__c'].astype(str) + '_' + ct_tolink_df['Tool_name__c']
)

display(ct_tolink_df.head())

Unnamed: 0,toolname,title,authorid,name,SF_ID_tool,Tool_name__c,SF_ID_contact,nanoHUB_user_ID__c,Name
0,hydrolab,Hydrophobicity Lab,4713.0,Eric Darve,a0s5w00000k5MOxAAM,hydrolab,0035w000034JHMwAAO,4713,4713_hydrolab
1,hydrolab,Hydrophobicity Lab,12486.0,Artit Wangperawong,a0s5w00000k5MOxAAM,hydrolab,0035w000034JNV0AAO,12486,12486_hydrolab
2,hydrolab,Hydrophobicity Lab,12590.0,Kazutora Hayashida,a0s5w00000k5MOxAAM,hydrolab,0035w000034JNWdAAO,12590,12590_hydrolab
3,nanomos,NanoMOS,4323.0,,a0s5w00000k5MOzAAM,nanomos,0035w000034JGxWAAW,4323,4323_nanomos
4,moscap,MOSCap,4323.0,Akira Matsudaira,a0s5w00000k5MPaAAM,moscap,0035w000034JGxWAAW,4323,4323_moscap


In [13]:
# Spot check: the Salesforce tool record for 'acute'.
display(sf_tool_df[sf_tool_df['Tool_name__c'].eq('acute')])

Unnamed: 0,Id,Tool_name__c
254,a0s5w00000k5MT2AAM,acute


In [14]:
# Spot check by Salesforce Id. The literal starts with '003', the prefix seen on
# Contact Ids in sf_contacts_df, whereas tool Ids start with 'a0s'; comparing it
# against SF_ID_tool could never match, so filter on SF_ID_contact instead.
display(ct_tolink_df.loc[ct_tolink_df['SF_ID_contact'] == '0035w000034JHCCAA4'])

Unnamed: 0,toolname,title,authorid,name,SF_ID_tool,Tool_name__c,SF_ID_contact,nanoHUB_user_ID__c,Name


In [15]:
# Sanity check: the merged rows (with Salesforce Ids) for one known author.
display_for(ct_tolink_df, 'Tanya Faltens')

Unnamed: 0,toolname,title,authorid,name,SF_ID_tool,Tool_name__c,SF_ID_contact,nanoHUB_user_ID__c,Name
1221,mosfetsat,MOSFET Simulation,29294.0,Tanya Faltens,a0s5w00000k5Mc5AAE,mosfetsat,0035w000034JKPQAA4,29294,29294_mosfetsat
1222,mif,MIF generator for OOMMF,29294.0,Tanya Faltens,a0s5w00000k5MeDAAU,mif,0035w000034JKPQAA4,29294,29294_mif


In [16]:
# Sanity check: the merged rows (with Salesforce Ids) for a second known author.
display_for(ct_tolink_df, 'Stephen M. Goodnick')

Unnamed: 0,toolname,title,authorid,name,SF_ID_tool,Tool_name__c,SF_ID_contact,nanoHUB_user_ID__c,Name
1926,bulkmc,Bulk Monte Carlo Lab,29476.0,Stephen M. Goodnick,a0s5w00000k5MSLAA2,bulkmc,0035w000034JKc2AAG,29476,29476_bulkmc
1927,acute,ACUTE,29476.0,Stephen M. Goodnick,a0s5w00000k5MT2AAM,acute,0035w000034JKc2AAG,29476,29476_acute


In [17]:
# Same author looked up by nanoHUB author id rather than by name.
display(ct_tolink_df[ct_tolink_df['authorid'].eq(29476)])

Unnamed: 0,toolname,title,authorid,name,SF_ID_tool,Tool_name__c,SF_ID_contact,nanoHUB_user_ID__c,Name
1926,bulkmc,Bulk Monte Carlo Lab,29476.0,Stephen M. Goodnick,a0s5w00000k5MSLAA2,bulkmc,0035w000034JKc2AAG,29476,29476_bulkmc
1927,acute,ACUTE,29476.0,Stephen M. Goodnick,a0s5w00000k5MT2AAM,acute,0035w000034JKc2AAG,29476,29476_acute


## 5. Salesforce - Save Final Data
[top](#Contents)

In [18]:
# Upload frame: one column per Salesforce field, taken from the merged frame.
# No NaN/NaT handling is applied here; the three source columns are copied as-is.
df_sf = (
    ct_tolink_df[['Name', 'SF_ID_contact', 'SF_ID_tool']]
    .rename(columns={'SF_ID_contact': 'Contact__c', 'SF_ID_tool': 'Tool__c'})
)
sf_original_fields = df_sf.columns

display(df_sf.head().T)

Unnamed: 0,0,1,2,3,4
Name,4713_hydrolab,12486_hydrolab,12590_hydrolab,4323_nanomos,4323_moscap
Contact__c,0035w000034JHMwAAO,0035w000034JNV0AAO,0035w000034JNWdAAO,0035w000034JGxWAAW,0035w000034JGxWAAW
Tool__c,a0s5w00000k5MOxAAM,a0s5w00000k5MOxAAM,a0s5w00000k5MOxAAM,a0s5w00000k5MOzAAM,a0s5w00000k5MPaAAM


In [22]:
# Spot check: upload rows for one contact Id.
display(df_sf[df_sf['Contact__c'].eq('0035w000034JKPQAA4')])

Unnamed: 0,Name,Contact__c,Tool__c
1221,29294_mosfetsat,0035w000034JKPQAA4,a0s5w00000k5Mc5AAE
1222,29294_mif,0035w000034JKPQAA4,a0s5w00000k5MeDAAU


In [19]:
# create DB2 to Salesforce API object
db_s = salesforce_engine

db_s.object_id = object_id
db_s.external_id = external_id

In [20]:
# Upload df_sf to Salesforce through the configured engine.
# NOTE(review): presumably an upsert keyed on external_id — confirm against send_data.
db_s.send_data(df_sf)

[Success] Bulk job creation successful. Job ID = 7505w00000b2dD6AAI
hello
[Success] CSV upload successful. Job ID = 7505w00000b2dD6AAI
[Success] Closing job successful. Job ID = 7505w00000b2dD6AAI


In [21]:
# check status
from pprint import pprint

pprint(db_s.check_bulk_status())

{'apexProcessingTime': 0,
 'apiActiveProcessingTime': 0,
 'apiVersion': 47.0,
 'columnDelimiter': 'COMMA',
 'concurrencyMode': 'Parallel',
 'contentType': 'CSV',
 'createdById': '0055w00000DM5bOAAT',
 'createdDate': '2021-10-28T03:48:09.000+0000',
 'externalIdFieldName': 'Name',
 'id': '7505w00000b2dD6AAI',
 'jobType': 'V2Ingest',
 'lineEnding': 'LF',
 'numberRecordsFailed': 0,
 'numberRecordsProcessed': 0,
 'object': 'AuthorToolAssociation__c',
 'operation': 'upsert',
 'retries': 0,
 'state': 'InProgress',
 'systemModstamp': '2021-10-28T03:48:10.000+0000',
 'totalProcessingTime': 0}
