# DB2-Salesforce connector: Citation mapping to leads

In [1]:
# API settings
# NOTE(review): api_url is not referenced again in the visible cells — confirm it is still needed.
api_url = '/services/data/v43.0/sobjects'
# External-ID field name and Salesforce object name; a later cell assigns both onto db_s.
external_id = 'nanoHUB_source_ID__c'
object_id = 'Lead'

from nanoHUB.application import Application

# Engine for the 'nanohub' database; later passed as the connection to pd.read_sql_query.
application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')

# Salesforce engine; db_s is a second name bound to the same object.
salesforce = application.new_salesforce_engine()
db_s = salesforce

[1mnanoHUB - Serving Students, Researchers & Instructors[0m


2021-08-25 23:58:02,933| INFO    | 0 keys loaded from agent


2021-08-25 23:58:02,933 - [INFO] sshtunnel.SSHTunnelForwarder [sshtunnel.get_agent_keys:1060]: 0 keys loaded from agent


2021-08-25 23:58:02,936| INFO    | 0 key(s) loaded


2021-08-25 23:58:02,936 - [INFO] sshtunnel.SSHTunnelForwarder [sshtunnel.get_keys:1117]: 0 key(s) loaded


2021-08-25 23:58:02,938| INFO    | Connecting to gateway: db2.nanohub.org:22 as user 'saxenap'


2021-08-25 23:58:02,938 - [INFO] sshtunnel.SSHTunnelForwarder [sshtunnel.__init__:978]: Connecting to gateway: db2.nanohub.org:22 as user 'saxenap'


2021-08-25 23:58:02,941 - [[1mINFO[0m] [1mnanoHUB.containers.dataaccess[0m [connection.get_connection_for:101]: Started SSH Tunnel with db2.nanohub.org


2021-08-25 23:58:03,230| INFO    | Opening tunnel: 0.0.0.0:52294 <> 127.0.0.1:3306


2021-08-25 23:58:03,230 - [INFO] sshtunnel.SSHTunnelForwarder [sshtunnel._serve_forever_wrapper:1433]: Opening tunnel: 0.0.0.0:52294 <> 127.0.0.1:3306


Obtained Salesforce access token ...... True


In [2]:
    
import pandas as pd
import time
import datetime

## Obtain citation information from DB2 

In [3]:
# Load every row of the citations table
sql_query = "select * from jos_citations"
citations_df = pd.read_sql_query(sql=sql_query, con=nanohub_db)

# Load every row of the citation-authors table
sql_query = "select * from jos_citations_authors;"
authors_df = pd.read_sql_query(sql=sql_query, con=nanohub_db)

In [4]:
# Join each citation to its author rows: one output row per (citation, author) pair.
# Columns present in both inputs ('id', 'author') get the '_c' (citation) / '_a' (author) suffix.
ac_df = (
    citations_df[['id', 'url', 'author', 'journal', 'year', 'title']]
    .merge(
        authors_df[['cid', 'uidNumber', 'id', 'author', 'email', 'organization']],
        how='inner',
        left_on='id',
        right_on='cid',
        suffixes=['_c', '_a'],
    )
    # the citation's own id duplicates 'cid' after the join
    .drop(columns='id_c')
    .rename(columns={'cid': 'citation_ID', 'uidNumber': 'nanohub_user_id', 'id_a': 'author_id'})
)

# display
ac_df.head(3)

Unnamed: 0,url,author_c,journal,year,title,citation_ID,nanohub_user_id,author_id,author_a,email,organization
0,,Sebastian Steiger;Michael Povolotskyi;Hong-Hyu...,IEEE Transactions on Nanotechnology,2011,NEMO5: A Parallel Multiscale Nanoelectronics M...,10000003,10676,4613,Sebastian Steiger,,Purdue University
1,,Sebastian Steiger;Michael Povolotskyi;Hong-Hyu...,IEEE Transactions on Nanotechnology,2011,NEMO5: A Parallel Multiscale Nanoelectronics M...,10000003,0,4614,Michael Povolotskyi,,Purdue University
2,,Sebastian Steiger;Michael Povolotskyi;Hong-Hyu...,IEEE Transactions on Nanotechnology,2011,NEMO5: A Parallel Multiscale Nanoelectronics M...,10000003,37033,4615,Hong-Hyun Park,,Purdue University


In [5]:
# Capitalize author names: trim, drop the empty pieces left by repeated spaces,
# and capitalize each word (str.capitalize also lower-cases the rest of the word)
ac_df['author'] = ac_df['author_a'].apply(
    lambda raw_name: ' '.join(
        word.capitalize() for word in raw_name.strip().split(' ') if word
    )
)

# discard rows whose normalised author name is empty
has_author_name = ac_df['author'] != ''
ac_df = ac_df[has_author_name]

In [6]:
# author with no nanoHUB account -> leads
def determine_lead(this_df):
    """Decide whether one author (all rows sharing an author name) is a lead.

    The author is a lead only when none of their rows carries a non-zero
    ``nanohub_user_id``.

    Parameters
    ----------
    this_df : pandas.DataFrame
        Rows for a single author. Reads the columns ``nanohub_user_id``,
        ``organization``, ``email``, ``title``, ``journal``, ``year`` and ``url``.

    Returns
    -------
    pandas.Series
        Indexed by ``keep``, ``email``, ``org``, ``source``. For a non-lead the
        values are ``False, None, None, None``. For a lead, ``keep`` is ``True``,
        ``email``/``org`` are the most frequent non-empty values (``None`` when
        there is none) and ``source`` is an HTML snippet listing the distinct
        citations and URLs.
    """
    result_index = ['keep', 'email', 'org', 'source']

    if (this_df['nanohub_user_id'] != 0).any():
        # this author name has nanoHUB ID
        return pd.Series([False, None, None, None], index=result_index)

    def _most_common(column):
        # Series.mode skips missing values and returns ties in sorted order,
        # so the first entry is a deterministic pick.
        candidates = this_df.loc[this_df[column] != '', column].mode()
        return candidates.iloc[0] if not candidates.empty else None

    def _format_citation(row):
        # A missing year (None/NaN) would make '%d' raise; omit the year instead.
        if pd.isna(row['year']):
            return '%s, %s' % (row['title'], row['journal'])
        return '%s, %s (%d)' % (row['title'], row['journal'], row['year'])

    # affiliation and email
    this_org = _most_common('organization')
    this_email = _most_common('email')

    # distinct citations, in first-seen order
    this_cite_list = this_df.apply(_format_citation, axis=1).drop_duplicates().to_list()

    # distinct URLs; keep only non-empty strings so a NaN/None cannot reach
    # the string concatenation below
    this_url_list = [
        url for url in this_df['url'].drop_duplicates().to_list()
        if isinstance(url, str) and url
    ]

    # generate nanoHUB_source
    # NOTE(review): titles and URLs are inserted without HTML escaping — confirm
    # whether the target field expects escaped text.
    source_parts = ['<p><span style="color: #ff0000;"><strong>[Automatically generated]</strong></span> from the following publications citing nanoHUB resources.</p>']
    source_parts.extend(citation + '<br />' for citation in this_cite_list)
    source_parts.append('<p></p><p>Links:</p>')
    source_parts.extend(url + '<br />' for url in this_url_list)

    return pd.Series([True, this_email, this_org, ''.join(source_parts)], index=result_index)


# One row per distinct author name: determine_lead summarises each author's rows
lead_df = (
    ac_df
    .groupby('author')
    .apply(determine_lead)
    .reset_index()
)

# keep only the authors flagged as leads
lead_df = lead_df[lead_df['keep']]

# display
lead_df.head(3)

Unnamed: 0,author,keep,email,org,source
1,A Alhenawy,True,a.hennawy@hotmail.com,Ain Shams University,"<p><span style=""color: #ff0000;""><strong>[Auto..."
2,A Alma'aitah,True,aalmaait@uwo.ca,University Of Western Ontario,"<p><span style=""color: #ff0000;""><strong>[Auto..."
6,A Dorda,True,,Graz University of Technology,"<p><span style=""color: #ff0000;""><strong>[Auto..."


## Match data with Salesforce format

In [7]:
# split full name into first and last names
def split_full_name(this_name):
    """Split a full name into first and last name.

    The name is split on spaces and empty pieces are ignored. The first word is
    the first name and the final word is the last name; anything in between is
    dropped. A single-word name is used for both fields.

    Parameters
    ----------
    this_name : str
        Full name. A non-string value (e.g. None/NaN) is treated as empty.

    Returns
    -------
    pandas.Series
        Two values, ``[first, last]``. An empty name yields ``[None, None]``
        rather than an implicit ``None``, so that ``Series.apply`` always
        expands to a two-column frame.
    """
    name_parts = [part for part in this_name.split(' ') if part] if isinstance(this_name, str) else []

    if not name_parts:
        # nothing usable in the name
        return pd.Series([None, None])

    # for a single-word name, index 0 and index -1 are the same word
    return pd.Series([name_parts[0], name_parts[-1]])

In [8]:
# Assemble the Salesforce frame column by column; insertion order fixes the column order
df_sf = pd.DataFrame()

# external ID: 'lead_citation#' followed by the author name with spaces turned into underscores
df_sf['nanoHUB_source_ID__c'] = lead_df.apply(
    lambda row: 'lead_citation#%s' % row.author.replace(' ', '_'),
    axis=1,
)
df_sf[['firstname', 'lastname']] = lead_df['author'].apply(split_full_name)
df_sf['Email'] = lead_df['email']
df_sf['Company'] = lead_df['org']
df_sf['nanoHUB_source__c'] = lead_df['source']

# Company cannot be empty: missing or whitespace-only values become '-'
df_sf['Company'] = (
    df_sf['Company']
    .fillna('-')
    .apply(lambda company: company if company.strip() else '-')
)

sf_original_fields = df_sf.columns

# display
df_sf.head(2).T

Unnamed: 0,1,2
nanoHUB_source_ID__c,lead_citation#A_Alhenawy,lead_citation#A_Alma'aitah
firstname,A,A
lastname,Alhenawy,Alma'aitah
Email,a.hennawy@hotmail.com,aalmaait@uwo.ca
Company,Ain Shams University,University Of Western Ontario
nanoHUB_source__c,"<p><span style=""color: #ff0000;""><strong>[Auto...","<p><span style=""color: #ff0000;""><strong>[Auto..."


## To Salesforce Sales Cloud CRM

In [9]:
# Configure the Salesforce engine created in the first cell (no new object is created here)

# specify Salesforce object ID and external ID
db_s.object_id = object_id 
db_s.external_id = external_id

In [10]:
# send data to Salesforce
# (the recorded output shows a bulk job being created, a CSV uploaded, and the job closed)
db_s.send_data(df_sf)

[Success] Bulk job creation successful. Job ID = 7505w00000ZJKcXAAX
hello


[Success] CSV upload successful. Job ID = 7505w00000ZJKcXAAX


[Success] Closing job successful. Job ID = 7505w00000ZJKcXAAX


In [11]:
# check status
from pprint import pprint

# Pretty-print the bulk job status. The recorded output shows 'state': 'InProgress',
# i.e. the job had not finished when this cell ran — re-run to see the final counts.
pprint(db_s.check_bulk_status())

{'apexProcessingTime': 0,
 'apiActiveProcessingTime': 0,
 'apiVersion': 47.0,
 'columnDelimiter': 'COMMA',
 'concurrencyMode': 'Parallel',
 'contentType': 'CSV',
 'createdById': '0055w00000DM5bOAAT',
 'createdDate': '2021-08-26T03:58:23.000+0000',
 'externalIdFieldName': 'nanoHUB_source_ID__c',
 'id': '7505w00000ZJKcXAAX',
 'jobType': 'V2Ingest',
 'lineEnding': 'LF',
 'numberRecordsFailed': 0,
 'numberRecordsProcessed': 0,
 'object': 'Lead',
 'operation': 'upsert',
 'retries': 0,
 'state': 'InProgress',
 'systemModstamp': '2021-08-26T03:58:30.000+0000',
 'totalProcessingTime': 0}


In [12]:
# Print the failed-record results for the bulk job (the recorded output is an empty string).
# NOTE(review): the status cell's recorded output shows the job still 'InProgress', so an
# empty result here may only mean processing had not finished — confirm by re-running.
pprint(db_s.check_bulk_failed_results())

''
