# DB2-Salesforce connector: Citation mapping to users

In [1]:
# API settings
api_url = '/services/data/v43.0/sobjects'
external_id = 'Name'
object_id = 'contact_citation_asso__c'

# login parameters to be handled by Papermill
sf_login_params = None 
sql_login_params = None

In [3]:
import sys
sys.path.append(lib_dir)
    
import pandas as pd
import time
import datetime

from DB2SalesforceAPI import DB2SalesforceAPI

## Obtain tool information from DB2 

In [4]:
# connect with DB2
import sqlalchemy as sql

engine = sql.create_engine('mysql+pymysql://%s:%s@127.0.0.1/nanohub' \
                                               %(sql_login_params['username'], sql_login_params['password']))

# Citations
sql_query = "select * from jos_citations"

citations_df = pd.read_sql_query(sql_query, engine)

# Citation authors
sql_query = "select * from jos_citations_authors;"

authors_df = pd.read_sql_query(sql_query, engine)

In [5]:
# replace N/A and empty values with None
authors_df.organization.replace('N/A', '', inplace=True)

In [6]:
authors_df.columns

Index(['id', 'cid', 'author', 'authorid', 'uidNumber', 'ordering', 'givenName',
       'middleName', 'surname', 'organization', 'org_dept', 'orgtype',
       'countryresident', 'email', 'ip', 'host', 'countrySHORT', 'countryLONG',
       'ipREGION', 'ipCITY', 'ipLATITUDE', 'ipLONGITUDE', 'in_network',
       'orcid', 'research_id', 'gscholar_id', 'scopus_id', 'researchgate_id',
       'notes'],
      dtype='object')

In [7]:
# combining authors and citations
ac_df = pd.merge(citations_df[['id']], authors_df[['cid', 'uidNumber','id','organization','orcid','research_id','gscholar_id']], how='inner',\
                 left_on='id', right_on='cid', suffixes=['_c', '_a'])\
                .drop(columns='id_c')

ac_df.rename(columns={'cid':'citation_ID', 'uidNumber':'nanohub_user_id', 'id_a':'author_id'}, inplace=True)

# display
ac_df.head(3)

Unnamed: 0,citation_ID,nanohub_user_id,author_id,organization,orcid,research_id,gscholar_id
0,10000003,10676,4613,Purdue University,,,
1,10000003,0,4614,Purdue University,,,
2,10000003,37033,4615,Purdue University,,,


In [8]:
# Obtain company domain information from DB2
domain_df = pd.read_sql_query("select name as domain_name, domain, industry, `size range` as size, country \
from wang159_myrmekes.companies_email_domain", engine)

# make sure domain is unique and drop NaN
domain_df = domain_df.drop_duplicates(subset='domain', keep='first')

In [9]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
s_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/users/wang159/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
import re

def clean_domain(this_domain_name):
    
    if not this_domain_name:
        return None
    
    # Replace all non-alphanumeric characters with space
    this_domain_name = re.sub("[^0-9a-zA-Z]+", " ", this_domain_name.lower())
    name_list = this_domain_name.split(' ')
        
    # remove all stop words
    name_list = ['' if x in s_words else x for x in name_list]
    
    return set(filter(None, name_list))

In [11]:
# clean domain name
domain_df['domain_cleaned_set'] = domain_df.domain_name.apply(clean_domain)

# hash cleaned domain name
domain_df['domain_cleaned_hash'] = domain_df['domain_cleaned_set'].apply(lambda x: '-'.join(sorted(list(x))) if x else None)

In [12]:
# clean domain name
ac_df['domain_cleaned_set'] = ac_df.organization.apply(clean_domain)

# hash cleaned domain name
ac_df['domain_cleaned_hash'] = ac_df['domain_cleaned_set'].apply(lambda x: '-'.join(sorted(list(x))) if x else None)

In [14]:
# get domain subset that contains domain_cleaned_hash in authors_df
domain_subset_df = domain_df[domain_df.domain_cleaned_hash.isin(ac_df.domain_cleaned_hash.unique())]\
                                        [['domain', 'domain_cleaned_hash']]
domain_subset_df = domain_subset_df[domain_subset_df.domain.notnull()&domain_subset_df.domain_cleaned_hash.notnull()]

domain_subset_all_df = domain_subset_df.rename(columns={'domain_cleaned_hash':'domain_hash'}).drop_duplicates()

In [15]:
# attempt direct join by hash
derived_authors_df = pd.merge(ac_df, domain_subset_all_df\
                              ,how='left', left_on='domain_cleaned_hash', right_on='domain_hash')\
                              .drop('domain_hash', axis=1)\
                              .rename(columns={'domain':'domain_by_citation'})

# display
derived_authors_df.sample(5).T

Unnamed: 0,5354,11005,11612,15431,9952
citation_ID,10001305.0,402,2949,42.0,1452
nanohub_user_id,0.0,0,0,0.0,0
author_id,9904.0,11777021,11776512,11773012.0,11778000
organization,,Purdue University,IMEC,,Fudan University
orcid,,,,,
research_id,,,,,
gscholar_id,,,,,
domain_cleaned_set,,"{university, purdue}",{imec},,"{university, fudan}"
domain_cleaned_hash,,purdue-university,imec,,fudan-university
domain_by_citation,,purdue.edu,imec-radiation.de,,fudan.edu.cn


In [17]:
# rank the nanoHUB profile institutions that cannot be directly matched by occurance

derived_authors_subset_df = derived_authors_df[(derived_authors_df.domain_cleaned_hash.notnull() \
                                                                & derived_authors_df.domain_by_citation.isna())]

most_common_sets = derived_authors_subset_df.domain_cleaned_set.value_counts()

# display
most_common_sets.sort_values(ascending=False)

{university, new, south, wales}                      263
{california, technology, institute}                  229
{university, delft, technology}                      153
{raytheon, systems}                                  103
{state, university, pennsylvania}                     96
                                                    ... 
{university, modena, emliia, reggio}                   1
{universita, di, bologna}                              1
{aeronautics, lockheed, martin}                        1
{pilani, bits}                                         1
{de, ciudad, autonoma, universidad, juarez, uacj}      1
Name: domain_cleaned_set, Length: 939, dtype: int64

In [18]:
# attempt to reduce domain_df size to speedup Jaccard calculation
from itertools import chain

all_word_set = set(chain.from_iterable(derived_authors_subset_df.domain_cleaned_set.values))

# select the domain_df entry with words from all_word_set
domain_subset_df = domain_df[domain_df.domain_cleaned_set.notnull()]
domain_subset_df = domain_subset_df[domain_subset_df.domain_cleaned_set\
                             .apply(lambda x: True if len(x.intersection(all_word_set)) > 0 else False)]

domain_subset_df = domain_subset_df[['domain_cleaned_set', 'domain']]

In [19]:
# Time consuming part: calculating Jaccard similarity score
def get_jaccard_score(a,b):
    
    if (not a) & (not b):
        return 0
    
    score = len(a.intersection(b))/len(a.union(b))
    
    return score


for this_set in most_common_sets.index:
    # calculate the Jaccard similarity
    max_index = domain_subset_df.domain_cleaned_set.apply(lambda x: get_jaccard_score(x, this_set)).idxmax()

    derived_authors_df.loc[derived_authors_df.domain_cleaned_set == this_set, 'domain_by_citation'] \
                = domain_subset_df.loc[max_index, 'domain']

In [20]:
# display
derived_authors_df.sample(5).T

Unnamed: 0,9479,13456,2142,12346,5570
citation_ID,3256,93,10000512,2420,10001356.0
nanohub_user_id,0,3001,2143,0,73985.0
author_id,11778470,11774876,6728,11775849,10120.0
organization,V. E. Lashkaryov Institute of Semiconductors P...,Intel Corp.,Purdue University,University Of Michigan,
orcid,,,,,
research_id,,,,,
gscholar_id,,,,,
domain_cleaned_set,"{e, lashkaryov, v, physics, institute, semicon...","{intel, corp}","{university, purdue}","{university, michigan}",
domain_cleaned_hash,e-institute-lashkaryov-physics-semiconductors-v,corp-intel,purdue-university,michigan-university,
domain_by_citation,collectiveleadership.com,ariasintel.com,purdue.edu,umich.edu,


## Obtain Salesforce IDs

In [21]:
# create DB2 to Salesforce API object
db_s = DB2SalesforceAPI(sf_login_params)

Obtained Salesforce access token ...... True


In [22]:
# query the Salesforce IDs for contacts and citations. when updating junction objects, these IDs must be used

# get Salesforce ID for contacts
sf_userID_df = db_s.query_data('SELECT Id, nanoHUB_user_ID__c FROM Contact where nanoHUB_user_ID__c != NULL')

sf_userID_df['nanoHUB_user_ID__c'] = sf_userID_df['nanoHUB_user_ID__c'].astype('int')

[Success] Bulk job creation successful. Job ID = 7505w00000M3BmEAAV
{"id":"7505w00000M3BmEAAV","operation":"query","object":"Contact","createdById":"0055w00000ArpYvAAJ","createdDate":"2020-05-19T22:41:55.000+0000","systemModstamp":"2020-05-19T22:41:55.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","retries":0,"totalProcessingTime":0}
{"id":"7505w00000M3BmEAAV","operation":"query","object":"Contact","createdById":"0055w00000ArpYvAAJ","createdDate":"2020-05-19T22:41:55.000+0000","systemModstamp":"2020-05-19T22:41:56.000+0000","state":"InProgress","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":0,"retries":0,"totalProcessingTime":0}
{"id":"7505w00000M3BmEAAV","operation":"query","object":"Contact","createdById":"0055w00000ArpYvAAJ","createdDate":"2020-05-19T22:41:55

In [23]:
# get Salesforce ID for citations
sf_citationID_df = db_s.query_data('SELECT Id, Record_ID__c FROM nanoHUB_citations__c')

sf_citationID_df['Record_ID__c'] = sf_citationID_df['Record_ID__c'].astype('int')

[Success] Bulk job creation successful. Job ID = 7505w00000M3C1EAAV
{"id":"7505w00000M3C1EAAV","operation":"query","object":"nanoHUB_citations__c","createdById":"0055w00000ArpYvAAJ","createdDate":"2020-05-19T22:42:19.000+0000","systemModstamp":"2020-05-19T22:42:19.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","retries":0,"totalProcessingTime":0}
{"id":"7505w00000M3C1EAAV","operation":"query","object":"nanoHUB_citations__c","createdById":"0055w00000ArpYvAAJ","createdDate":"2020-05-19T22:42:19.000+0000","systemModstamp":"2020-05-19T22:42:20.000+0000","state":"JobComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":2449,"retries":0,"totalProcessingTime":314}
[Success] Bulk job completed successfully.


In [24]:
# get Salesforce ID for organizations
sf_orgID_df = db_s.query_data('SELECT Id, Domain__c FROM organization__c')

[Success] Bulk job creation successful. Job ID = 7505w00000M3C1OAAV
{"id":"7505w00000M3C1OAAV","operation":"query","object":"organization__c","createdById":"0055w00000ArpYvAAJ","createdDate":"2020-05-19T22:42:30.000+0000","systemModstamp":"2020-05-19T22:42:31.000+0000","state":"InProgress","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":0,"retries":0,"totalProcessingTime":0}
{"id":"7505w00000M3C1OAAV","operation":"query","object":"organization__c","createdById":"0055w00000ArpYvAAJ","createdDate":"2020-05-19T22:42:30.000+0000","systemModstamp":"2020-05-19T22:42:31.000+0000","state":"JobComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":3462,"retries":0,"totalProcessingTime":377}
[Success] Bulk job completed successfully.


## Match data with Salesforce format

In [25]:
# valid citation - nanohub user links
ac_tolink_df = derived_authors_df[derived_authors_df.nanohub_user_id != 0]

display(ac_tolink_df.head(2))
display(sf_citationID_df.head(2))
display(sf_userID_df.head(2))
display(sf_orgID_df.head(2))

Unnamed: 0,citation_ID,nanohub_user_id,author_id,organization,orcid,research_id,gscholar_id,domain_cleaned_set,domain_cleaned_hash,domain_by_citation
0,10000003,10676,4613,Purdue University,,,,"{university, purdue}",purdue-university,purdue.edu
2,10000003,37033,4615,Purdue University,,,,"{university, purdue}",purdue-university,purdue.edu


Unnamed: 0,Id,Record_ID__c
0,a0t5w000008p7pSAAQ,1279
1,a0t5w000008p7pTAAQ,1535


Unnamed: 0,Id,nanoHUB_user_ID__c
0,0035w000031Vsp1AAC,998
1,0035w000031Vsp2AAC,1683


Unnamed: 0,Domain__c,Id
0,upenn.edu,a0r5w00000V42c0AAB
1,fer.hr,a0r5w00000V42c1AAB


In [26]:
# merge SF citation and contact IDs into user-citation DF
ac_tolink_df = pd.merge(ac_tolink_df, sf_citationID_df, how='inner', left_on='citation_ID', right_on='Record_ID__c')\
                           .rename(columns={'Id':'SF_ID_citation'})
                                            
ac_tolink_df = pd.merge(ac_tolink_df, sf_userID_df, how='inner', left_on='nanohub_user_id', right_on='nanoHUB_user_ID__c')\
                           .rename(columns={'Id':'SF_ID_contact'})

ac_tolink_df = pd.merge(ac_tolink_df, sf_orgID_df, how='left', left_on='domain_by_citation', right_on='Domain__c')\
                           .rename(columns={'Id':'SF_ID_organization'})

# display
ac_tolink_df.head(2)

Unnamed: 0,citation_ID,nanohub_user_id,author_id,organization,orcid,research_id,gscholar_id,domain_cleaned_set,domain_cleaned_hash,domain_by_citation,SF_ID_citation,Record_ID__c,SF_ID_contact,nanoHUB_user_ID__c,Domain__c,SF_ID_organization
0,2815,30286,11780986,Clarkson University,,,,"{university, clarkson}",clarkson-university,clarkson.edu,a0t5w000008p88HAAQ,2815,0035w000034JLRmAAO,30286,clarkson.edu,a0r5w00000V42nCAAR
1,2233,30286,11778497,Clarkson University,,,,"{university, clarkson}",clarkson-university,clarkson.edu,a0t5w000008p7udAAA,2233,0035w000034JLRmAAO,30286,clarkson.edu,a0r5w00000V42nCAAR


In [27]:
# create a new column for object name
ac_tolink_df['Name'] = ac_tolink_df.apply(lambda x: '%d_%d'%(x.nanoHUB_user_ID__c, x.Record_ID__c), axis=1)

In [28]:
df_sf = pd.DataFrame()

# Make sure NaN and NaT values are taken care of here
df_sf['Name']         = ac_tolink_df['Name']
df_sf['Contact__c'] = ac_tolink_df['SF_ID_contact']
df_sf['Citation__c'] = ac_tolink_df['SF_ID_citation']

df_sf['gscholar_id__c'] = ac_tolink_df['gscholar_id'].fillna('')
df_sf['ORCID__c'] = ac_tolink_df['orcid'].fillna('')
df_sf['Organization__c'] = ac_tolink_df['SF_ID_organization']
df_sf['Organization_nanohub__c'] = ac_tolink_df['organization'].fillna('')
df_sf['research_id__c'] = ac_tolink_df['research_id'].fillna('')

sf_original_fields = df_sf.columns

# display
df_sf.head(2).T

Unnamed: 0,0,1
Name,30286_2815,30286_2233
Contact__c,0035w000034JLRmAAO,0035w000034JLRmAAO
Citation__c,a0t5w000008p88HAAQ,a0t5w000008p7udAAA
gscholar_id__c,,
ORCID__c,,
Organization__c,a0r5w00000V42nCAAR,a0r5w00000V42nCAAR
Organization_nanohub__c,Clarkson University,Clarkson University
research_id__c,,


## To Salesforce Sales Cloud CRM

In [29]:
# create DB2 to Salesforce API object
db_s = DB2SalesforceAPI(sf_login_params)

# specify Salesforce object ID and external ID
db_s.object_id = 'contact_citation_asso__c'
db_s.external_id = 'Name'

Obtained Salesforce access token ...... True


In [30]:
# send data to Salesforce
db_s.send_data(df_sf)

[Success] Bulk job creation successful. Job ID = 7505w00000M3C1TAAV
[Success] CSV upload successful. Job ID = 7505w00000M3C1TAAV
[Success] Closing job successful. Job ID = 7505w00000M3C1TAAV


In [33]:
# check status
from pprint import pprint

pprint(db_s.check_bulk_status())

{'apexProcessingTime': 0,
 'apiActiveProcessingTime': 12113,
 'apiVersion': 47.0,
 'columnDelimiter': 'COMMA',
 'concurrencyMode': 'Parallel',
 'contentType': 'CSV',
 'createdById': '0055w00000ArpYvAAJ',
 'createdDate': '2020-05-19T22:42:42.000+0000',
 'externalIdFieldName': 'Name',
 'id': '7505w00000M3C1TAAV',
 'jobType': 'V2Ingest',
 'lineEnding': 'LF',
 'numberRecordsFailed': 512,
 'numberRecordsProcessed': 5161,
 'object': 'contact_citation_asso__c',
 'operation': 'upsert',
 'retries': 0,
 'state': 'JobComplete',
 'systemModstamp': '2020-05-19T22:42:59.000+0000',
 'totalProcessingTime': 14743}


In [45]:
# check status
from pprint import pprint

pprint(db_s.check_bulk_failed_results())

('"sf__Id","sf__Error",Citation__c,Contact__c,Name,ORCID__c,Organization__c,Organization_nanohub__c,gscholar_id__c,research_id__c\n'
 '"","DUPLICATE_VALUE:Duplicate external id specified: 65591_2813:Name '
 '--","a0t5w000008p88LAAQ","0035w000031Vu0IAAS","65591_2813","","","University '
 'of Oxford","",""\n'
 '"","DUPLICATE_VALUE:Duplicate external id specified: 65591_2813:Name '
 '--","a0t5w000008p88LAAQ","0035w000031Vu0IAAS","65591_2813","","","University '
 'of Oxford","",""\n'
 '"","DUPLICATE_VALUE:Duplicate external id specified: 65591_2248:Name '
 '--","a0t5w000008p8CXAAY","0035w000031Vu0IAAS","65591_2248","","","University '
 'of Oxford","",""\n'
 '"","DUPLICATE_VALUE:Duplicate external id specified: 65591_2248:Name '
 '--","a0t5w000008p8CXAAY","0035w000031Vu0IAAS","65591_2248","","","University '
 'of Oxford","",""\n'
 '"","DUPLICATE_VALUE:Duplicate external id specified: 55728_1017:Name '
 '--","a0t5w000008p88vAAA","0035w000034JlMjAAK","55728_1017","","","North '
 'South Univer