# DB2-Salesforce connector: Citation mapping to users

In [1]:
# API settings
api_url = '/services/data/v43.0/sobjects'
external_id = 'Name'
object_id = 'contact_citation_asso__c'

from nanoHUB.application import Application

application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')
wang159_myrmekes_db = application.new_db_engine('wang159_myrmekes')

salesforce = application.new_salesforce_engine()
db_s = salesforce

[1mnanoHUB - Serving Students, Researchers & Instructors[0m


Obtained Salesforce access token ...... True


In [2]:
import sys
sys.path.append('/home/users/wang2506/nanohub_salesforce_integ/salesforce')
    
import pandas as pd
import time
import datetime



## Obtain tool information from DB2 

In [3]:


# Citations
sql_query = "select * from jos_citations"

citations_df = pd.read_sql_query(sql_query, nanohub_db)

# Citation authors
sql_query = "select * from jos_citations_authors;"

authors_df = pd.read_sql_query(sql_query, nanohub_db)

In [4]:
# replace N/A and empty values with None
authors_df.organization.replace('N/A', '', inplace=True)

In [5]:
authors_df.columns

Index(['id', 'cid', 'author', 'authorid', 'uidNumber', 'ordering', 'givenName',
       'middleName', 'surname', 'organization', 'org_dept', 'orgtype',
       'countryresident', 'email', 'ip', 'host', 'countrySHORT', 'countryLONG',
       'ipREGION', 'ipCITY', 'ipLATITUDE', 'ipLONGITUDE', 'in_network',
       'orcid', 'research_id', 'gscholar_id', 'scopus_id', 'researchgate_id',
       'notes'],
      dtype='object')

In [6]:
# combining authors and citations
ac_df = pd.merge(citations_df[['id']], authors_df[['cid', 'uidNumber','id','organization','orcid','research_id','gscholar_id']], how='inner',\
                 left_on='id', right_on='cid', suffixes=['_c', '_a'])\
                .drop(columns='id_c')

ac_df.rename(columns={'cid':'citation_ID', 'uidNumber':'nanohub_user_id', 'id_a':'author_id'}, inplace=True)

# display
ac_df.head(3)

Unnamed: 0,citation_ID,nanohub_user_id,author_id,organization,orcid,research_id,gscholar_id
0,10000003,10676,4613,Purdue University,,,
1,10000003,0,4614,Purdue University,,,
2,10000003,37033,4615,Purdue University,,,


In [7]:
# Obtain company domain information from DB2
domain_df = pd.read_sql_query("select name as domain_name, domain, industry, `size range` as size, country \
from wang159_myrmekes.companies_email_domain", wang159_myrmekes_db)

# make sure domain is unique and drop NaN
domain_df = domain_df.drop_duplicates(subset='domain', keep='first')

In [8]:
import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords
s_words = stopwords.words('english')

In [9]:
import re

def clean_domain(this_domain_name):
    
    if not this_domain_name:
        return None
    
    # Replace all non-alphanumeric characters with space
    this_domain_name = re.sub("[^0-9a-zA-Z]+", " ", this_domain_name.lower())
    name_list = this_domain_name.split(' ')
        
    # remove all stop words
    name_list = ['' if x in s_words else x for x in name_list]
    
    return set(filter(None, name_list))

In [10]:
# clean domain name
domain_df['domain_cleaned_set'] = domain_df.domain_name.apply(clean_domain)

# hash cleaned domain name
domain_df['domain_cleaned_hash'] = domain_df['domain_cleaned_set'].apply(lambda x: '-'.join(sorted(list(x))) if x else None)

In [11]:
# clean domain name
ac_df['domain_cleaned_set'] = ac_df.organization.apply(clean_domain)

# hash cleaned domain name
ac_df['domain_cleaned_hash'] = ac_df['domain_cleaned_set'].apply(lambda x: '-'.join(sorted(list(x))) if x else None)

In [12]:
# get domain subset that contains domain_cleaned_hash in authors_df
domain_subset_df = domain_df[domain_df.domain_cleaned_hash.isin(ac_df.domain_cleaned_hash.unique())]\
                                        [['domain', 'domain_cleaned_hash']]
domain_subset_df = domain_subset_df[domain_subset_df.domain.notnull()&domain_subset_df.domain_cleaned_hash.notnull()]

domain_subset_all_df = domain_subset_df.rename(columns={'domain_cleaned_hash':'domain_hash'}).drop_duplicates()

In [13]:
# attempt direct join by hash
derived_authors_df = pd.merge(ac_df, domain_subset_all_df\
                              ,how='left', left_on='domain_cleaned_hash', right_on='domain_hash')\
                              .drop('domain_hash', axis=1)\
                              .rename(columns={'domain':'domain_by_citation'})

# display
derived_authors_df.sample(5).T

Unnamed: 0,12737,15688,4496,10425,41
citation_ID,881,2602,10001124,1445,10000010
nanohub_user_id,0,0,2862,0,3482
author_id,16844391,16841621,9054,16846453,4652
organization,Universite of Lille,Thermo Fisher Scientific Inc.,Purdue University,"Univ. of Texas, Austin",Purdue University
orcid,,,,,
research_id,,,,,
gscholar_id,,,,,
domain_cleaned_set,"{lille, universite}","{thermo, inc, scientific, fisher}","{purdue, university}","{texas, univ, austin}","{purdue, university}"
domain_cleaned_hash,lille-universite,fisher-inc-scientific-thermo,purdue-university,austin-texas-univ,purdue-university
domain_by_citation,,,purdue.edu,,purdue.edu


In [14]:
# rank the nanoHUB profile institutions that cannot be directly matched by occurance

derived_authors_subset_df = derived_authors_df[(derived_authors_df.domain_cleaned_hash.notnull() \
                                                                & derived_authors_df.domain_by_citation.isna())]

most_common_sets = derived_authors_subset_df.domain_cleaned_set.value_counts()

# display
most_common_sets.sort_values(ascending=False)

TypeError: unhashable type: 'set'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas/_libs/hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'set'


{south, new, university, wales}                                           263
{institute, technology, california}                                       229
{technology, delft, university}                                           153
{systems, raytheon}                                                       103
{state, pennsylvania, university}                                          96
                                                                         ... 
{dhirubhai, technology, information, institute, communication, ambani}      1
{technical, college, forsyth}                                               1
{institute, science, technology, nano}                                      1
{inc, facebook}                                                             1
{republic, academy, institute, physics, sciences, czech}                    1
Name: domain_cleaned_set, Length: 965, dtype: int64

In [15]:
# attempt to reduce domain_df size to speedup Jaccard calculation
from itertools import chain

all_word_set = set(chain.from_iterable(derived_authors_subset_df.domain_cleaned_set.values))

# select the domain_df entry with words from all_word_set
domain_subset_df = domain_df[domain_df.domain_cleaned_set.notnull()]
domain_subset_df = domain_subset_df[domain_subset_df.domain_cleaned_set\
                             .apply(lambda x: True if len(x.intersection(all_word_set)) > 0 else False)]

domain_subset_df = domain_subset_df[['domain_cleaned_set', 'domain']]

In [16]:
# Time consuming part: calculating Jaccard similarity score
def get_jaccard_score(a,b):
    
    if (not a) & (not b):
        return 0
    
    score = len(a.intersection(b))/len(a.union(b))
    
    return score


for this_set in most_common_sets.index:
    # calculate the Jaccard similarity
    max_index = domain_subset_df.domain_cleaned_set.apply(lambda x: get_jaccard_score(x, this_set)).idxmax()

    derived_authors_df.loc[derived_authors_df.domain_cleaned_set == this_set, 'domain_by_citation'] \
                = domain_subset_df.loc[max_index, 'domain']

In [None]:
# display
derived_authors_df.sample(5).T

## Obtain Salesforce IDs

In [None]:
# create DB2 to Salesforce API object
db_s = salesforce

In [None]:
# query the Salesforce IDs for contacts and citations. when updating junction objects, these IDs must be used

# get Salesforce ID for contacts
sf_userID_df = db_s.query_data('SELECT Id, nanoHUB_user_ID__c FROM Contact where nanoHUB_user_ID__c != NULL')

sf_userID_df['nanoHUB_user_ID__c'] = sf_userID_df['nanoHUB_user_ID__c'].astype('int')

In [None]:
# get Salesforce ID for citations
sf_citationID_df = db_s.query_data('SELECT Id, Record_ID__c FROM nanoHUB_citations__c')

sf_citationID_df['Record_ID__c'] = sf_citationID_df['Record_ID__c'].astype('int')

In [None]:
# get Salesforce ID for organizations
sf_orgID_df = db_s.query_data('SELECT Id, Domain__c FROM organization__c')

## Match data with Salesforce format

In [None]:
# valid citation - nanohub user links
ac_tolink_df = derived_authors_df[derived_authors_df.nanohub_user_id != 0]

display(ac_tolink_df.head(2))
display(sf_citationID_df.head(2))
display(sf_userID_df.head(2))
display(sf_orgID_df.head(2))

In [None]:
# merge SF citation and contact IDs into user-citation DF
ac_tolink_df = pd.merge(ac_tolink_df, sf_citationID_df, how='inner', left_on='citation_ID', right_on='Record_ID__c')\
                           .rename(columns={'Id':'SF_ID_citation'})
                                            
ac_tolink_df = pd.merge(ac_tolink_df, sf_userID_df, how='inner', left_on='nanohub_user_id', right_on='nanoHUB_user_ID__c')\
                           .rename(columns={'Id':'SF_ID_contact'})

ac_tolink_df = pd.merge(ac_tolink_df, sf_orgID_df, how='left', left_on='domain_by_citation', right_on='Domain__c')\
                           .rename(columns={'Id':'SF_ID_organization'})

# display
ac_tolink_df.head(2)

In [None]:
# create a new column for object name
ac_tolink_df['Name'] = ac_tolink_df.apply(lambda x: '%d_%d'%(x.nanoHUB_user_ID__c, x.Record_ID__c), axis=1)

In [None]:
df_sf = pd.DataFrame()

# Make sure NaN and NaT values are taken care of here
df_sf['Name']         = ac_tolink_df['Name']
df_sf['Contact__c'] = ac_tolink_df['SF_ID_contact']
df_sf['Citation__c'] = ac_tolink_df['SF_ID_citation']

df_sf['gscholar_id__c'] = ac_tolink_df['gscholar_id'].fillna('')
df_sf['ORCID__c'] = ac_tolink_df['orcid'].fillna('')
df_sf['Organization__c'] = ac_tolink_df['SF_ID_organization']
df_sf['Organization_nanohub__c'] = ac_tolink_df['organization'].fillna('')
df_sf['research_id__c'] = ac_tolink_df['research_id'].fillna('')

sf_original_fields = df_sf.columns

# display
df_sf.head(2).T

## To Salesforce Sales Cloud CRM

In [None]:
# create DB2 to Salesforce API object
db_s = salesforce

# specify Salesforce object ID and external ID
db_s.object_id = 'contact_citation_asso__c'
db_s.external_id = 'Name'

In [None]:
# send data to Salesforce
db_s.send_data(df_sf)

In [None]:
# check status
from pprint import pprint

pprint(db_s.check_bulk_status())

In [None]:
# check status
from pprint import pprint

pprint(db_s.check_bulk_failed_results())