# DB2-Salesforce connector: Prototype Translate contact and citation user-filled organization to match Organization object

In [1]:
# Parameters
hours_range = 24*365*30 # number of days to look back

# API settings
bulk_api_threshold = 100 # if more records than this, use Bulk API instead
disallow_bulk_api = False # if Bulk API is allowed

api_url = '/services/data/v43.0/sobjects'
external_id = 'nanoHUB_user_ID__c'
object_id = 'Contact'

# login parameters to be handled by Papermill
sf_login_params = None 
sql_login_params = None

In [2]:
# Parameters
sql_login_params = {"username": "wang159_ro", "password": "napoleon0"}
sf_login_params = {
    "grant_type": "password",
    "client_id": "3MVG95jctIhbyCppj0SNJ75IsZ1y8UPGZtSNF4j8FNVXz.De8Lu4jHm3rjRosAtsHy6qjHx3i4S_QbQzvBePG",
    "client_secret": "D1623C6D3607D4FC8004B92C761DFB6C1F70CCD129C5501E357028DFA00F5764",
    "username": "wang159-4j1v@force.com",
    "password": "napoleon0eZ3PQpQqE3C3z4wWqEqKGhQ8",
}


In [3]:
import pandas as pd
import datetime

## Obtain contacts from DB2 
that have last visit date within range of interest specified by day_range

In [4]:
# Depending on the task, use different query and cutoff

# Hourly update for new registrations
date_cutoff = (datetime.datetime.today().date() - datetime.timedelta(hours=hours_range))\
                    .strftime('%Y-%m-%d')
sql_query = "select id, name, username, block, email, sendEmail, registerDate, lastvisitDate \
                    from jos_users where registerDate >= '%s'" % date_cutoff

# display
print(sql_query)

select id, name, username, block, email, sendEmail, registerDate, lastvisitDate                     from jos_users where registerDate >= '1990-05-27'


In [5]:
# connect with DB2
import sqlalchemy as sql

engine = sql.create_engine('mysql+pymysql://%s:%s@127.0.0.1/nanohub' \
                                               %(sql_login_params['username'], sql_login_params['password']))
df = pd.read_sql_query(sql_query, engine)

# get user profile details
profile_df = pd.read_sql_query("select * from jos_user_profiles where profile_key in ('orgtype', 'organization', 'orcid')", engine)

In [6]:
# display
display(df.head(1))

display(profile_df.head(1))

Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate
0,998,hubrepo hubrepo,hubrepo,0,nkissebe@gmail.com,0,2014-11-13 21:09:09,NaT


Unnamed: 0,id,user_id,profile_key,profile_value,ordering,access
0,1,15623,orgtype,university,2,5


In [7]:
df = pd.merge(df, profile_df[profile_df['profile_key'] == 'orgtype'][['user_id', 'profile_value']],\
                         how='left', left_on='id', right_on='user_id')\
    .drop(columns=['user_id']).rename(columns={'profile_value':'orgtype'})

df = pd.merge(df, profile_df[profile_df['profile_key'] == 'organization'][['user_id', 'profile_value']],\
                         how='left', left_on='id', right_on='user_id')\
    .drop(columns=['user_id']).rename(columns={'profile_value':'organization'})

df = pd.merge(df, profile_df[profile_df['profile_key'] == 'orcid'][['user_id', 'profile_value']],\
                         how='left', left_on='id', right_on='user_id')\
    .drop(columns=['user_id']).rename(columns={'profile_value':'orcid'})

# display
display(df.head(1))

Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid
0,998,hubrepo hubrepo,hubrepo,0,nkissebe@gmail.com,0,2014-11-13 21:09:09,NaT,,,


In [8]:
# Obtain company domain information from DB2
domain_df = pd.read_sql_query("select name as domain_name, domain, industry, `size range` as size, country \
from wang159_myrmekes.companies_email_domain", engine)

In [9]:
# make sure domain is unique and drop NaN
domain_df = domain_df.drop_duplicates(subset='domain', keep='first')

In [10]:
# get email domain of nanoHUB users
def get_domain(this_email):
    seg_list = this_email.split('@')
    
    if len(seg_list) == 2:
        return seg_list[1].lower()
    else:
        return None
    
df['email_host'] = df['email'].apply(get_domain)

## Build interpretor for user-filled Organization

In [11]:
# given data
display(domain_df.head(2))
display(df.head(2))

Unnamed: 0,domain_name,domain,industry,size,country
0,strategic interns,strategicinterns.com,education management,1-10,india
1,bosque y comunidad,bosqueycomunidad.org,non-profit organization management,1-10,spain


Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid,email_host
0,998,hubrepo hubrepo,hubrepo,0,nkissebe@gmail.com,0,2014-11-13 21:09:09,NaT,,,,gmail.com
1,1683,nanoHUB support,support,0,support@nanohub.org,0,2008-11-19 22:51:04,2008-11-19 23:55:30,university,Purdue University,,nanohub.org


In [12]:
# get domain by nanoHUB organization field
df['organization'] = df.organization.astype('str').apply(str.lower)

In [13]:
df.loc[(df['organization']=='nan'),'organization'] = None

## Match using Jaccard Similarity

In [14]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
s_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/users/wang159/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [118]:
import re

def clean_domain(this_domain_name):
    
    if not this_domain_name:
        return None
    
    # Replace all non-alphanumeric characters with space
    this_domain_name = re.sub("[^0-9a-zA-Z]+", " ", this_domain_name.lower())
    name_list = this_domain_name.split(' ')
        
    # remove all stop words
    name_list = ['' if x in s_words else x for x in name_list]
    
    return set(filter(None, name_list))

In [16]:
# clean domain name
domain_df['domain_cleaned_set'] = domain_df.domain_name.apply(clean_domain)

# clean nanohub org name
df['org_cleaned_set'] = df.organization.apply(clean_domain)

In [17]:
# hash cleaned domain name
domain_df['domain_cleaned_hash'] = domain_df['domain_cleaned_set'].apply(lambda x: '-'.join(sorted(list(x))) if x else None)

# hash cleaned clean nanohub org name
df['org_cleaned_hash'] = df['org_cleaned_set'].apply(lambda x: '-'.join(sorted(list(x))) if x else None)

## Collaborative filtering for commerical email hosts

In [18]:
df.head(2).T

Unnamed: 0,0,1
id,998,1683
name,hubrepo hubrepo,nanoHUB support
username,hubrepo,support
block,0,0
email,nkissebe@gmail.com,support@nanohub.org
sendEmail,0,0
registerDate,2014-11-13 21:09:09,2008-11-19 22:51:04
lastvisitDate,NaT,2008-11-19 23:55:30
orgtype,,university
organization,,purdue university


In [19]:
def predict_if_commerical(this_email_host_df):
    # predict if this email host is commerical
    top_cnt = this_email_host_df['org_cleaned_hash'].value_counts()
    
    if top_cnt.shape[0] > 0:
        if (top_cnt.iloc[0]/top_cnt.sum()) < 0.2:
            return True
        else:
            return False
    
    else:
        return False
    
is_email_commerical = df[['email_host', 'org_cleaned_hash']].groupby('email_host').apply(predict_if_commerical)

In [20]:
# mark commerical email addresses
df = pd.merge(df, is_email_commerical.reset_index(name='is_email_commerical'),\
         how='left', left_on='email_host', right_on='email_host' ,)

In [58]:
# obtain domain estimation from email addresses
df = pd.merge(df, domain_df[~domain_df.domain.isin(is_email_commerical[is_email_commerical].index)]['domain'],\
         how='left', left_on='email_host', right_on='domain').rename(columns={'domain':'domain_by_email'})

# display
display(df.head(2).T)

Unnamed: 0,0,1
id,998,1683
name,hubrepo hubrepo,nanoHUB support
username,hubrepo,support
block,0,0
email,nkissebe@gmail.com,support@nanohub.org
sendEmail,0,0
registerDate,2014-11-13 21:09:09,2008-11-19 22:51:04
lastvisitDate,NaT,2008-11-19 23:55:30
orgtype,,university
organization,,purdue university


## Fuzzy derive organization from nanoHUB profiles

In [60]:
# attempt direct join by hash
domain_subset_df = domain_df[domain_df.domain.isin(df.email_host.unique())][['domain', 'domain_cleaned_hash']]
domain_subset_df = domain_subset_df[~domain_subset_df.domain.isna()&~domain_subset_df.domain_cleaned_hash.isna()]

derived_df = pd.merge(df, domain_subset_df.rename(columns={'domain':'domain_by_profile'}),\
         how='left', left_on='org_cleaned_hash', right_on='domain_cleaned_hash')\
        .drop('domain_cleaned_hash', axis=1)

# display
derived_df.sample(5).T

Unnamed: 0,42584,170277,169842,205287,5930
id,45919,202332,201786,270794,9355
name,Hai Dang Trinh,Simon Li,Eleicer Ching,Margaret W. Gitau,Taik Semeli
username,trinhhaidang,simon777,phstudiopanama,mgitau,taiksemeli
block,0,0,0,0,0
email,trinhhaidang@gmail.com,1084510637@qq.com,phstudiopanama@gmail.com,mgitau@purdue.edu,meckel2003@yahoo.com
sendEmail,1,1,0,0,0
registerDate,2010-07-25 15:18:51,2018-07-04 02:03:30,2018-06-25 21:58:51,2019-11-07 21:11:23,2005-07-16 06:48:55
lastvisitDate,2010-07-26 02:31:44,2018-07-18 02:33:57,2018-06-25 21:58:52,NaT,2005-08-29 20:42:20
orgtype,universityundergraduate,universityundergraduate,universityfaculty,universityfaculty,
organization,,,,purdue university,


In [62]:
# attempt to reduce domain_df size to speedup Jaccard calculation
from itertools import chain

all_word_set = set(chain.from_iterable(derived_df[(derived_df['domain_by_profile'].isna() & (~derived_df['organization'].isna()))]\
                    .org_cleaned_set.values))

# select the domain_df entry with words from all_word_set
domain_subset_df = domain_df[domain_df.domain_cleaned_set.notnull()]
domain_subset_df = domain_subset_df[domain_subset_df.domain_cleaned_set\
                             .apply(lambda x: True if len(x.intersection(all_word_set)) > 0 else False)]

In [87]:
derived_subset_df

Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid,email_host,org_cleaned_set,org_cleaned_hash,is_email_commerical,domain_by_email,domain_by_profile
8,1690,Khaled M. Dadesh,k_dadesh,0,k_dadesh@yahoo.com,0,2000-07-05 07:07:09,NaT,,centerfor solar energy studies,,yahoo.com,"{solar, centerfor, energy, studies}",centerfor-energy-solar-studies,True,,
11,1693,tien thanh nguyen,tinguyen,0,tien.t.nguye@jpl.nasa.gov,0,2000-09-06 03:12:13,NaT,,jpl,,jpl.nasa.gov,{jpl},jpl,False,,
13,1695,Richard Siergiej,siergiej,0,siergiejrr@netzero.net,0,2000-09-15 22:04:53,NaT,,"bechtel bettis, inc.",,netzero.net,"{bechtel, inc, bettis}",bechtel-bettis-inc,False,netzero.net,
17,1699,Scott Howlett,jshowlett,0,scott_howlett@yahoo.com,0,2000-09-27 21:22:03,NaT,,cadence,,yahoo.com,{cadence},cadence,True,,
18,1700,Marisa Bauza,marisa,0,marisab@dc.uba.ar,0,2000-10-05 11:52:30,NaT,,universidad de buenos aires,,dc.uba.ar,"{universidad, buenos, aires, de}",aires-buenos-de-universidad,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218426,287988,Jeff Scully,jscully86,0,jscully86@gmail.com,1,2020-05-15 19:31:05,2020-05-16 14:53:23,universityundergraduate,university of missouri-kansas city,,gmail.com,"{university, kansas, missouri, city}",city-kansas-missouri-university,True,,
218448,288017,FotoQQ PKV,foto99,1,eynmoney@gmail.com,0,2020-05-16 07:54:45,2020-05-16 07:55:46,other,pkv games,,gmail.com,"{games, pkv}",games-pkv,True,,
218483,288063,Darrin Clawson,dclaw,0,d-claw@d-claw.com,1,2020-05-16 17:55:07,2020-05-18 19:18:22,industry,individual,,d-claw.com,{individual},individual,False,,
218596,288201,Reuben Costales Lim,reubenclim,0,reuben@wmsu.edu.ph,1,2020-05-18 14:39:05,2020-05-18 15:03:20,unemployed,philippine bioengineering & nanotechnology res...,,wmsu.edu.ph,"{nanotechnology, incorporated, bioengineering,...",bioengineering-incorporated-institute-nanotech...,False,wmsu.edu.ph,


In [98]:
def get_org(this_df):
    # look at the cohort with this label, do they have a commonly agreed domain_by_email? If so, use it
    this_dbe_count = this_df.domain_by_email.value_counts()

    if this_dbe_count.shape[0] < 1:
        return None

    this_top_dbe_perc = this_dbe_count.iloc[0]/this_dbe_count.sum()

    # if top choice occupies majority of cases, then this cohort can be implied to this domain_by_email address
    if this_top_dbe_perc > 0.5:
        return this_dbe_count.index[0] 


derived_hash = derived_subset_df.groupby('org_cleaned_hash').apply(get_org)

In [104]:
derived_df = pd.merge(derived_df, derived_hash.reset_index(name='domain_by_profile_infer'), how='left', left_on='org_cleaned_hash', right_on='org_cleaned_hash')

In [183]:
# display
derived_df.head(4).T

Unnamed: 0,0,1,2,3
id,998,1683,1684,1685
name,hubrepo hubrepo,nanoHUB support,Grid Statistics,NCN
username,hubrepo,support,gridstat,ncn
block,0,0,0,0
email,nkissebe@gmail.com,support@nanohub.org,gridstat@nanohub.org,ncn@nanohub.org
sendEmail,0,0,0,0
registerDate,2014-11-13 21:09:09,2008-11-19 22:51:04,2008-11-18 17:29:56,2008-11-11 19:17:04
lastvisitDate,NaT,2008-11-19 23:55:30,2020-02-14 18:50:14,NaT
orgtype,,university,universitystaff,university
organization,,purdue university,purdue university,purdue university


## Relating user-filled organization from citation to standard table

In [159]:
# connect with DB2
import sqlalchemy as sql

# Citation authors
sql_query = "select * from jos_citations_authors"

authors_df = pd.read_sql_query(sql_query, engine)

In [160]:
authors_df.head(2)

Unnamed: 0,id,cid,author,authorid,uidNumber,ordering,givenName,middleName,surname,organization,...,ipCITY,ipLATITUDE,ipLONGITUDE,in_network,orcid,research_id,gscholar_id,scopus_id,researchgate_id,notes
0,189134,10001576,Woody Gilbertson,0,100100,4,Woody,,Gilbertson,Purdue University,...,,,,0,,,,,,
1,189135,10001576,Hesameddin Ilatikhameneh,0,42735,5,Hesameddin,,Ilatikhameneh,Purdue University,...,,,,0,,,,,,


In [175]:
# replace N/A and empty values with None
authors_df.organization.replace('N/A', '', inplace=True)

In [176]:
# clean domain name
authors_df['domain_cleaned_set'] = authors_df.organization.apply(clean_domain)

# hash cleaned domain name
authors_df['domain_cleaned_hash'] = authors_df['domain_cleaned_set'].apply(lambda x: '-'.join(sorted(list(x))) if x else None)

In [177]:
# display
authors_df.head(4).T

Unnamed: 0,0,1,2,3
id,189134,189135,189132,189133
cid,10001576,10001576,10001576,10001576
author,Woody Gilbertson,Hesameddin Ilatikhameneh,Daniel F Mejia,James Charles
authorid,0,0,0,0
uidNumber,100100,42735,52349,65635
ordering,4,5,2,3
givenName,Woody,Hesameddin,Daniel,James
middleName,,,F,
surname,Gilbertson,Ilatikhameneh,Mejia,Charles
organization,Purdue University,Purdue University,Purdue University,Purdue University


In [186]:
derived_df.head(1)

Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid,email_host,org_cleaned_set,org_cleaned_hash,is_email_commerical,domain_by_email,domain_by_profile,domain_by_profile_infer
0,998,hubrepo hubrepo,hubrepo,0,nkissebe@gmail.com,0,2014-11-13 21:09:09,NaT,,,,gmail.com,,,True,,,


In [193]:
domain_subset_df.rename(columns={'domain_cleaned_hash':'domain_hash'})

Unnamed: 0,domain,domain_hash
8323,rocorange.com,syracuse-university
9153,pu.edu.pk,punjab-university
13875,vt.edu,tech-virginia
17503,whoi.edu,hole-institution-oceanographic-woods
18451,washington.edu,university-washington
...,...,...
7091426,gsk.no,glaxosmithkline
7093609,armleads.com,arm-inc
7097593,ntu.edu.sg,nanyang-technological-university
7101385,furman.edu,furman-university


In [226]:
#derived_df[derived_df.domain_cleaned_hash.isin(authors_df.domain_cleaned_hash.unique())]\
#                                        [['domain', 'domain_cleaned_hash']]

#derived_df[derived_df.domain_by_profile_infer.notnull()][['org_cleaned_hash', 'domain_by_profile_infer']]\
#    .rename(columns={'org_cleaned_hash':'domain_hash', 'domain_by_profile_infer':'domain'})\
#    .stack()

# get domain subset that contains domain_cleaned_hash in authors_df
domain_subset_df = domain_df[domain_df.domain_cleaned_hash.isin(authors_df.domain_cleaned_hash.unique())][['domain', 'domain_cleaned_hash']]
domain_subset_df = domain_subset_df[domain_subset_df.domain.notnull()&domain_subset_df.domain_cleaned_hash.notnull()]

domain_subset_all_df = pd.concat([derived_df[derived_df.domain_by_profile_infer.notnull()]\
                                                              [['org_cleaned_hash', 'domain_by_profile_infer']]\
                                                              .rename(columns={'org_cleaned_hash':'domain_hash', 'domain_by_profile_infer':'domain'})\
                                                        , domain_subset_df.rename(columns={'domain_cleaned_hash':'domain_hash'})]\
                                                        , sort=False).drop_duplicates()

In [229]:
# attempt direct join by hash
derived_authors_df = pd.merge(authors_df, domain_subset_all_df\
                              ,how='left', left_on='domain_cleaned_hash', right_on='domain_hash')\
                              .drop('domain_hash', axis=1)\
                              .rename(columns={'domain':'domain_by_citation'})

# display
derived_authors_df.sample(5).T

Unnamed: 0,4532,15987,3917,5722,6629
id,8919,11771473,8324,10101,11009
cid,10001085,3,10000955,10001352,10001533
author,Changwook Jeong,M. Korkusinski,Mark S. Lundstrom,Khairul Alam,Yaohua Tan
authorid,0,10,0,0,0
uidNumber,19217,8748,2862,0,0
ordering,2,1,3,3,1
givenName,Changwook,M.,Mark,Khairul,Yaohua
middleName,,,,,
surname,Jeong,Korkusinski,Lundstrom,Alam,Tan
organization,Purdue University,Purdue University,Purdue University,University of California,Purdue University


In [237]:
# rank the nanoHUB profile institutions that cannot be directly matched by occurance

derived_authors_subset_df = derived_authors_df[(derived_authors_df.domain_cleaned_hash.notnull() \
                                                                & derived_authors_df.domain_by_citation.isna())]

most_common_sets = derived_authors_subset_df.domain_cleaned_set.value_counts()

# display
most_common_sets.head(5)

{systems, raytheon}                                           103
{systems, raytheon, ti}                                        88
{instruments, incorporated, texas}                             63
{university, engineering, technology, bangladesh}              45
{purdue, university, sciences, microstructural, institute}     35
Name: domain_cleaned_set, dtype: int64

In [271]:
# attempt to reduce domain_df size to speedup Jaccard calculation
from itertools import chain

all_word_set = set(chain.from_iterable(derived_authors_subset_df.domain_cleaned_set.values))

# select the domain_df entry with words from all_word_set
domain_subset_df = domain_df[domain_df.domain_cleaned_set.notnull()]
domain_subset_df = domain_subset_df[domain_subset_df.domain_cleaned_set\
                             .apply(lambda x: True if len(x.intersection(all_word_set)) > 0 else False)]

domain_subset_df = domain_subset_df[['domain_cleaned_set', 'domain']]

In [264]:
most_common_sets.head(5).index

Index([                                             {'systems', 'raytheon'},
                                              {'systems', 'raytheon', 'ti'},
                                   {'instruments', 'incorporated', 'texas'},
                  {'university', 'engineering', 'technology', 'bangladesh'},
       {'purdue', 'university', 'sciences', 'microstructural', 'institute'}],
      dtype='object')

In [290]:
domain_subset_df

Unnamed: 0,domain_cleaned_set,domain
3,"{tws, technology, inc}",talkwithsam.co
4,"{cardinal, llc, strategies}",cardinal-strategies.com
5,"{ground, sports, limited, didsbury}",didsburysportsground.co.uk
12,"{f, rsat, koltu, u}",firsatkoltugu.com
13,"{hong, kong, uil, limited}",usived.com.br
...,...,...
7173418,"{p, biuro, us, acowych, personelle, ug, kadrowo}",personelle.pl
7173420,"{home, health, valley, eastern}",evhomehealth.com
7173421,{p},rmatus.com
7173422,"{comercio, de, industria, e, peixes, pesca}",sopescabrasil.com.br


In [295]:
def get_jaccard_score(a,b):
    
    if (not a) & (not b):
        return 0
    
    score = len(a.intersection(b))/len(a.union(b))
    
    return score


for this_set in most_common_sets.head(10).index:
    # calculate the Jaccard similarity
    max_index = domain_subset_df.domain_cleaned_set.apply(lambda x: get_jaccard_score(x, this_set)).idxmax()

    derived_authors_df.loc[derived_authors_df.domain_cleaned_set == this_set, 'domain_by_citation'] \
                = domain_subset_df.loc[max_index, 'domain']

In [299]:
derived_authors_df.sample(3).T

Unnamed: 0,2294,8956,6932
id,6727,11777731,11779578
cid,10000512,2212,2006
author,Shaikh S. Ahmed,Samuel Reeve,Nitin Rathi
authorid,0,6994,5307
uidNumber,9293,85981,124877
ordering,7,0,2
givenName,Shaikh,Samuel,Nitin
middleName,,,
surname,Ahmed,Reeve,Rathi
organization,Purdue University,Purdue University,University Of South Florida


## Update citation information on Salesforce with domain information

In [None]:
from DB2SalesforceAPI import DB2SalesforceAPI

# create DB2 to Salesforce API object
db_s = DB2SalesforceAPI(sf_login_params)

# query the Salesforce IDs for contacts and citations. when updating junction objects, these IDs must be used

# get Salesforce ID for contacts
sf_cca_df = db_s.query_data('SELECT Id, nanoHUB_user_ID__c FROM contact_citation_asso__c where nanoHUB_user_ID__c != NULL')

sf_userID_df['nanoHUB_user_ID__c'] = sf_userID_df['nanoHUB_user_ID__c'].astype('int')