# DB2-Salesforce connector: Determine cluster organization

In [1]:
from nanoHUB.application import Application

# Salesforce REST settings kept for the connector
# NOTE(review): none of these three names is read by the cells shown below - confirm they are still needed
api_url = '/services/data/v43.0/sobjects'
external_id = 'nanoHUB_user_ID__c'
object_id = 'Contact'

# One shared application instance hands out both the DB2 engine and the Salesforce engine
application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')

# db_s and salesforce are two names for the same Salesforce engine object
db_s = salesforce = application.new_salesforce_engine()

[1mnanoHUB - Serving Students, Researchers & Instructors[0m


Obtained Salesforce access token ...... True


In [2]:
import pandas as pd
import datetime
import numpy as np
pd.options.mode.chained_assignment = None

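## Obtain contacts from DB2

Intended scope: contacts whose last visit date falls within the range of interest specified by `day_range`. Note that the query below currently selects every row of `jos_users` and applies no date cutoff.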

In [3]:
# Depending on the task, use a different query and cutoff.
# The statement is assembled from adjacent string literals so that no source
# indentation ends up inside the SQL text (a backslash continuation inside a
# literal embeds the next line's leading spaces in the string).
sql_query = (
    "select id, name, username, block, email, sendEmail, registerDate, lastvisitDate "
    "from jos_users"
)

# display
print(sql_query)

select id, name, username, block, email, sendEmail, registerDate, lastvisitDate                     from jos_users


In [4]:
# Read the user table from DB2 with the query prepared earlier
df = pd.read_sql_query(sql=sql_query, con=nanohub_db)

# Read the profile rows for the three keys of interest: orgtype, organization, orcid
profile_query = "select * from jos_user_profiles where profile_key in ('orgtype', 'organization', 'orcid')"
profile_df = pd.read_sql_query(sql=profile_query, con=nanohub_db)

In [5]:
# Show the first row of the user table, then the first row of the profile table
display(df.head(1), profile_df.head(1))

Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate
0,998,hubrepo hubrepo,hubrepo,0,nkissebe@gmail.com,0,2014-11-13 21:09:09,NaT


Unnamed: 0,id,user_id,profile_key,profile_value,ordering,access
0,1,15623,orgtype,university,2,5


In [6]:
# Attach each profile attribute to the user table as a column named after its key.
# Users without a matching profile row keep NaN in that column (left join).
for profile_key in ('orgtype', 'organization', 'orcid'):
    key_rows = profile_df.loc[profile_df['profile_key'] == profile_key, ['user_id', 'profile_value']]
    df = (
        df.merge(key_rows, how='left', left_on='id', right_on='user_id')
          .drop(columns=['user_id'])
          .rename(columns={'profile_value': profile_key})
    )

# display
display(df.head(1))

Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid
0,998,hubrepo hubrepo,hubrepo,0,nkissebe@gmail.com,0,2014-11-13 21:09:09,NaT,,,


In [7]:
# Company e-mail domain lookup table, read through its own DB2 engine
wang159_myrmekes_db = application.new_db_engine('wang159_myrmekes')
domain_query = (
    "select name as domain_name, domain, industry, `size range` as size, country "
    "from wang159_myrmekes.companies_email_domain"
)
domain_df = pd.read_sql_query(domain_query, wang159_myrmekes_db)

In [8]:
# Keep only the first record per domain, then discard every remaining record
# that has a missing value in any column
first_per_domain_df = domain_df.drop_duplicates(subset='domain', keep='first')
domain_df = first_per_domain_df.dropna()

In [9]:
# get email domain of nanoHUB users
def get_domain(this_email):
    """Return the lower-cased domain part of an e-mail address.

    Parameters
    ----------
    this_email : object
        The e-mail address. Anything that is not a ``str`` (for example
        ``None`` or NaN from a missing database value) is treated as
        "no address".

    Returns
    -------
    str or None
        The text after the single ``@``, stripped of surrounding whitespace
        and lower-cased. ``None`` when the value is not a string, does not
        contain exactly one ``@``, or has an empty domain part.
    """
    # Missing e-mails arrive as None/NaN; calling .split on them would raise
    if not isinstance(this_email, str):
        return None

    seg_list = this_email.strip().split('@')

    # Exactly one '@' is required: local part + domain
    if len(seg_list) != 2:
        return None

    domain = seg_list[1].strip().lower()
    return domain if domain else None
    
# Derive one e-mail domain per user, element by element
df['email_domain'] = df['email'].map(get_domain)

In [10]:
# Restrict the lookup table to non-null domains that actually occur among the
# users' e-mail domains, then left-join it onto the user table
user_domains = df['email_domain'].unique()
matching_domain_df = domain_df[domain_df.domain.isin(user_domains) & domain_df.domain.notna()]
df = df.merge(matching_domain_df, how='left', left_on='email_domain', right_on='domain')

# display
display(df.head(2))

Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid,email_domain,domain_name,domain,industry,size,country
0,998,hubrepo hubrepo,hubrepo,0,nkissebe@gmail.com,0,2014-11-13 21:09:09,NaT,,,,gmail.com,,,,,
1,1000,CMS Admin Manager,admin,0,support@nanohub.org,0,NaT,NaT,,,,nanohub.org,,,,,


## Obtain newly updated organization IDs from Salesforce

In [11]:
# Fetch the contact <-> tool-usage-cluster association records from Salesforce
lim_val = 5 * 10**6  # row cap passed to the SOQL "limit" clause
cluster_query = 'SELECT Id, Contact__c, Tool_Usage_Cluster__c FROM ContactToolClusterAssociation__c limit {}'.format(lim_val)
sf_cluster_df = db_s.query_data(cluster_query)

# display
sf_cluster_df.head(3)

[Success] Bulk job creation successful. Job ID = 7505w00000b2D1VAAU


{"id":"7505w00000b2D1VAAU","operation":"query","object":"ContactToolClusterAssociation__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-10-25T11:13:52.000+0000","systemModstamp":"2021-10-25T11:13:52.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","retries":0,"totalProcessingTime":0}


{"id":"7505w00000b2D1VAAU","operation":"query","object":"ContactToolClusterAssociation__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-10-25T11:13:52.000+0000","systemModstamp":"2021-10-25T11:13:54.000+0000","state":"InProgress","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":0,"retries":0,"totalProcessingTime":0}


{"id":"7505w00000b2D1VAAU","operation":"query","object":"ContactToolClusterAssociation__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-10-25T11:13:52.000+0000","systemModstamp":"2021-10-25T11:14:09.000+0000","state":"JobComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":98069,"retries":0,"totalProcessingTime":14851}
[Success] Bulk job completed successfully.


Unnamed: 0,Contact__c,Id,Tool_Usage_Cluster__c
0,0035w00003NeIaTAAV,a0x5w00000cRtKRAA0,a0w5w00000AXc7TAAT
1,0035w00003NeIaFAAV,a0x5w00000cRtU8AAK,a0w5w00000AXc7TAAT
2,0035w00003NeIaXAAV,a0x5w00000cRtU9AAK,a0w5w00000AXc7TAAT


In [12]:
# (rows, columns) of the cluster association frame fetched from Salesforce
print(sf_cluster_df.shape)

(98069, 3)


In [13]:
# Fetch Salesforce Contacts together with their organization-related fields.
# The SOQL field list is joined from a Python list so that no identifier has
# to be split across a line continuation.
contact_fields = [
    'Id',
    'Name',
    'Organization__c',
    'Organization_composite__c',
    'Organization_email_derived__c',
    'Organization_Overwrite__c',
]
sf_contact_df = db_s.query_data('SELECT ' + ', '.join(contact_fields) + ' FROM Contact')

# display
sf_contact_df.head(3)

[Success] Bulk job creation successful. Job ID = 7505w00000b2D43AAE


{"id":"7505w00000b2D43AAE","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2021-10-25T11:14:22.000+0000","systemModstamp":"2021-10-25T11:14:23.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","retries":0,"totalProcessingTime":0}


{"id":"7505w00000b2D43AAE","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2021-10-25T11:14:22.000+0000","systemModstamp":"2021-10-25T11:14:23.000+0000","state":"InProgress","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":14757,"retries":0,"totalProcessingTime":1511}


{"id":"7505w00000b2D43AAE","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2021-10-25T11:14:22.000+0000","systemModstamp":"2021-10-25T11:14:40.000+0000","state":"JobComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":262499,"retries":0,"totalProcessingTime":17424}
[Success] Bulk job completed successfully.


Unnamed: 0,Id,Name,Organization_Overwrite__c,Organization__c,Organization_composite__c,Organization_email_derived__c
0,0035w000031Vsp1AAC,hubrepo hubrepo,,,,
1,0035w000031Vsp2AAC,nanoHUB support,,purdue university,a0r5w00000V42cCAAR,a0r5w00000V42cCAAR
2,0035w000031Vsp3AAC,Grid Statistics,,purdue university,a0r5w00000V42cCAAR,a0r5w00000V42cCAAR


In [14]:
# Fetch the organization records (Id, Name, Domain__c) from Salesforce
org_query = 'SELECT Id, Name, Domain__c FROM organization__c'
sf_org_df = db_s.query_data(org_query)

# display
sf_org_df.head(3)

[Success] Bulk job creation successful. Job ID = 7505w00000b2D48AAE


{"id":"7505w00000b2D48AAE","operation":"query","object":"organization__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-10-25T11:14:50.000+0000","systemModstamp":"2021-10-25T11:14:51.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","retries":0,"totalProcessingTime":0}


{"id":"7505w00000b2D48AAE","operation":"query","object":"organization__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-10-25T11:14:50.000+0000","systemModstamp":"2021-10-25T11:14:53.000+0000","state":"JobComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":6481,"retries":0,"totalProcessingTime":521}
[Success] Bulk job completed successfully.


Unnamed: 0,Domain__c,Id,Name
0,upenn.edu,a0r5w00000V42c0AAB,university of pennsylvania
1,fer.hr,a0r5w00000V42c1AAB,fer
2,wheatoncollege.edu,a0r5w00000V42c2AAB,wheaton college (ma)


In [15]:
# Fetch the contact <-> organization links recorded on citation associations
citation_query = 'SELECT Id, Contact__c, Organization__c FROM contact_citation_asso__c'
sf_cit_org_df = db_s.query_data(citation_query)

# display
sf_cit_org_df.head(3)

[Success] Bulk job creation successful. Job ID = 7505w00000b2D4IAAU


{"id":"7505w00000b2D4IAAU","operation":"query","object":"contact_citation_asso__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-10-25T11:15:03.000+0000","systemModstamp":"2021-10-25T11:15:03.000+0000","state":"InProgress","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":0,"retries":0,"totalProcessingTime":0}


{"id":"7505w00000b2D4IAAU","operation":"query","object":"contact_citation_asso__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-10-25T11:15:03.000+0000","systemModstamp":"2021-10-25T11:15:04.000+0000","state":"JobComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":4944,"retries":0,"totalProcessingTime":700}
[Success] Bulk job completed successfully.


Unnamed: 0,Contact__c,Id,Organization__c
0,0035w000034JLpOAAW,a0u5w00000RhVhTAAV,a0r5w00000V42cCAAR
1,0035w000034JLpOAAW,a0u5w00000RhVhUAAV,a0r5w00000V42cCAAR
2,0035w000034JLpOAAW,a0u5w00000RhVhVAAV,a0r5w00000V42cCAAR


In [16]:
# Give every Salesforce frame its own name for the 'Id' column so the columns
# do not collide when the frames are merged
sf_cluster_df = sf_cluster_df.rename(columns={'Id': 'cluster_Id'})
sf_org_df = sf_org_df.rename(columns={'Id': 'org_Id'})
sf_contact_df = sf_contact_df.rename(columns={'Id': 'contact_Id'})
sf_cit_org_df = sf_cit_org_df.rename(columns={'Id': 'cit_org_Id'})

## Determine the most likely organization of each cluster

In [17]:
# Show the name and the first row of each input frame
for frame_name, frame in (
    ('sf_cluster_df', sf_cluster_df),
    ('sf_org_df', sf_org_df),
    ('sf_contact_df', sf_contact_df),
    ('sf_cit_org_df', sf_cit_org_df),
):
    display(frame_name)
    display(frame.head(1))

'sf_cluster_df'

Unnamed: 0,Contact__c,cluster_Id,Tool_Usage_Cluster__c
0,0035w00003NeIaTAAV,a0x5w00000cRtKRAA0,a0w5w00000AXc7TAAT


'sf_org_df'

Unnamed: 0,Domain__c,org_Id,Name
0,upenn.edu,a0r5w00000V42c0AAB,university of pennsylvania


'sf_contact_df'

Unnamed: 0,contact_Id,Name,Organization_Overwrite__c,Organization__c,Organization_composite__c,Organization_email_derived__c
0,0035w000031Vsp1AAC,hubrepo hubrepo,,,,


'sf_cit_org_df'

Unnamed: 0,Contact__c,cit_org_Id,Organization__c
0,0035w000034JLpOAAW,a0u5w00000RhVhTAAV,a0r5w00000V42cCAAR


In [18]:
# Left-join contact details onto every cluster association row
cc_df = sf_cluster_df.merge(
    sf_contact_df,
    how='left',
    left_on='Contact__c',
    right_on='contact_Id',
)

# display
display(cc_df.head(3))

Unnamed: 0,Contact__c,cluster_Id,Tool_Usage_Cluster__c,contact_Id,Name,Organization_Overwrite__c,Organization__c,Organization_composite__c,Organization_email_derived__c
0,0035w00003NeIaTAAV,a0x5w00000cRtKRAA0,a0w5w00000AXc7TAAT,0035w00003NeIaTAAV,Noelia Moreno,,harvard university,a0r5w00000V42kEAAR,a0r5w00000V42kEAAR
1,0035w00003NeIaFAAV,a0x5w00000cRtU8AAK,a0w5w00000AXc7TAAT,0035w00003NeIaFAAV,InÃ©s Ojeda,,universidad de sevilla,,
2,0035w00003NeIaXAAV,a0x5w00000cRtU9AAK,a0w5w00000AXc7TAAT,0035w00003NeIaXAAV,Isabel JimÃ©nez,,universidad de sevilla,,


In [19]:
# apply overwrite
# Rule: an explicit overwrite wins; otherwise fall back to the composite
# organization. (An earlier revision fell back to
# Organization_email_derived__c instead.)
# Missing values are replaced by the single-space placeholder ' ' so that
# "no organization" can be recognised by a plain string comparison.
# Computed column-wise rather than with a row-wise apply, which calls a Python
# function once per row.
overwrite = cc_df['Organization_Overwrite__c'].fillna(' ')
composite = cc_df['Organization_composite__c'].fillna(' ')

# Keep the composite value where no overwrite is set, else take the overwrite
cc_df['org_final'] = composite.where(overwrite == ' ', overwrite)


In [20]:
# Show the first two rows, now including the derived org_final column
display(cc_df.head(2))

Unnamed: 0,Contact__c,cluster_Id,Tool_Usage_Cluster__c,contact_Id,Name,Organization_Overwrite__c,Organization__c,Organization_composite__c,Organization_email_derived__c,org_final
0,0035w00003NeIaTAAV,a0x5w00000cRtKRAA0,a0w5w00000AXc7TAAT,0035w00003NeIaTAAV,Noelia Moreno,,harvard university,a0r5w00000V42kEAAR,a0r5w00000V42kEAAR,a0r5w00000V42kEAAR
1,0035w00003NeIaFAAV,a0x5w00000cRtU8AAK,a0w5w00000AXc7TAAT,0035w00003NeIaFAAV,InÃ©s Ojeda,,universidad de sevilla,,,


In [21]:
# find the most likely org within each cluster
def get_likely_org(this_cluster_df):
    """Return the most frequent organization among one cluster's rows.

    Parameters
    ----------
    this_cluster_df : pandas.DataFrame
        Rows belonging to a single cluster; must contain an ``org_final``
        column.

    Returns
    -------
    object
        The most common informative ``org_final`` value. When several values
        are equally common, the first entry of ``Series.mode()`` is used.
        Returns the placeholder ``' '`` when the cluster has no informative
        value at all.
    """
    candidates = this_cluster_df['org_final'].dropna()

    # Missing values, empty strings and whitespace-only placeholders carry no
    # organization information and must not take part in the vote
    candidates = candidates[candidates.astype(str).str.strip() != '']

    if candidates.empty:
        return ' '

    return candidates.mode().iloc[0]

        
# Group the association rows by cluster and pick one organization per cluster
cluster_groups = cc_df[['Tool_Usage_Cluster__c', 'org_final']].groupby('Tool_Usage_Cluster__c')
cluster_org_df = cluster_groups.apply(get_likely_org).reset_index(name='Organization_composite__c')

# display
cluster_org_df.head(3)

Unnamed: 0,Tool_Usage_Cluster__c,Organization_composite__c
0,a0w5w000009Q7CuAAK,
1,a0w5w000009Q7CvAAK,a0r5w00000V42iyAAB
2,a0w5w000009Q7CwAAK,a0r5w00000V42iyAAB


In [22]:
# Rename the result column to 'Organization__c', the name used for the upload frame
column_renames = {"Organization_composite__c": "Organization__c"}
cluster_org_df = cluster_org_df.rename(columns=column_renames)
cluster_org_df.head(2)

Unnamed: 0,Tool_Usage_Cluster__c,Organization__c
0,a0w5w000009Q7CuAAK,
1,a0w5w000009Q7CvAAK,a0r5w00000V42iyAAB


## Write to Salesforce

In [23]:
# Assemble the upload frame: one row per cluster, keyed by the cluster's Id,
# carrying the organization derived for it.
# NOTE(review): no NaN/NaT handling is applied here - clusters without a known
# organization carry whatever placeholder the derivation step produced.
df_sf = pd.DataFrame({
    'Id': cluster_org_df['Tool_Usage_Cluster__c'],
    'Organization__c': cluster_org_df['Organization__c'],
})

# Remember the column set of the upload frame
sf_original_fields = df_sf.columns

# display
df_sf.head(2).T

Unnamed: 0,0,1
Id,a0w5w000009Q7CuAAK,a0w5w000009Q7CvAAK
Organization__c,,a0r5w00000V42iyAAB


In [24]:
df_sf.shape

(4270, 2)

In [25]:

# create DB2 to Salesforce API object
# db_s is (re-)bound to the Salesforce engine; the two attributes set below
# choose the Salesforce object to write to and the field used to match records
# (the recorded job status reports an upsert on 'tool_usage_cluster__c' with
# externalIdFieldName 'Id').
db_s = salesforce

db_s.object_id = 'tool_usage_cluster__c'
db_s.external_id = 'Id'

In [26]:
# send data to Salesforce
# Uploads df_sf through the engine configured above; the recorded output shows
# a bulk job being created, the CSV uploaded and the job closed.
db_s.send_data(df_sf)

[Success] Bulk job creation successful. Job ID = 7505w00000b2D4SAAU
hello


[Success] CSV upload successful. Job ID = 7505w00000b2D4SAAU


[Success] Closing job successful. Job ID = 7505w00000b2D4SAAU


In [27]:
# check status
# Returns the current state of the bulk job; in the recorded run it was still
# 'InProgress', so re-run this cell to see the final record counts.
db_s.check_bulk_status()

{'id': '7505w00000b2D4SAAU',
 'operation': 'upsert',
 'object': 'tool_usage_cluster__c',
 'createdById': '0055w00000DM5bOAAT',
 'createdDate': '2021-10-25T11:15:21.000+0000',
 'systemModstamp': '2021-10-25T11:15:22.000+0000',
 'state': 'InProgress',
 'externalIdFieldName': 'Id',
 'concurrencyMode': 'Parallel',
 'contentType': 'CSV',
 'apiVersion': 47.0,
 'jobType': 'V2Ingest',
 'lineEnding': 'LF',
 'columnDelimiter': 'COMMA',
 'numberRecordsProcessed': 0,
 'numberRecordsFailed': 0,
 'retries': 0,
 'totalProcessingTime': 0,
 'apiActiveProcessingTime': 0,
 'apexProcessingTime': 0}

# Graveyard
## Determine the most likely organization of each contact

### Rules

In the following order:

- If overwrite set, use overwrite
- If nanohub profile exists, use profile [NEW ADDITION]
- If citation exists, use citation
- If email exists, use email - stop here
- If cluster exists, use cluster
- If none above, none

## Match data with Salesforce format

## To Salesforce Sales Cloud CRM