# DB2-Salesforce connector: Basic user information updates

%load_ext autoreload
%autoreload 2

In [1]:
# Parameters
hours_range = 24*365*1 # number of hours to look back
#change this back to one year, once you're done

# API settings
api_url = '/services/data/v43.0/sobjects'
external_id = 'nanoHUB_user_ID__c'
object_id = 'Contact'

from nanoHUB.application import Application

application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')
wang159_myrmekes_db = application.new_db_engine('wang159_myrmekes')


salesforce = application.new_salesforce_engine()
db_s = salesforce

In [2]:

import pandas as pd
import datetime

## Obtain contacts from DB2 
that have last visit date within range of interest specified by day_range

In [3]:
# Depending on the task, use different query and cutoff

# Hourly update for new registrations
date_cutoff = (datetime.datetime.today().date() - datetime.timedelta(hours=hours_range))\
                    .strftime('%Y-%m-%d')
lvd_cutoff = (datetime.datetime.today().date() - datetime.timedelta(hours=24*5))\
                    .strftime('%Y-%m-%d')
sql_query = "select id, name, username, block, email, sendEmail, registerDate, lastvisitDate \
                    from jos_users where registerDate >= '{}' or lastvisitDate >= '{}'".format(date_cutoff,lvd_cutoff)

# display
print(sql_query)

select id, name, username, block, email, sendEmail, registerDate, lastvisitDate                     from jos_users where registerDate >= '2020-01-22' or lastvisitDate >= '2021-01-16'


In [4]:
df = pd.read_sql_query(sql_query, nanohub_db)

# get user profile details
profile_df = pd.read_sql_query("select * from jos_user_profiles where profile_key \
in ('orgtype', 'organization', 'orcid')", nanohub_db)

In [5]:
# display
display(df.head(2))

display(profile_df.head(2))

Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate
0,3482,Gerhard Klimeck,gekco,0,gekco@purdue.edu,1,2004-04-10 17:58:53,2021-01-19 20:10:54
1,3858,Jing Guo,guoj,0,guoj@ufl.edu,0,2004-12-17 22:14:51,2021-01-19 21:06:55


Unnamed: 0,id,user_id,profile_key,profile_value,ordering,access
0,1,15623,orgtype,university,2,5
1,2,10060,orgtype,university,2,5


In [6]:
display(sorted(df['lastvisitDate'].to_list())[-5:])

[Timestamp('2021-01-21 04:30:54'),
 Timestamp('2021-01-21 05:05:54'),
 Timestamp('2021-01-21 05:39:52'),
 Timestamp('2021-01-21 05:42:53'),
 Timestamp('2021-01-21 06:11:11')]

In [7]:
df = pd.merge(df, profile_df[profile_df['profile_key'] == 'orgtype'][['user_id', 'profile_value']],\
                         how='left', left_on='id', right_on='user_id')\
    .drop(columns=['user_id']).rename(columns={'profile_value':'orgtype'})

df = pd.merge(df, profile_df[profile_df['profile_key'] == 'organization'][['user_id', 'profile_value']],\
                         how='left', left_on='id', right_on='user_id')\
    .drop(columns=['user_id']).rename(columns={'profile_value':'organization'})
display(df.head(1))

df = pd.merge(df, profile_df[profile_df['profile_key'] == 'orcid'][['user_id', 'profile_value']],\
                         how='left', left_on='id', right_on='user_id')\
    .drop(columns=['user_id']).rename(columns={'profile_value':'orcid'})

# display
display(df.head(1))

Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization
0,3482,Gerhard Klimeck,gekco,0,gekco@purdue.edu,1,2004-04-10 17:58:53,2021-01-19 20:10:54,universityfaculty,Purdue University


Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid
0,3482,Gerhard Klimeck,gekco,0,gekco@purdue.edu,1,2004-04-10 17:58:53,2021-01-19 20:10:54,universityfaculty,Purdue University,0000-0001-7128-773X


In [8]:
# Obtain company domain information from DB2
domain_df = pd.read_sql_query("select name as domain_name, domain, industry, `size range` as size, country \
from wang159_myrmekes.companies_email_domain", wang159_myrmekes_db)

In [9]:
# make sure domain is unique and drop NaN
domain_df = domain_df.drop_duplicates(subset='domain', keep='first').dropna()

In [10]:
# get email domain of nanoHUB users
def get_domain(this_email):
    seg_list = this_email.split('@')
    
    if len(seg_list) == 2:
        return seg_list[1].lower()
    else:
        return None
    
df['email_host'] = df['email'].apply(get_domain)

In [11]:
# save a constuct for debugging
from copy import deepcopy
temp = deepcopy(df)

In [12]:
# get domain by nanoHUB organization field
df = deepcopy(temp)
display(df.head(2))
df['organization'] = df['organization'].str.lower()
df = df.fillna('   ')

df.loc[(df['organization']=='nan'),'organization'] = None
df.loc[(df['organization']=='NaN'),'organization'] = None

Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid,email_host
0,3482,Gerhard Klimeck,gekco,0,gekco@purdue.edu,1,2004-04-10 17:58:53,2021-01-19 20:10:54,universityfaculty,Purdue University,0000-0001-7128-773X,purdue.edu
1,3858,Jing Guo,guoj,0,guoj@ufl.edu,0,2004-12-17 22:14:51,2021-01-19 21:06:55,university,University of Florida,,ufl.edu


display(df.tail())

In [13]:
import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords
s_words = stopwords.words('english')

In [14]:
import re

def clean_domain(this_domain_name):
    
    if not this_domain_name:
        return None
    
    # Replace all non-alphanumeric characters with space
    this_domain_name = re.sub("[^0-9a-zA-Z]+", " ", this_domain_name.lower())
    name_list = this_domain_name.split(' ')
        
    # remove all stop words
    name_list = ['' if x in s_words else x for x in name_list]
    
    return set(filter(None, name_list))

In [15]:
display(domain_df.head(2))

Unnamed: 0,domain_name,domain,industry,size,country
0,strategic interns,strategicinterns.com,education management,1-10,india
1,bosque y comunidad,bosqueycomunidad.org,non-profit organization management,1-10,spain


In [16]:
# clean domain name
domain_df['domain_cleaned_set'] = domain_df.domain_name.apply(clean_domain)
display(domain_df.head(1))
# clean nanohub org name
df['org_cleaned_set'] = df.organization.apply(clean_domain)
display(df.head(1))

Unnamed: 0,domain_name,domain,industry,size,country,domain_cleaned_set
0,strategic interns,strategicinterns.com,education management,1-10,india,"{strategic, interns}"


Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid,email_host,org_cleaned_set
0,3482,Gerhard Klimeck,gekco,0,gekco@purdue.edu,1,2004-04-10 17:58:53,2021-01-19 20:10:54,universityfaculty,purdue university,0000-0001-7128-773X,purdue.edu,"{purdue, university}"


In [17]:
# hash cleaned domain name
domain_df['domain_cleaned_hash'] = domain_df['domain_cleaned_set'].apply(lambda x: '-'.join(sorted(list(x))) if x else None)

# hash cleaned clean nanohub org name
df['org_cleaned_hash'] = df['org_cleaned_set'].apply(lambda x: '-'.join(sorted(list(x))) if x else None)

In [18]:
display(domain_df.head(1))
display(df.head(1))

Unnamed: 0,domain_name,domain,industry,size,country,domain_cleaned_set,domain_cleaned_hash
0,strategic interns,strategicinterns.com,education management,1-10,india,"{strategic, interns}",interns-strategic


Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid,email_host,org_cleaned_set,org_cleaned_hash
0,3482,Gerhard Klimeck,gekco,0,gekco@purdue.edu,1,2004-04-10 17:58:53,2021-01-19 20:10:54,universityfaculty,purdue university,0000-0001-7128-773X,purdue.edu,"{purdue, university}",purdue-university


## Collaborative filtering for commerical email hosts

In [19]:
def predict_if_commerical(this_email_host_df):
    # predict if this email host is commerical
    top_cnt = this_email_host_df['org_cleaned_hash'].value_counts() #returns uniques in descending order
    
    if top_cnt.shape[0] > 0:
        if (top_cnt.iloc[0]/top_cnt.sum()) < 0.2:
            return True
        else:
            return False
    
    else:
        return False
    
    
is_email_commercial = df[['email_host', 'org_cleaned_hash']].groupby('email_host').apply(predict_if_commerical)

In [20]:
temp2 = deepcopy(df)

In [21]:
df = deepcopy(temp2)

In [22]:
# mark commerical email addresses
df = pd.merge(df, is_email_commercial.reset_index(name='is_email_commercial'),\
         how='left', left_on='email_host', right_on='email_host' ,)

In [23]:
display(df.tail(1))

Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid,email_host,org_cleaned_set,org_cleaned_hash,is_email_commercial
31080,313424,reem hadaj ALqarni,reemo,0,alqarni.reem@hotmail.com,0,2021-01-21 05:40:23,2021-01-21 05:42:53,other,,,hotmail.com,{},,True


In [24]:
# obtain domain estimation from email addresses
df = pd.merge(df, domain_df[~domain_df.domain.isin(is_email_commercial[is_email_commercial].index)]['domain'],\
         how='left', left_on='email_host', right_on='domain').rename(columns={'domain':'domain_by_email'})

# display
display(df.head(2).T)

Unnamed: 0,0,1
id,3482,3858
name,Gerhard Klimeck,Jing Guo
username,gekco,guoj
block,0,0
email,gekco@purdue.edu,guoj@ufl.edu
sendEmail,1,0
registerDate,2004-04-10 17:58:53,2004-12-17 22:14:51
lastvisitDate,2021-01-19 20:10:54,2021-01-19 21:06:55
orgtype,universityfaculty,university
organization,purdue university,university of florida


## Fuzzy derive organization from nanoHUB profiles

In [25]:
# attempt direct join by hash
domain_subset_df = domain_df[domain_df.domain.isin(df.email_host.unique())][['domain', 'domain_cleaned_hash']]
# only take values that exist
domain_subset_df = domain_subset_df[~domain_subset_df.domain.isna()&~domain_subset_df.domain_cleaned_hash.isna()]

derived_df = pd.merge(df, domain_subset_df.rename(columns={'domain':'domain_by_profile'}),\
         how='left', left_on='org_cleaned_hash', right_on='domain_cleaned_hash')\
        .drop('domain_cleaned_hash', axis=1)

# display
derived_df.sample(5).T

Unnamed: 0,11995,8193,17794,3221,6394
id,290078,285420,297118,279254,283238
name,Wenjia Zhou,laura,Mitchell Mckay,Aril Sahak,MIN WOO LEE
username,wjzhouustc,lauraosh99,mmckay3,arilsahak,twins9638
block,0,0,0,0,0
email,wjzhouustc@gmail.com,lauraosh99@gmail.com,mmckay3@asu.edu,arilsahak@gmail.com,mwl96@naver.com
sendEmail,0,-1,0,-1,-1
registerDate,2020-06-03 03:07:37,2020-04-21 21:09:18,2020-08-18 04:18:48,2020-02-24 11:19:59,2020-04-05 21:29:41
lastvisitDate,2020-06-03 03:07:38,2020-04-21 21:09:19,2020-08-25 22:28:59,2020-02-24 11:19:59,2020-09-28 11:05:20
orgtype,industry,universityfaculty,universitygraduate,universitygraduate,universitygraduate
organization,,,,,


In [26]:
# attempt to reduce domain_df size to speedup Jaccard calculation
from itertools import chain

all_word_set = set(chain.from_iterable(derived_df[(derived_df['domain_by_profile'].isna() & (~derived_df['organization'].isna()))]\
                    .org_cleaned_set.values))

# select the domain_df entry with words from all_word_set
domain_subset_df = domain_df[domain_df.domain_cleaned_set.notnull()]
domain_subset_df = domain_subset_df[domain_subset_df.domain_cleaned_set\
                             .apply(lambda x: True if len(x.intersection(all_word_set)) > 0 else False)]

In [27]:
display(domain_subset_df.head(2))

Unnamed: 0,domain_name,domain,industry,size,country,domain_cleaned_set,domain_cleaned_hash
4,"cardinal strategies, llc",cardinal-strategies.com,civil engineering,11-50,united states,"{cardinal, llc, strategies}",cardinal-llc-strategies
16,grail research,grailresearch.com,research,201-500,united states,"{research, grail}",grail-research


In [28]:
def get_org(this_df):
    # look at the cohort with this label, do they have a commonly agreed domain_by_email? If so, use it
    this_dbe_count = this_df.domain_by_email.value_counts()

    if this_dbe_count.shape[0] < 1:
        return None

    this_top_dbe_perc = this_dbe_count.iloc[0]/this_dbe_count.sum()

    # if top choice occupies majority of cases, then this cohort can be implied to this domain_by_email address
    if this_top_dbe_perc > 0.5:
        return this_dbe_count.index[0] 


derived_hash = derived_df.groupby('org_cleaned_hash').apply(get_org)

In [29]:
derived_df = pd.merge(derived_df, derived_hash.reset_index(name='domain_by_profile_infer')\
                      , how='left', left_on='org_cleaned_hash', right_on='org_cleaned_hash')

In [30]:
# calculate a composite domain based on domain_by_email, domain_by_profile, and domain_by_profile_infer
def get_composite_domain(this_df):

    # if domain found via profile with direct match, use it
    if this_df.domain_by_profile != '':
        return this_df.domain_by_profile
    
    # if domain found via email, use it
    if this_df.domain_by_email != '':
        return this_df.domain_by_email    
    
    # if domain found via profile but not with a direct match, use it
    if this_df.domain_by_profile_infer != '':
        return this_df.domain_by_profile_infer
    
    # else, no organization can be concluded, return none
    return None

    
derived_df['domain_final'] = derived_df[['domain_by_email', 'domain_by_profile', 'domain_by_profile_infer']].fillna('')\
                                                        .apply(get_composite_domain, axis=1)

In [31]:
display(derived_df.head(2))

Unnamed: 0,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid,email_host,org_cleaned_set,org_cleaned_hash,is_email_commercial,domain_by_email,domain_by_profile,domain_by_profile_infer,domain_final
0,3482,Gerhard Klimeck,gekco,0,gekco@purdue.edu,1,2004-04-10 17:58:53,2021-01-19 20:10:54,universityfaculty,purdue university,0000-0001-7128-773X,purdue.edu,"{purdue, university}",purdue-university,False,purdue.edu,purdue.edu,purdue.edu,purdue.edu
1,3858,Jing Guo,guoj,0,guoj@ufl.edu,0,2004-12-17 22:14:51,2021-01-19 21:06:55,university,university of florida,,ufl.edu,"{university, florida}",florida-university,False,ufl.edu,famu.edu,ufl.edu,famu.edu


## Update Salesforce institution 

In [32]:
# get a unique list of organizations that appears in contact
#org_df = derived_df[['domain_name', 'domain', 'industry', 'size', 'country']].drop_duplicates().dropna()
org_df = pd.merge(derived_df[['domain_final']].drop_duplicates().dropna(), domain_df, how='inner', left_on='domain_final', right_on='domain') 
org_df = org_df[['domain_name', 'domain', 'industry', 'size', 'country']]

# display
display(org_df.head(2))

Unnamed: 0,domain_name,domain,industry,size,country
0,purdue university,purdue.edu,higher education,10001+,united states
1,florida a&m university,famu.edu,higher education,1001-5000,united states


In [33]:
df_sf = pd.DataFrame()

# Make sure NaN and NaT values are taken care of here
df_sf['Name'] = org_df['domain_name']

df_sf['Size_Range__c'] = org_df['size'].fillna('Unknown')
df_sf['Country__c'] = org_df['country']
df_sf['Domain__c'] = org_df['domain']
df_sf['Industry__c'] = org_df['industry']

sf_original_fields = df_sf.columns

# display
df_sf.head(2).T

Unnamed: 0,0,1
Name,purdue university,florida a&m university
Size_Range__c,10001+,1001-5000
Country__c,united states,united states
Domain__c,purdue.edu,famu.edu
Industry__c,higher education,higher education


In [34]:


db_s.object_id = 'organization__c'
db_s.external_id = 'Domain__c'

Obtained Salesforce access token ...... True


In [35]:
display(df_sf.head(2))
display(df_sf.tail(2))


Unnamed: 0,Name,Size_Range__c,Country__c,Domain__c,Industry__c
0,purdue university,10001+,united states,purdue.edu,higher education
1,florida a&m university,1001-5000,united states,famu.edu,higher education


Unnamed: 0,Name,Size_Range__c,Country__c,Domain__c,Industry__c
946,national research university - higher school o...,1001-5000,russia,hse.ru,higher education
947,draper laboratory,1001-5000,united states,draper.com,defense & space


In [36]:
## correcting name overflows
names = df_sf['Name'].str.len()

sf_char_limit = 80
name_indexes = names.index[names >= sf_char_limit].tolist()
for index in name_indexes:
    df_sf['Name'][index] =  df_sf['Name'][index][0:sf_char_limit]

In [37]:
display(df_sf.head(2))

Unnamed: 0,Name,Size_Range__c,Country__c,Domain__c,Industry__c
0,purdue university,10001+,united states,purdue.edu,higher education
1,florida a&m university,1001-5000,united states,famu.edu,higher education


In [38]:
# send data to Salesforce
db_s.send_data(df_sf)

[Success] Bulk job creation successful. Job ID = 7505w00000TmJZlAAN
hello
[Success] CSV upload successful. Job ID = 7505w00000TmJZlAAN
[Success] Closing job successful. Job ID = 7505w00000TmJZlAAN


In [39]:
# check status
db_s.check_bulk_status()

{'id': '7505w00000TmJZlAAN',
 'operation': 'upsert',
 'object': 'organization__c',
 'createdById': '0055w00000DM5bOAAT',
 'createdDate': '2021-01-21T07:14:12.000+0000',
 'systemModstamp': '2021-01-21T07:14:13.000+0000',
 'state': 'InProgress',
 'externalIdFieldName': 'Domain__c',
 'concurrencyMode': 'Parallel',
 'contentType': 'CSV',
 'apiVersion': 47.0,
 'jobType': 'V2Ingest',
 'lineEnding': 'LF',
 'columnDelimiter': 'COMMA',
 'numberRecordsProcessed': 0,
 'numberRecordsFailed': 0,
 'retries': 0,
 'totalProcessingTime': 0,
 'apiActiveProcessingTime': 0,
 'apexProcessingTime': 0}

In [40]:
# check status
from pprint import pprint

pprint(db_s.check_bulk_failed_results())

''


## Obtain newly updated organization IDs from Salesforce

In [41]:
# create DB2 to Salesforce API object
db_s = salesforce

Obtained Salesforce access token ...... True


In [42]:
# get Salesforce ID for organizations
sf_org_ID_df = db_s.query_data('SELECT Id, Domain__c FROM organization__c')

# display
sf_org_ID_df.head(3)

[Success] Bulk job creation successful. Job ID = 7505w00000TmJZqAAN
{"id":"7505w00000TmJZqAAN","operation":"query","object":"organization__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-01-21T07:14:14.000+0000","systemModstamp":"2021-01-21T07:14:15.000+0000","state":"InProgress","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":0,"retries":0,"totalProcessingTime":0}
{"id":"7505w00000TmJZqAAN","operation":"query","object":"organization__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-01-21T07:14:14.000+0000","systemModstamp":"2021-01-21T07:14:16.000+0000","state":"JobComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":6365,"retries":0,"totalProcessingTime":673}
[Success] Bulk job completed successfully.


Unnamed: 0,Domain__c,Id
0,upenn.edu,a0r5w00000V42c0AAB
1,fer.hr,a0r5w00000V42c1AAB
2,wheatoncollege.edu,a0r5w00000V42c2AAB


In [43]:
# join salesforce ID back to contact DF
df = pd.merge(sf_org_ID_df, derived_df, how='right', left_on='Domain__c', right_on='domain_final')\
            .drop('Domain__c', axis=1)\
            .rename(columns={'Id':'Salesforce_org_ID'})

In [44]:
# display
display(df.head(2))

Unnamed: 0,Salesforce_org_ID,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid,email_host,org_cleaned_set,org_cleaned_hash,is_email_commercial,domain_by_email,domain_by_profile,domain_by_profile_infer,domain_final
0,a0r5w00000V42c0AAB,278735,"Foley, Emmet T",emmetf,0,emmet@upenn.edu,0,2020-02-17 21:04:05,2020-02-17 21:04:05,universityundergraduate,,,upenn.edu,{},,False,upenn.edu,,,upenn.edu
1,a0r5w00000V42c0AAB,279901,"Tong, Yijun",yijunt,0,yijunt@upenn.edu,0,2020-03-04 01:07:38,2020-05-28 01:08:28,universitygraduate,,,upenn.edu,{},,False,upenn.edu,,,upenn.edu


## Match data with Salesforce format

In [45]:
# split full name into first, middle, and last names
def split_full_name(this_name):
    this_name_list = list(filter(None, this_name.split(' ')))
    
    if len(this_name_list) == 1:
        # single word name
        return pd.Series([this_name_list[0],None, this_name_list[0]])
    
    elif len(this_name_list) > 1:
        # multi word name
        return pd.Series([this_name_list[0],' '.join(this_name_list[1:-1]),this_name_list[-1]])

In [46]:
## stop here
display(df.head(1))

Unnamed: 0,Salesforce_org_ID,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid,email_host,org_cleaned_set,org_cleaned_hash,is_email_commercial,domain_by_email,domain_by_profile,domain_by_profile_infer,domain_final
0,a0r5w00000V42c0AAB,278735,"Foley, Emmet T",emmetf,0,emmet@upenn.edu,0,2020-02-17 21:04:05,2020-02-17 21:04:05,universityundergraduate,,,upenn.edu,{},,False,upenn.edu,,,upenn.edu


In [47]:
## if last visit date is empty - set it equal to the register date
lvd = df.lastvisitDate.index[df.lastvisitDate == '   '].tolist()

for i in lvd:
    df.iloc[i,8] = df.iloc[i,7]

display(df.head(2))

Unnamed: 0,Salesforce_org_ID,id,name,username,block,email,sendEmail,registerDate,lastvisitDate,orgtype,organization,orcid,email_host,org_cleaned_set,org_cleaned_hash,is_email_commercial,domain_by_email,domain_by_profile,domain_by_profile_infer,domain_final
0,a0r5w00000V42c0AAB,278735,"Foley, Emmet T",emmetf,0,emmet@upenn.edu,0,2020-02-17 21:04:05,2020-02-17 21:04:05,universityundergraduate,,,upenn.edu,{},,False,upenn.edu,,,upenn.edu
1,a0r5w00000V42c0AAB,279901,"Tong, Yijun",yijunt,0,yijunt@upenn.edu,0,2020-03-04 01:07:38,2020-05-28 01:08:28,universitygraduate,,,upenn.edu,{},,False,upenn.edu,,,upenn.edu


In [48]:
df_sf = pd.DataFrame()

# Make sure NaN and NaT values are taken care of here
df_sf[['firstname', 'Middle_name__c', 'lastname']] = df['name'].apply(split_full_name)

df_sf['nanoHUB_user_ID__c']            = df['id']
df_sf['nanoHUB_username__c']           = df['username']
df_sf['Email']                         = df['email'].fillna('').apply(lambda x: '' if '@invalid' in x else x)

# for sendEmail: 0 = opt-out, 1 = receive email. For salesforce HasOptedOutOfEmail, it's exact opposite
df_sf['HasOptedOutOfEmail']            = df['sendEmail'].apply(lambda x: 0 if x==1 else 1)
df_sf['nanoHUB_account_BLOCKED__c']    = df['block'].fillna(0)

# solidify time-related columns from datetime to string
df_sf['nanoHUB_registration_date__c']  = df['registerDate'].dt.date.fillna('').astype('str')
df_sf['nanoHUB_last_active_date__c']   = df['lastvisitDate'].dt.date.fillna('').astype('str')

# Tableau detailed view
df_sf['Detailed_user_timeline_to_Tableau__c'] = df_sf['Email'].apply(lambda x: 'https://tableauqa.itap.purdue.edu/views/profile/ProfileTimeline?Id%20Email='+x+'\
&:iframeSizedToWindow=true&:embed=y&:showAppBanner=false\
&:display_count=no&:showVizHome=no#6' if x != '' else '')

# nanoHUB user profile
df_sf['nanoHUB_user_page__c'] = df_sf['nanoHUB_user_ID__c'].apply(lambda x: 'https://nanohub.org/members/%d'%x if x != '' else '')

df_sf['ORCID__c'] = df['orcid'].fillna('')
df_sf['Organization__c'] = df['organization'].fillna('')

# derived information
df_sf['Organization_email_derived__c'] = df['Salesforce_org_ID']
df_sf['Organization_composite__c'] = df['Salesforce_org_ID']

sf_original_fields = df_sf.columns

# display
df_sf.head(2).T

Unnamed: 0,0,1
firstname,"Foley,","Tong,"
Middle_name__c,Emmet,
lastname,T,Yijun
nanoHUB_user_ID__c,278735,279901
nanoHUB_username__c,emmetf,yijunt
Email,emmet@upenn.edu,yijunt@upenn.edu
HasOptedOutOfEmail,1,1
nanoHUB_account_BLOCKED__c,0,0
nanoHUB_registration_date__c,2020-02-17,2020-03-04
nanoHUB_last_active_date__c,2020-02-17,2020-05-28


In [49]:
## filtering out accounts that have positive block attributes
spam_rows = df_sf[df_sf['nanoHUB_account_BLOCKED__c'].astype(str).str.contains("1")]
good_rows = df_sf[~df_sf['nanoHUB_account_BLOCKED__c'].astype(str).str.contains('1')]

## To Salesforce Sales Cloud CRM

In [50]:

# create DB2 to Salesforce API object
db_s = salesforce

db_s.object_id = object_id
db_s.external_id = external_id

Obtained Salesforce access token ...... True


In [51]:
display(spam_rows.head(2))
display(good_rows.tail(2))

Unnamed: 0,firstname,Middle_name__c,lastname,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c,Detailed_user_timeline_to_Tableau__c,nanoHUB_user_page__c,ORCID__c,Organization__c,Organization_email_derived__c,Organization_composite__c
636,Mark,,Calaway,283456,markcalaway4,2r1st141@gmail.com,1,1,2020-04-07,2020-04-07,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/283456,,purdue university,a0r5w00000V42cCAAR,a0r5w00000V42cCAAR
948,John,,Neesham,295831,neeshamvm,johnneeshamvm@gmail.com,0,1,2020-08-03,2020-08-03,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/295831,,purdue university,a0r5w00000V42cCAAR,a0r5w00000V42cCAAR


Unnamed: 0,firstname,Middle_name__c,lastname,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c,Detailed_user_timeline_to_Tableau__c,nanoHUB_user_page__c,ORCID__c,Organization__c,Organization_email_derived__c,Organization_composite__c
31164,Sai,Saran,Dammavalam,313423,saran3072.d,saran3072.d@gmail.com,1,0,2021-01-21,2021-01-21,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/313423,,,,
31165,reem,hadaj,ALqarni,313424,reemo,alqarni.reem@hotmail.com,1,0,2021-01-21,2021-01-21,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/313424,,,,


In [52]:
# send data to Salesforce
db_s.send_data(good_rows)

[Success] Bulk job creation successful. Job ID = 7505w00000TmJX7AAN
hello
[Success] CSV upload successful. Job ID = 7505w00000TmJX7AAN
[Success] Closing job successful. Job ID = 7505w00000TmJX7AAN


In [53]:
# check status
db_s.check_bulk_status()

{'id': '7505w00000TmJX7AAN',
 'operation': 'upsert',
 'object': 'Contact',
 'createdById': '0055w00000DM5bOAAT',
 'createdDate': '2021-01-21T07:14:37.000+0000',
 'systemModstamp': '2021-01-21T07:14:39.000+0000',
 'state': 'InProgress',
 'externalIdFieldName': 'nanoHUB_user_ID__c',
 'concurrencyMode': 'Parallel',
 'contentType': 'CSV',
 'apiVersion': 47.0,
 'jobType': 'V2Ingest',
 'lineEnding': 'LF',
 'columnDelimiter': 'COMMA',
 'numberRecordsProcessed': 0,
 'numberRecordsFailed': 0,
 'retries': 0,
 'totalProcessingTime': 0,
 'apiActiveProcessingTime': 0,
 'apexProcessingTime': 0}

In [54]:
# check status
from pprint import pprint

pprint(db_s.check_bulk_failed_results())

''


In [55]:
display(good_rows.tail(2))
print(good_rows.shape)

Unnamed: 0,firstname,Middle_name__c,lastname,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c,Detailed_user_timeline_to_Tableau__c,nanoHUB_user_page__c,ORCID__c,Organization__c,Organization_email_derived__c,Organization_composite__c
31164,Sai,Saran,Dammavalam,313423,saran3072.d,saran3072.d@gmail.com,1,0,2021-01-21,2021-01-21,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/313423,,,,
31165,reem,hadaj,ALqarni,313424,reemo,alqarni.reem@hotmail.com,1,0,2021-01-21,2021-01-21,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/313424,,,,


(28833, 16)


## Verification
Checking to see if salesforce has updated contacts with the new users

request name and nanohub_user_id