# DB2-Salesforce connector: Basic user information updates

In [1]:
# Parameters
hours_range = 24*365*30 # number of hours to look back

# API settings
external_id = 'nanoHUB_user_ID__c'
object_id = 'Contact'

In [2]:
# Parameters
import sqlalchemy as sql
from nanoHUB.application import Application

application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')
wang159_myrmekes_db = application.new_db_engine('wang159_myrmekes')


salesforce = application.new_salesforce_engine()
db_s = salesforce


[1mnanoHUB - Serving Students, Researchers & Instructors[0m
Obtained Salesforce access token ...... True


In [3]:
import sys
import pandas as pd
import datetime

## Obtain contacts from DB2
that have last visit date within range of interest specified by day_range

In [4]:
# Hourly update for new registrations
date_cutoff = (datetime.datetime.today().date() - datetime.timedelta(hours=hours_range)).strftime('%Y-%m-%d')

In [5]:
sql_query = '''
  SELECT user_info.*
       , CASE
           WHEN 0 = LOCATE(' ', user_info.REST_OF_NAME)
             THEN NULL
           ELSE SUBSTRING(
                   user_info.REST_OF_NAME
             , 1
             , LOCATE(' ', user_info.REST_OF_NAME) - 1
             )
    END AS middle_name
       , SUBSTRING(
          user_info.REST_OF_NAME
    , 1 + LOCATE(' ', user_info.REST_OF_NAME)
    , LENGTH(user_info.REST_OF_NAME)
    )   AS last_name
    , (CASE WHEN profiles.profile_key = 'organization' THEN profiles.profile_value ELSE NULL end) organization
    , (CASE WHEN profiles.profile_key = 'orcid' THEN profiles.profile_value ELSE NULL end) orcid
    , (CASE WHEN profiles.profile_key = 'orgtype' THEN profiles.profile_value ELSE NULL end) orgtype
  FROM (
         SELECT user_id,
                username,
                block,
                email,
                sendEmail,
                registerDate,
                lastvisitDate
              , LOWER(RIGHT(email, length(email)-INSTR(email, '@'))) AS email_host
              , title.title
              , CASE
                  WHEN 0 = LOCATE(' ', title.REST_OF_NAME)
                    THEN title.REST_OF_NAME
                  ELSE SUBSTRING(
                          title.REST_OF_NAME
                    , 1
                    , LOCATE(' ', title.REST_OF_NAME) - 1
                    )
           END AS first_name
              , CASE
                  WHEN 0 = LOCATE(' ', title.REST_OF_NAME)
                    THEN NULL
                  ELSE SUBSTRING(
                          title.REST_OF_NAME
                    , LOCATE(' ', title.REST_OF_NAME) + 1
                    , LENGTH(title.REST_OF_NAME)
                    )
           END AS REST_OF_NAME
              , title.raw_full_name
         FROM (
                SELECT
                       user_id,
                       username,
                       block,
                       email,
                       sendEmail,
                       registerDate,
                       lastvisitDate
                     , CASE
                         WHEN SUBSTRING(TEST_DATA.full_name, 1, 3) IN ('PROF ', 'MR ', 'MS ', 'DR ', 'MRS ')
                           THEN LTRIM(RTRIM(SUBSTRING(TEST_DATA.full_name, 1, 3)))
                         ELSE NULL
                  END AS title
                     , CASE
                         WHEN SUBSTRING(TEST_DATA.full_name, 1, 3) IN ('PROF ', 'MR ', 'MS ', 'DR ', 'MRS ')
                           THEN LTRIM(RTRIM(SUBSTRING(TEST_DATA.full_name, 4, LENGTH(TEST_DATA.full_name))))
                         ELSE LTRIM(RTRIM(TEST_DATA.full_name))
                  END AS REST_OF_NAME
                     , TEST_DATA.raw_full_name
                FROM (
                       SELECT REPLACE(REPLACE(LTRIM(RTRIM(name)), '  ', ' '), '  ', ' ') AS full_name,
                              name                                                       AS raw_full_name,
                              user_id,
                              username,
                              block,
                              email,
                              sendEmail,
                              registerDate,
                              lastvisitDate
                       FROM (
                              SELECT
                                     id AS user_id,
                                     name,
                                     username,
                                     block,
                                     email,
                                     sendEmail,
                                     registerDate,
                                     lastvisitDate
                              FROM nanohub.jos_users
                            ) RAW_DATA
                     ) TEST_DATA
              ) title
       ) user_info
LEFT JOIN nanohub.jos_user_profiles profiles
    ON profiles.user_id = user_info.user_id
WHERE 
    user_info.username IS NOT NULL 
        AND 
    user_info.registerDate >= ' 
''' + "%s" +  "';"

sql_query = sql_query % date_cutoff 

In [6]:
df = pd.read_sql_query(sql_query, nanohub_db)
# display
display(df.head())
display(df.tail())

Unnamed: 0,user_id,username,block,email,sendEmail,registerDate,lastvisitDate,email_host,title,first_name,REST_OF_NAME,raw_full_name,middle_name,last_name,organization,orcid,orgtype
0,998,hubrepo,0,nkissebe@gmail.com,0,2014-11-13 21:09:09,NaT,gmail.com,,hubrepo,hubrepo,hubrepo hubrepo,,hubrepo,,,
1,998,hubrepo,0,nkissebe@gmail.com,0,2014-11-13 21:09:09,NaT,gmail.com,,hubrepo,hubrepo,hubrepo hubrepo,,hubrepo,,,
2,1683,support,0,support@nanohub.org,0,2008-11-19 22:51:04,2008-11-19 23:55:30,nanohub.org,,nanoHUB,support,nanoHUB support,,support,,,university
3,1683,support,0,support@nanohub.org,0,2008-11-19 22:51:04,2008-11-19 23:55:30,nanohub.org,,nanoHUB,support,nanoHUB support,,support,Purdue University,,
4,1683,support,0,support@nanohub.org,0,2008-11-19 22:51:04,2008-11-19 23:55:30,nanohub.org,,nanoHUB,support,nanoHUB support,,support,,,


Unnamed: 0,user_id,username,block,email,sendEmail,registerDate,lastvisitDate,email_host,title,first_name,REST_OF_NAME,raw_full_name,middle_name,last_name,organization,orcid,orgtype
1451594,347752,jsambeek,0,jsambeek@mit.edu,0,2021-12-01 03:45:42,2021-12-01 03:45:43,mit.edu,,Jack,Robert Van Sambeek,Jack Robert Van Sambeek,Robert,Van Sambeek,,,
1451595,347752,jsambeek,0,jsambeek@mit.edu,0,2021-12-01 03:45:42,2021-12-01 03:45:43,mit.edu,,Jack,Robert Van Sambeek,Jack Robert Van Sambeek,Robert,Van Sambeek,,,
1451596,347752,jsambeek,0,jsambeek@mit.edu,0,2021-12-01 03:45:42,2021-12-01 03:45:43,mit.edu,,Jack,Robert Van Sambeek,Jack Robert Van Sambeek,Robert,Van Sambeek,,,
1451597,347752,jsambeek,0,jsambeek@mit.edu,0,2021-12-01 03:45:42,2021-12-01 03:45:43,mit.edu,,Jack,Robert Van Sambeek,Jack Robert Van Sambeek,Robert,Van Sambeek,,,
1451598,347753,-186752,0,-186752@invalid,-1,2021-12-01 03:45:47,2021-12-01 03:45:48,invalid,,‍손세준(학부학생/공과대학,신소재공학),‍손세준(학부학생/공과대학 신소재공학),,신소재공학),,,


In [7]:
# Obtain company domain information from DB2
domain_df = pd.read_sql_query("select name as domain_name, domain, industry, `size range` as size, country from wang159_myrmekes.companies_email_domain", nanohub_db)

display(domain_df.head())

Unnamed: 0,domain_name,domain,industry,size,country
0,strategic interns,strategicinterns.com,education management,1-10,india
1,bosque y comunidad,bosqueycomunidad.org,non-profit organization management,1-10,spain
2,alaric as,alaric.no,management consulting,1-10,
3,tws technology inc.,talkwithsam.co,computer software,1-10,
4,"cardinal strategies, llc",cardinal-strategies.com,civil engineering,11-50,united states


In [8]:
# make sure domain is unique and drop NaN
domain_df = domain_df.drop_duplicates(subset='domain', keep='first').dropna()

In [10]:
#df = pd.merge(df, domain_df[domain_df.domain.isin(df['email_host'].unique()) & ~domain_df.domain.isna()], how='left', left_on='email_host', right_on='domain')

# display
#display(df.head())
#display(df.tail())

In [11]:
# get domain by nanoHUB organization field
df['organization'] = df.organization.astype('str').apply(str.lower)

df.loc[(df['organization']=='nan'),'organization'] = None

In [12]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
s_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/saxenap/venv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
import re

def clean_domain(this_domain_name):

    if not this_domain_name:
        return None

    # Replace all non-alphanumeric characters with space
    this_domain_name = re.sub("[^0-9a-zA-Z]+", " ", this_domain_name.lower())
    name_list = this_domain_name.split(' ')

    # remove all stop words
    name_list = ['' if x in s_words else x for x in name_list]

    return set(filter(None, name_list))

In [14]:
# clean domain name
domain_df['domain_cleaned_set'] = domain_df.domain_name.apply(clean_domain)

# clean nanohub org name
df['org_cleaned_set'] = df.organization.apply(clean_domain)

In [15]:
# hash cleaned domain name
domain_df['domain_cleaned_hash'] = domain_df['domain_cleaned_set'].apply(lambda x: '-'.join(sorted(list(x))) if x else None)

# hash cleaned clean nanohub org name
df['org_cleaned_hash'] = df['org_cleaned_set'].apply(lambda x: '-'.join(sorted(list(x))) if x else None)

## Collaborative filtering for commerical email hosts

In [16]:
def predict_if_commerical(this_email_host_df):
    # predict if this email host is commerical
    top_cnt = this_email_host_df['org_cleaned_hash'].value_counts()

    if top_cnt.shape[0] > 0:
        if (top_cnt.iloc[0]/top_cnt.sum()) < 0.2:
            return True
        else:
            return False

    else:
        return False

is_email_commerical = df[['email_host', 'org_cleaned_hash']].groupby('email_host').apply(predict_if_commerical)

In [17]:
# mark commerical email addresses
df = pd.merge(df, is_email_commerical.reset_index(name='is_email_commerical'), how='left', left_on='email_host', right_on='email_host' ,)

In [18]:
# obtain domain estimation from email addresses
df = pd.merge(df, domain_df[~domain_df.domain.isin(is_email_commerical[is_email_commerical].index)]['domain'], how='left', left_on='email_host', right_on='domain').rename(columns={'domain':'domain_by_email'})

# display
display(df.head(2).T)

Unnamed: 0,0,1
user_id,998,998
username,hubrepo,hubrepo
block,0,0
email,nkissebe@gmail.com,nkissebe@gmail.com
sendEmail,0,0
registerDate,2014-11-13 21:09:09,2014-11-13 21:09:09
lastvisitDate,NaT,NaT
email_host,gmail.com,gmail.com
title,,
first_name,hubrepo,hubrepo


## Fuzzy derive organization from nanoHUB profiles

In [19]:
# attempt direct join by hash
domain_subset_df = domain_df[domain_df.domain.isin(df.email_host.unique())][['domain', 'domain_cleaned_hash']]
domain_subset_df = domain_subset_df[~domain_subset_df.domain.isna()&~domain_subset_df.domain_cleaned_hash.isna()]

derived_df = pd.merge(df, domain_subset_df.rename(columns={'domain':'domain_by_profile'}),how='left', left_on='org_cleaned_hash', right_on='domain_cleaned_hash').drop('domain_cleaned_hash', axis=1)

# display
derived_df.sample(5).T

Unnamed: 0,260203,1187716,987542,1126933,294673
user_id,33762,292610,221817,279094,37399
username,pecheneg,williamlem,depannage92,frederik.hover,doodduuy
block,0,0,1,0,0
email,iv.pecheneg@berkeley.edu,befour@eauie.top,DISABLED_SPAM_bfddepannage91@gmail.com,frederik.hover@gmail.com,dukduy44@hotmail.com
sendEmail,0,1,1,-1,0
registerDate,2009-01-25 03:25:21,2020-06-24 06:47:11,2019-02-15 13:35:29,2020-02-21 03:52:56,2009-08-26 03:56:30
lastvisitDate,2012-06-14 06:01:33,2020-06-25 18:33:59,2019-02-15 13:36:17,2020-02-26 22:47:11,2009-09-11 14:26:43
email_host,berkeley.edu,eauie.top,gmail.com,gmail.com,hotmail.com
title,,,,,
first_name,Ivan,WilliamLem,Depannage,Frederik,napapol


In [20]:
# attempt to reduce domain_df size to speedup Jaccard calculation
from itertools import chain

all_word_set = set(chain.from_iterable(derived_df[(derived_df['domain_by_profile'].isna() & (~derived_df['organization'].isna()))].org_cleaned_set.values))

# select the domain_df entry with words from all_word_set
domain_subset_df = domain_df[domain_df.domain_cleaned_set.notnull()]
domain_subset_df = domain_subset_df[domain_subset_df.domain_cleaned_set.apply(lambda x: True if len(x.intersection(all_word_set)) > 0 else False)]

In [21]:
def get_org(this_df):
    # look at the cohort with this label, do they have a commonly agreed domain_by_email? If so, use it
    this_dbe_count = this_df.domain_by_email.value_counts()

    if this_dbe_count.shape[0] < 1:
        return None

    this_top_dbe_perc = this_dbe_count.iloc[0]/this_dbe_count.sum()

    # if top choice occupies majority of cases, then this cohort can be implied to this domain_by_email address
    if this_top_dbe_perc > 0.5:
        return this_dbe_count.index[0]


derived_hash = derived_df.groupby('org_cleaned_hash').apply(get_org)

In [22]:
derived_df = pd.merge(derived_df, derived_hash.reset_index(name='domain_by_profile_infer'), how='left', left_on='org_cleaned_hash', right_on='org_cleaned_hash')

In [23]:
# calculate a composite domain based on domain_by_email, domain_by_profile, and domain_by_profile_infer
def get_composite_domain(this_df):

    # if domain found via profile with direct match, use it
    if this_df.domain_by_profile != '':
        return this_df.domain_by_profile

    # if domain found via email, use it
    if this_df.domain_by_email != '':
        return this_df.domain_by_email

        # if domain found via profile but not with a direct match, use it
    if this_df.domain_by_profile_infer != '':
        return this_df.domain_by_profile_infer

    # else, no organization can be concluded, return none
    return None


derived_df['domain_final'] = derived_df[['domain_by_email', 'domain_by_profile', 'domain_by_profile_infer']].fillna('').apply(get_composite_domain, axis=1)

In [24]:
derived_df[derived_df.email_host == 'yahoo.com'][['domain_final', 'domain_by_email']]

Unnamed: 0,domain_final,domain_by_email
63,yahoo.com,yahoo.com
64,yahoo.com,yahoo.com
65,yahoo.com,yahoo.com
66,yahoo.com,yahoo.com
67,yahoo.com,yahoo.com
...,...,...
1452094,yahoo.com,yahoo.com
1452095,yahoo.com,yahoo.com
1452096,yahoo.com,yahoo.com
1452097,yahoo.com,yahoo.com


## Update Salesforce institution

In [25]:
# get a unique list of organizations that appears in contact
#org_df = derived_df[['domain_name', 'domain', 'industry', 'size', 'country']].drop_duplicates().dropna()
org_df = pd.merge(derived_df[['domain_final']].drop_duplicates().dropna(), domain_df, how='inner', left_on='domain_final', right_on='domain')
org_df = org_df[['domain_name', 'domain', 'industry', 'size', 'country']]

# display
display(org_df.head(2))

Unnamed: 0,domain_name,domain,industry,size,country
0,purdue university,purdue.edu,higher education,10001+,united states
1,yahoo,yahoo.com,internet,10001+,united states


In [26]:
df_sf = pd.DataFrame()

# Make sure NaN and NaT values are taken care of here
df_sf['Name'] = org_df['domain_name']

df_sf['Size_Range__c'] = org_df['size'].fillna('Unknown')
df_sf['Country__c'] = org_df['country']
df_sf['Domain__c'] = org_df['domain']
df_sf['Industry__c'] = org_df['industry']

sf_original_fields = df_sf.columns

# display
df_sf.head(2).T

Unnamed: 0,0,1
Name,purdue university,yahoo
Size_Range__c,10001+,10001+
Country__c,united states,united states
Domain__c,purdue.edu,yahoo.com
Industry__c,higher education,internet


In [27]:


db_s.object_id = 'organization__c'
db_s.external_id = 'Domain__c'

In [28]:
# send data to Salesforce
db_s.send_data(df_sf)

[Success] Bulk job creation successful. Job ID = 7505w00000bmd0sAAA
hello
[Success] CSV upload successful. Job ID = 7505w00000bmd0sAAA
[Success] Closing job successful. Job ID = 7505w00000bmd0sAAA


In [29]:
# check status
db_s.check_bulk_status()

{'id': '7505w00000bmd0sAAA',
 'operation': 'upsert',
 'object': 'organization__c',
 'createdById': '0055w00000DM5bOAAT',
 'createdDate': '2021-12-01T04:02:45.000+0000',
 'systemModstamp': '2021-12-01T04:02:46.000+0000',
 'state': 'UploadComplete',
 'externalIdFieldName': 'Domain__c',
 'concurrencyMode': 'Parallel',
 'contentType': 'CSV',
 'apiVersion': 47.0,
 'jobType': 'V2Ingest',
 'lineEnding': 'LF',
 'columnDelimiter': 'COMMA',
 'numberRecordsProcessed': 0,
 'numberRecordsFailed': 0,
 'retries': 0,
 'totalProcessingTime': 0,
 'apiActiveProcessingTime': 0,
 'apexProcessingTime': 0}

In [30]:
# check status
from pprint import pprint

pprint(db_s.check_bulk_failed_results())

''


## Obtain newly updated organization IDs from Salesforce

In [31]:
# create DB2 to Salesforce API object


In [32]:
# get Salesforce ID for organizations
sf_org_ID_df = db_s.query_data('SELECT Id, Domain__c FROM organization__c')

# display
sf_org_ID_df.head(3)

[Success] Bulk job creation successful. Job ID = 7505w00000bmczqAAA
{"id":"7505w00000bmczqAAA","operation":"query","object":"organization__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-12-01T04:03:14.000+0000","systemModstamp":"2021-12-01T04:03:14.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","retries":0,"totalProcessingTime":0}
{"id":"7505w00000bmczqAAA","operation":"query","object":"organization__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-12-01T04:03:14.000+0000","systemModstamp":"2021-12-01T04:03:16.000+0000","state":"JobComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":6508,"retries":0,"totalProcessingTime":897}
[Success] Bulk job completed successfully.


Unnamed: 0,Domain__c,Id
0,upenn.edu,a0r5w00000V42c0AAB
1,fer.hr,a0r5w00000V42c1AAB
2,wheatoncollege.edu,a0r5w00000V42c2AAB


In [33]:
# join salesforce ID back to contact DF
df = pd.merge(sf_org_ID_df, derived_df, how='right', left_on='Domain__c', right_on='domain_final').drop('Domain__c', axis=1).rename(columns={'Id':'Salesforce_org_ID'})

In [34]:
# display
display(df.head(2))

Unnamed: 0,Salesforce_org_ID,user_id,username,block,email,sendEmail,registerDate,lastvisitDate,email_host,title,...,organization,orcid,orgtype,org_cleaned_set,org_cleaned_hash,is_email_commerical,domain_by_email,domain_by_profile,domain_by_profile_infer,domain_final
0,,998,hubrepo,0,nkissebe@gmail.com,0,2014-11-13 21:09:09,NaT,gmail.com,,...,none,,,{none},none,False,,,,
1,,998,hubrepo,0,nkissebe@gmail.com,0,2014-11-13 21:09:09,NaT,gmail.com,,...,none,,,{none},none,False,,,,


## Match data with Salesforce format

In [39]:


# Make sure NaN and NaT values are taken care of here
df['raw_full_name'] = df.raw_full_name.str.strip()
df = df[df['raw_full_name'].notnull()]

display(df.head())
display(df.tail())

Unnamed: 0,Salesforce_org_ID,user_id,username,block,email,sendEmail,registerDate,lastvisitDate,email_host,title,...,organization,orcid,orgtype,org_cleaned_set,org_cleaned_hash,is_email_commerical,domain_by_email,domain_by_profile,domain_by_profile_infer,domain_final
0,,998,hubrepo,0,nkissebe@gmail.com,0,2014-11-13 21:09:09,NaT,gmail.com,,...,none,,,{none},none,False,,,,
1,,998,hubrepo,0,nkissebe@gmail.com,0,2014-11-13 21:09:09,NaT,gmail.com,,...,none,,,{none},none,False,,,,
2,,1683,support,0,support@nanohub.org,0,2008-11-19 22:51:04,2008-11-19 23:55:30,nanohub.org,,...,none,,university,{none},none,False,,,,
3,a0r5w00000V42cCAAR,1683,support,0,support@nanohub.org,0,2008-11-19 22:51:04,2008-11-19 23:55:30,nanohub.org,,...,purdue university,,,"{university, purdue}",purdue-university,False,,purdue.edu,purdue.edu,purdue.edu
4,,1683,support,0,support@nanohub.org,0,2008-11-19 22:51:04,2008-11-19 23:55:30,nanohub.org,,...,none,,,{none},none,False,,,,


Unnamed: 0,Salesforce_org_ID,user_id,username,block,email,sendEmail,registerDate,lastvisitDate,email_host,title,...,organization,orcid,orgtype,org_cleaned_set,org_cleaned_hash,is_email_commerical,domain_by_email,domain_by_profile,domain_by_profile_infer,domain_final
1453077,a0r5w00000V42kYAAR,347752,jsambeek,0,jsambeek@mit.edu,0,2021-12-01 03:45:42,2021-12-01 03:45:43,mit.edu,,...,none,,,{none},none,False,mit.edu,,,mit.edu
1453078,a0r5w00000V42kYAAR,347752,jsambeek,0,jsambeek@mit.edu,0,2021-12-01 03:45:42,2021-12-01 03:45:43,mit.edu,,...,none,,,{none},none,False,mit.edu,,,mit.edu
1453079,a0r5w00000V42kYAAR,347752,jsambeek,0,jsambeek@mit.edu,0,2021-12-01 03:45:42,2021-12-01 03:45:43,mit.edu,,...,none,,,{none},none,False,mit.edu,,,mit.edu
1453080,a0r5w00000V42kYAAR,347752,jsambeek,0,jsambeek@mit.edu,0,2021-12-01 03:45:42,2021-12-01 03:45:43,mit.edu,,...,none,,,{none},none,False,mit.edu,,,mit.edu
1453081,,347753,-186752,0,-186752@invalid,-1,2021-12-01 03:45:47,2021-12-01 03:45:48,invalid,,...,none,,,{none},none,False,,,,


In [40]:
df_sf = pd.DataFrame()

df_sf['firstname']            = df['first_name']
df_sf['Middle_name__c']                = df['middle_name']
df_sf['lastname']                      = df['last_name']

display(df_sf.head())
display(df_sf.tail())

Unnamed: 0,firstname,Middle_name__c,lastname
0,hubrepo,,hubrepo
1,hubrepo,,hubrepo
2,nanoHUB,,support
3,nanoHUB,,support
4,nanoHUB,,support


Unnamed: 0,firstname,Middle_name__c,lastname
1453077,Jack,Robert,Van Sambeek
1453078,Jack,Robert,Van Sambeek
1453079,Jack,Robert,Van Sambeek
1453080,Jack,Robert,Van Sambeek
1453081,‍손세준(학부학생/공과대학,,신소재공학)


In [41]:
display(df_sf[(df_sf['firstname'].isnull()) & (df_sf['Middle_name__c'].isnull()) & (df_sf['lastname'].isnull())])

Unnamed: 0,firstname,Middle_name__c,lastname


In [42]:
df_sf = df_sf[(df_sf['firstname'].notnull()) & (df_sf['Middle_name__c'].notnull()) & (df_sf['lastname'].notnull())]
display(df_sf.head())

Unnamed: 0,firstname,Middle_name__c,lastname
60,Pedro,G.,Mireles
61,Pedro,G.,Mireles
62,Pedro,G.,Mireles
63,Khaled,M.,Dadesh
64,Khaled,M.,Dadesh


In [45]:
df_sf['nanoHUB_user_ID__c']            = df['user_id']
df_sf['nanoHUB_username__c']           = df['username']
df_sf['Email']                         = df['email'].fillna('').apply(lambda x: '' if '@invalid' in x else x).apply(lambda x: '' if '@' not in x else x)

# for sendEmail: 0 = opt-out, 1 = receive email. For salesforce HasOptedOutOfEmail, it's exact opposite
df_sf['HasOptedOutOfEmail']            = df['sendEmail'].apply(lambda x: 0 if x==1 else 1)
df_sf['nanoHUB_account_BLOCKED__c']    = df['block'].fillna(0)

# solidify time-related columns from datetime to string
df_sf['nanoHUB_registration_date__c']  = df['registerDate'].dt.date.fillna('').astype('str')
df_sf['nanoHUB_last_active_date__c']   = df['lastvisitDate'].dt.date.fillna('').astype('str')


In [46]:
# Tableau detailed view
df_sf['Detailed_user_timeline_to_Tableau__c'] = df_sf['Email'].apply(lambda x: 'https://tableauqa.itap.purdue.edu/views/profile/ProfileTimeline?Id%20Email='+x+'\
&:iframeSizedToWindow=true&:embed=y&:showAppBanner=false\
&:display_count=no&:showVizHome=no#6' if x != '' else '')

# nanoHUB user profile
df_sf['nanoHUB_user_page__c'] = df_sf['nanoHUB_user_ID__c'].apply(lambda x: 'https://nanohub.org/members/%d'%x if x != '' else '')
df_sf['Organization__c'] = df['organization'].fillna('')

df_sf['ORCID__c'] = df['orcid'].fillna('')

# derived information
df_sf['Organization_email_derived__c'] = df['Salesforce_org_ID'].fillna(' ')

sf_original_fields = df_sf.columns

# display
df_sf.head()
df_sf.tail()

Unnamed: 0,firstname,Middle_name__c,lastname,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c,Detailed_user_timeline_to_Tableau__c,nanoHUB_user_page__c,Organization__c,ORCID__c,Organization_email_derived__c
1453076,Jack,Robert,Van Sambeek,347752,jsambeek,jsambeek@mit.edu,1,0,2021-12-01,2021-12-01,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/347752,none,,a0r5w00000V42kYAAR
1453077,Jack,Robert,Van Sambeek,347752,jsambeek,jsambeek@mit.edu,1,0,2021-12-01,2021-12-01,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/347752,none,,a0r5w00000V42kYAAR
1453078,Jack,Robert,Van Sambeek,347752,jsambeek,jsambeek@mit.edu,1,0,2021-12-01,2021-12-01,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/347752,none,,a0r5w00000V42kYAAR
1453079,Jack,Robert,Van Sambeek,347752,jsambeek,jsambeek@mit.edu,1,0,2021-12-01,2021-12-01,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/347752,none,,a0r5w00000V42kYAAR
1453080,Jack,Robert,Van Sambeek,347752,jsambeek,jsambeek@mit.edu,1,0,2021-12-01,2021-12-01,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/347752,none,,a0r5w00000V42kYAAR


## To Salesforce Sales Cloud CRM

In [47]:

db_s.object_id = object_id
db_s.external_id = external_id

In [48]:
# send data to Salesforce
db_s.send_data(df_sf)

[Success] Bulk job creation successful. Job ID = 7505w00000bmd1HAAQ
hello
[Success] CSV upload successful. Job ID = 7505w00000bmd1HAAQ
[Success] Closing job successful. Job ID = 7505w00000bmd1HAAQ


In [49]:
# check status
db_s.check_bulk_status()

{'id': '7505w00000bmd1HAAQ',
 'operation': 'upsert',
 'object': 'Contact',
 'createdById': '0055w00000DM5bOAAT',
 'createdDate': '2021-12-01T04:07:02.000+0000',
 'systemModstamp': '2021-12-01T04:08:30.000+0000',
 'state': 'UploadComplete',
 'externalIdFieldName': 'nanoHUB_user_ID__c',
 'concurrencyMode': 'Parallel',
 'contentType': 'CSV',
 'apiVersion': 47.0,
 'jobType': 'V2Ingest',
 'lineEnding': 'LF',
 'columnDelimiter': 'COMMA',
 'numberRecordsProcessed': 0,
 'numberRecordsFailed': 0,
 'retries': 0,
 'totalProcessingTime': 0,
 'apiActiveProcessingTime': 0,
 'apexProcessingTime': 0}

In [50]:
# check status
from pprint import pprint

pprint(db_s.check_bulk_failed_results())

''
