# DB2-Salesforce connector: Basic user information updates

In [1]:
# Parameters
hours_range = 24*365*30 # number of hours to look back

# API settings
external_id = 'nanoHUB_user_ID__c'
object_id = 'Contact'

In [2]:
# Parameters
import sqlalchemy as sql
from nanoHUB.application import Application

application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')

salesforce = application.new_salesforce_engine()
db_s = salesforce


[1mnanoHUB - Serving Students, Researchers & Instructors[0m
Obtained Salesforce access token ...... True


In [3]:
from nanoHUB.logger import logger
log = logger('task_user_basic_updates')

In [4]:
import sys
import pandas as pd
import datetime

In [5]:
def get_df_for_yesterday(df: pd.DataFrame, column_name: str = 'registerDate'): 
    df[column_name] = pd.to_datetime(df[column_name])
    todays_date = df[column_name].dt.date.max() - datetime.timedelta(days=1)
    return get_df_for(todays_date, df, column_name)

def get_df_for(for_date: datetime, df: pd.DataFrame, column_name: str = 'registerDate'):
    df[column_name] = pd.to_datetime(df[column_name])
    return df[df[column_name].dt.date == for_date]

def get_number_of_registered_users_for(for_date: datetime, df: pd.DataFrame, column_name: str = 'registerDate'):
    df = get_df_for(for_date, df, column_name)
    log.info("Number of users who registered on (%s) is: %d" %(for_date, len(df.index)))
    return len(df.index)
    
def get_number_of_registered_users_for_yesterday(df: pd.DataFrame, column_name: str = 'registerDate'):
    df[column_name] = pd.to_datetime(df[column_name])
    todays_date = df[column_name].dt.date.max() - datetime.timedelta(days=1)
    return get_number_of_registered_users_for(todays_date, df, column_name)

## Obtain contacts from DB2
that have last visit date within range of interest specified by day_range

In [6]:
# Hourly update for new registrations
date_cutoff = (datetime.datetime.today().date() - datetime.timedelta(hours=hours_range)).strftime('%Y-%m-%d')
display(date_cutoff)

'1991-12-10'

In [7]:
salutations = [
    'PROF ', 
    'MR ', 
    'MS ', 
    'DR ', 
    'MRS '
]
salutations_str = ', '.join(list(map(lambda x: '%s', salutations)))

display(salutations_str)

'%s, %s, %s, %s, %s'

In [8]:
sql_query = r"""
SELECT user_id
     , username
     , registerDate
     , lastvisitDate
     , raw_full_name
     , title
     , first_name
     , middle_name
     , last_name
     , LOWER(MAX(calc_data.organization)) AS organization
     , LOWER(MAX(calc_data.orgtype))      AS orgtype
     , LOWER(MAX(calc_data.orcid))        AS orcid
     , email
     , LOWER(RIGHT(email, length(email)-INSTR(email, '@'))) AS email_host
     , block
     , sendEmail
FROM (
       SELECT user_info.*
            , CASE
                WHEN 0 = LOCATE(' ', user_info.REST_OF_NAME)
                  THEN NULL
                ELSE SUBSTRING(
                        user_info.REST_OF_NAME
                  , 1
                  , LOCATE(' ', user_info.REST_OF_NAME) - 1
                  )
         END AS                                                                                 middle_name
            , SUBSTRING(
               user_info.REST_OF_NAME
         , 1 + LOCATE(' ', user_info.REST_OF_NAME)
         , LENGTH(user_info.REST_OF_NAME)
         )   AS                                                                                 last_name
            , (CASE WHEN profiles.profile_key = 'organization' THEN profiles.profile_value end) organization
            , (CASE WHEN profiles.profile_key = 'orcid' THEN profiles.profile_value end)        orcid
            , (CASE WHEN profiles.profile_key = 'orgtype' THEN profiles.profile_value end)      orgtype
  
       FROM (
              SELECT user_id,
                     username,
                     block,
                     email,
                     sendEmail,
                     registerDate,
                     lastvisitDate
                      ,
                     title.title
                      ,
                     CASE
                       WHEN 0 = LOCATE(' ', title.REST_OF_NAME)
                         THEN title.REST_OF_NAME
                       ELSE SUBSTRING(
                               title.REST_OF_NAME
                         , 1
                         , LOCATE(' ', title.REST_OF_NAME) - 1
                         )
                       END AS first_name
                      ,
                     CASE
                       WHEN 0 = LOCATE(' ', title.REST_OF_NAME)
                         THEN NULL
                       ELSE SUBSTRING(
                               title.REST_OF_NAME
                         , LOCATE(' ', title.REST_OF_NAME) + 1
                         , LENGTH(title.REST_OF_NAME)
                         )
                       END AS REST_OF_NAME
                      ,
                     title.raw_full_name
              FROM (
                     SELECT user_id,
                            username,
                            block,
                            email,
                            sendEmail,
                            registerDate,
                            lastvisitDate
                             ,
                            CASE
                              WHEN SUBSTRING(TEST_DATA.full_name, 1, 3) IN (%s)
                                THEN LTRIM(RTRIM(SUBSTRING(TEST_DATA.full_name, 1, 3)))
                              ELSE NULL
                              END AS title
                             ,
                            CASE
                              WHEN SUBSTRING(TEST_DATA.full_name, 1, 3) IN (%s)
                                THEN LTRIM(RTRIM(SUBSTRING(TEST_DATA.full_name, 4, LENGTH(TEST_DATA.full_name))))
                              ELSE LTRIM(RTRIM(TEST_DATA.full_name))
                              END AS REST_OF_NAME
                             ,
                            TEST_DATA.raw_full_name
                     FROM (
                            SELECT REPLACE(REPLACE(LTRIM(RTRIM(name)), '  ', ' '), '  ', ' ') AS full_name,
                                   name                                                       AS raw_full_name,
                                   user_id,
                                   username,
                                   block,
                                   email,
                                   sendEmail,
                                   registerDate,
                                   lastvisitDate
                            FROM (
                                   SELECT id AS user_id,
                                          name,
                                          username,
                                          block,
                                          email,
                                          sendEmail,
                                          registerDate,
                                          lastvisitDate
                                   FROM nanohub.jos_users
                                   WHERE registerDate >= %%s
                                 ) RAW_DATA
                          ) TEST_DATA
                   ) title
            ) user_info
              LEFT JOIN nanohub.jos_user_profiles profiles
                        ON profiles.user_id = user_info.user_id
                          AND profiles.profile_key IN ('orgtype', 'organization', 'orcid')
     ) AS calc_data
GROUP BY calc_data.user_id
;
"""

sql_query = sql_query % (salutations_str, salutations_str)
# display(sql_query)
# sql_query = sql_query % date_cutoff 

In [9]:
df = pd.read_sql_query(
        sql_query, 
        nanohub_db, 
        params=salutations + salutations + [date_cutoff],
        parse_dates=['registerDate', 'lastvisitDate']
)
# display
display(df.head())
display(df.tail())

Unnamed: 0,user_id,username,registerDate,lastvisitDate,raw_full_name,title,first_name,middle_name,last_name,organization,orgtype,orcid,email,email_host,block,sendEmail
0,998,hubrepo,2014-11-13 21:09:09,NaT,hubrepo hubrepo,,hubrepo,,hubrepo,,,,nkissebe@gmail.com,gmail.com,0,0
1,1683,support,2008-11-19 22:51:04,2008-11-19 23:55:30,nanoHUB support,,nanoHUB,,support,purdue university,university,,support@nanohub.org,nanohub.org,0,0
2,1684,gridstat,2008-11-18 17:29:56,2020-02-14 18:50:14,Grid Statistics,,Grid,,Statistics,purdue university,universitystaff,,gridstat@nanohub.org,nanohub.org,0,0
3,1685,ncn,2008-11-11 19:17:04,NaT,NCN NCN,,NCN,,NCN,purdue university,,,ncn@nanohub.org,nanohub.org,0,0
4,1686,nanohub,2014-06-26 19:38:57,NaT,nanoHUB nanoHUB,,nanoHUB,,nanoHUB,purdue university,,,apps@nanohub.org,nanohub.org,0,0


Unnamed: 0,user_id,username,registerDate,lastvisitDate,raw_full_name,title,first_name,middle_name,last_name,organization,orgtype,orcid,email,email_host,block,sendEmail
265584,347898,richardwhite33,2021-12-02 15:07:51,2021-12-02 15:10:36,Richard White,,Richard,,White,harvard university,nationallab,,stasvojtenko2@gmail.com,gmail.com,0,0
265585,347899,tonhanks22,2021-12-02 15:10:46,2021-12-02 15:10:46,Tom Hanks,,Tom,,Hanks,chicago state university,precollegefacultystaff,,tonhanks22@gmail.com,gmail.com,0,0
265586,347900,dg88,2021-12-02 16:01:24,2021-12-02 16:02:18,dan green,,dan,,green,wright state university,industry,,dangreen1011@gmail.com,gmail.com,0,0
265587,347901,tripatr,2021-12-02 16:23:14,2021-12-02 16:23:14,Rahul Tripathi,,Rahul,,Tripathi,purdue university,precollegefacultystaff,,tripatr@purdue.edu,purdue.edu,0,0
265588,347903,-186875,2021-12-02 16:54:51,2021-12-02 16:54:51,Fazy khan,,Fazy,,khan,,,,-186875@invalid,invalid,0,-1


In [10]:
display(get_number_of_registered_users_for_yesterday(df))

[INFO] [1175540541 - task_user_basic_updates]: Number of users who registered on (2021-12-01) is: 73 [1175540541.get_number_of_registered_users_for:12]


73

In [11]:
# get domain by nanoHUB organization field
df['organization'] = df.organization.astype('str').apply(str.lower)

df.loc[(df['organization']=='nan'),'organization'] = None

In [12]:
display(df.head())

Unnamed: 0,user_id,username,registerDate,lastvisitDate,raw_full_name,title,first_name,middle_name,last_name,organization,orgtype,orcid,email,email_host,block,sendEmail
0,998,hubrepo,2014-11-13 21:09:09,NaT,hubrepo hubrepo,,hubrepo,,hubrepo,none,,,nkissebe@gmail.com,gmail.com,0,0
1,1683,support,2008-11-19 22:51:04,2008-11-19 23:55:30,nanoHUB support,,nanoHUB,,support,purdue university,university,,support@nanohub.org,nanohub.org,0,0
2,1684,gridstat,2008-11-18 17:29:56,2020-02-14 18:50:14,Grid Statistics,,Grid,,Statistics,purdue university,universitystaff,,gridstat@nanohub.org,nanohub.org,0,0
3,1685,ncn,2008-11-11 19:17:04,NaT,NCN NCN,,NCN,,NCN,purdue university,,,ncn@nanohub.org,nanohub.org,0,0
4,1686,nanohub,2014-06-26 19:38:57,NaT,nanoHUB nanoHUB,,nanoHUB,,nanoHUB,purdue university,,,apps@nanohub.org,nanohub.org,0,0


In [13]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
s_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/saxenap/venv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
import re

def clean_domain(this_domain_name):

    if not this_domain_name:
        return None

    # Replace all non-alphanumeric characters with space
    this_domain_name = re.sub("[^0-9a-zA-Z]+", " ", this_domain_name.lower())
    name_list = this_domain_name.split(' ')

    # remove all stop words
    name_list = ['' if x in s_words else x for x in name_list]

    return set(filter(None, name_list))

In [15]:
# clean nanohub org name
df['org_cleaned_set'] = df.organization.apply(clean_domain)

# hash cleaned clean nanohub org name
df['org_cleaned_hash'] = df['org_cleaned_set'].apply(lambda x: '-'.join(sorted(list(x))) if x else None)


## Collaborative filtering for commerical email hosts

In [16]:
def predict_if_commerical(this_email_host_df):
    # predict if this email host is commerical
    top_cnt = this_email_host_df['org_cleaned_hash'].value_counts()

    if top_cnt.shape[0] > 0:
        if (top_cnt.iloc[0]/top_cnt.sum()) < 0.2:
            return True
        else:
            return False

    else:
        return False

is_email_commerical = df[['email_host', 'org_cleaned_hash']].groupby('email_host').apply(predict_if_commerical)

In [17]:
# mark commerical email addresses
df = pd.merge(df, is_email_commerical.reset_index(name='is_email_commerical'), how='left', left_on='email_host', right_on='email_host' ,)

In [18]:
display(get_number_of_registered_users_for_yesterday(df))

[INFO] [1175540541 - task_user_basic_updates]: Number of users who registered on (2021-12-01) is: 73 [1175540541.get_number_of_registered_users_for:12]


73

In [19]:
wang159_myrmekes_db = application.new_db_engine('wang159_myrmekes')

sql_query = """
SELECT 
    name AS domain_name, 
    domain, 
    industry, 
    `size range` as size, 
    country 
FROM wang159_myrmekes.companies_email_domain
;
"""
domain_df = pd.read_sql_query(sql_query, wang159_myrmekes_db)
display(domain_df.head())

Unnamed: 0,domain_name,domain,industry,size,country
0,strategic interns,strategicinterns.com,education management,1-10,india
1,bosque y comunidad,bosqueycomunidad.org,non-profit organization management,1-10,spain
2,alaric as,alaric.no,management consulting,1-10,
3,tws technology inc.,talkwithsam.co,computer software,1-10,
4,"cardinal strategies, llc",cardinal-strategies.com,civil engineering,11-50,united states


In [20]:
# make sure domain is unique and drop NaN
domain_df = domain_df.drop_duplicates(subset='domain', keep='first').dropna()

# clean domain name
domain_df['domain_cleaned_set'] = domain_df.domain_name.apply(clean_domain)

# hash cleaned domain name
domain_df['domain_cleaned_hash'] = domain_df['domain_cleaned_set'].apply(lambda x: '-'.join(sorted(list(x))) if x else None)

In [21]:
# obtain domain estimation from email addresses
df = pd.merge(df, domain_df[~domain_df.domain.isin(is_email_commerical[is_email_commerical].index)]['domain'], how='left', left_on='email_host', right_on='domain').rename(columns={'domain':'domain_by_email'})

# display
display(df.head())
display(df.tail())

Unnamed: 0,user_id,username,registerDate,lastvisitDate,raw_full_name,title,first_name,middle_name,last_name,organization,orgtype,orcid,email,email_host,block,sendEmail,org_cleaned_set,org_cleaned_hash,is_email_commerical,domain_by_email
0,998,hubrepo,2014-11-13 21:09:09,NaT,hubrepo hubrepo,,hubrepo,,hubrepo,none,,,nkissebe@gmail.com,gmail.com,0,0,{none},none,False,
1,1683,support,2008-11-19 22:51:04,2008-11-19 23:55:30,nanoHUB support,,nanoHUB,,support,purdue university,university,,support@nanohub.org,nanohub.org,0,0,"{university, purdue}",purdue-university,False,
2,1684,gridstat,2008-11-18 17:29:56,2020-02-14 18:50:14,Grid Statistics,,Grid,,Statistics,purdue university,universitystaff,,gridstat@nanohub.org,nanohub.org,0,0,"{university, purdue}",purdue-university,False,
3,1685,ncn,2008-11-11 19:17:04,NaT,NCN NCN,,NCN,,NCN,purdue university,,,ncn@nanohub.org,nanohub.org,0,0,"{university, purdue}",purdue-university,False,
4,1686,nanohub,2014-06-26 19:38:57,NaT,nanoHUB nanoHUB,,nanoHUB,,nanoHUB,purdue university,,,apps@nanohub.org,nanohub.org,0,0,"{university, purdue}",purdue-university,False,


Unnamed: 0,user_id,username,registerDate,lastvisitDate,raw_full_name,title,first_name,middle_name,last_name,organization,orgtype,orcid,email,email_host,block,sendEmail,org_cleaned_set,org_cleaned_hash,is_email_commerical,domain_by_email
265584,347898,richardwhite33,2021-12-02 15:07:51,2021-12-02 15:10:36,Richard White,,Richard,,White,harvard university,nationallab,,stasvojtenko2@gmail.com,gmail.com,0,0,"{university, harvard}",harvard-university,False,
265585,347899,tonhanks22,2021-12-02 15:10:46,2021-12-02 15:10:46,Tom Hanks,,Tom,,Hanks,chicago state university,precollegefacultystaff,,tonhanks22@gmail.com,gmail.com,0,0,"{chicago, university, state}",chicago-state-university,False,
265586,347900,dg88,2021-12-02 16:01:24,2021-12-02 16:02:18,dan green,,dan,,green,wright state university,industry,,dangreen1011@gmail.com,gmail.com,0,0,"{wright, state, university}",state-university-wright,False,
265587,347901,tripatr,2021-12-02 16:23:14,2021-12-02 16:23:14,Rahul Tripathi,,Rahul,,Tripathi,purdue university,precollegefacultystaff,,tripatr@purdue.edu,purdue.edu,0,0,"{university, purdue}",purdue-university,False,purdue.edu
265588,347903,-186875,2021-12-02 16:54:51,2021-12-02 16:54:51,Fazy khan,,Fazy,,khan,none,,,-186875@invalid,invalid,0,-1,{none},none,False,


## Fuzzy derive organization from nanoHUB profiles

In [22]:
# attempt direct join by hash
domain_subset_df = domain_df[domain_df.domain.isin(df.email_host.unique())][['domain', 'domain_cleaned_hash']]
domain_subset_df = domain_subset_df[~domain_subset_df.domain.isna()&~domain_subset_df.domain_cleaned_hash.isna()]

derived_df = pd.merge(df, domain_subset_df.rename(columns={'domain':'domain_by_profile'}),how='left', left_on='org_cleaned_hash', right_on='domain_cleaned_hash').drop('domain_cleaned_hash', axis=1)

# display
display(derived_df.sample(5))

Unnamed: 0,user_id,username,registerDate,lastvisitDate,raw_full_name,title,first_name,middle_name,last_name,organization,...,orcid,email,email_host,block,sendEmail,org_cleaned_set,org_cleaned_hash,is_email_commerical,domain_by_email,domain_by_profile
147140,173296,k0185123,2017-06-21 07:11:16,2017-06-22 13:55:41,Ethan Kao,,Ethan,,Kao,none,...,,ethanideas@livemail.tw,livemail.tw,0,0,{none},none,False,,
96863,107229,vasili_p,2014-09-03 12:44:02,2021-07-21 18:26:49,Vasili Perebeinos,,Vasili,,Perebeinos,none,...,,vasili_p@yahoo.com,yahoo.com,0,-1,{none},none,False,yahoo.com,
90371,98488,xgilik00,2014-02-26 09:01:13,2014-04-23 08:15:45,Ales G,,Ales,,G,none,...,,xgilik00@stud.feec.vutbr.cz,stud.feec.vutbr.cz,0,-1,{none},none,False,,
198370,260172,peonyquart90,2019-08-29 14:15:59,NaT,Sawyer Robertson,,Sawyer,,Robertson,none,...,,UfgrddzVzrirxfi@hotmail.com,hotmail.com,0,1,{none},none,False,,
106930,119381,ericgregg300,2015-03-03 15:45:43,2015-03-03 16:41:55,Eric Gregg,,Eric,,Gregg,none,...,,greg9910@bears.unco.edu,bears.unco.edu,0,-1,{none},none,False,,


In [23]:
# attempt to reduce domain_df size to speedup Jaccard calculation
from itertools import chain

all_word_set = set(chain.from_iterable(derived_df[(derived_df['domain_by_profile'].isna() & (~derived_df['organization'].isna()))].org_cleaned_set.values))

# select the domain_df entry with words from all_word_set
domain_subset_df = domain_df[domain_df.domain_cleaned_set.notnull()]
domain_subset_df = domain_subset_df[domain_subset_df.domain_cleaned_set.apply(lambda x: True if len(x.intersection(all_word_set)) > 0 else False)]

In [24]:
def get_org(this_df):
    # look at the cohort with this label, do they have a commonly agreed domain_by_email? If so, use it
    this_dbe_count = this_df.domain_by_email.value_counts()

    if this_dbe_count.shape[0] < 1:
        return None

    this_top_dbe_perc = this_dbe_count.iloc[0]/this_dbe_count.sum()

    # if top choice occupies majority of cases, then this cohort can be implied to this domain_by_email address
    if this_top_dbe_perc > 0.5:
        return this_dbe_count.index[0]


derived_hash = derived_df.groupby('org_cleaned_hash').apply(get_org)

In [25]:
derived_df = pd.merge(derived_df, derived_hash.reset_index(name='domain_by_profile_infer'), how='left', left_on='org_cleaned_hash', right_on='org_cleaned_hash')

In [26]:
# calculate a composite domain based on domain_by_email, domain_by_profile, and domain_by_profile_infer
def get_composite_domain(this_df):

    # if domain found via profile with direct match, use it
    if this_df.domain_by_profile != '':
        return this_df.domain_by_profile

    # if domain found via email, use it
    if this_df.domain_by_email != '':
        return this_df.domain_by_email

        # if domain found via profile but not with a direct match, use it
    if this_df.domain_by_profile_infer != '':
        return this_df.domain_by_profile_infer

    # else, no organization can be concluded, return none
    return None


derived_df['domain_final'] = derived_df[['domain_by_email', 'domain_by_profile', 'domain_by_profile_infer']].fillna('').apply(get_composite_domain, axis=1)

In [27]:
derived_df[derived_df.email_host == 'yahoo.com'][['domain_final', 'domain_by_email']]

Unnamed: 0,domain_final,domain_by_email
8,yahoo.com,yahoo.com
17,yahoo.com,yahoo.com
42,yahoo.com,yahoo.com
52,yahoo.com,yahoo.com
76,yahoo.com,yahoo.com
...,...,...
266650,yahoo.com,yahoo.com
266680,yahoo.com,yahoo.com
266686,csu.edu,yahoo.com
266766,yahoo.com,yahoo.com


## Update Salesforce institution

In [28]:
# get a unique list of organizations that appears in contact
#org_df = derived_df[['domain_name', 'domain', 'industry', 'size', 'country']].drop_duplicates().dropna()
org_df = pd.merge(derived_df[['domain_final']].drop_duplicates().dropna(), domain_df, how='inner', left_on='domain_final', right_on='domain')
org_df = org_df[['domain_name', 'domain', 'industry', 'size', 'country']]

# display
display(org_df.head(2))

Unnamed: 0,domain_name,domain,industry,size,country
0,purdue university,purdue.edu,higher education,10001+,united states
1,yahoo,yahoo.com,internet,10001+,united states


In [29]:
df_sf = pd.DataFrame()

# Make sure NaN and NaT values are taken care of here
df_sf['Name'] = org_df['domain_name']

df_sf['Size_Range__c'] = org_df['size'].fillna('Unknown')
df_sf['Country__c'] = org_df['country']
df_sf['Domain__c'] = org_df['domain']
df_sf['Industry__c'] = org_df['industry']

sf_original_fields = df_sf.columns

# display
display(df_sf.head())
display(df_sf.tail())

Unnamed: 0,Name,Size_Range__c,Country__c,Domain__c,Industry__c
0,purdue university,10001+,united states,purdue.edu,higher education
1,yahoo,10001+,united states,yahoo.com,internet
2,netzero inc,1-10,united states,netzero.net,computer software
3,georgetown university,5001-10000,united states,georgetown.edu,higher education
4,university of cincinnati,10001+,united states,uc.edu,higher education


Unnamed: 0,Name,Size_Range__c,Country__c,Domain__c,Industry__c
3712,loeb & loeb llp,501-1000,united states,loeb.com,law practice
3713,st. petersburg college,1001-5000,united states,spcollege.edu,higher education
3714,canadian nuclear laboratories,1001-5000,canada,cnl.ca,research
3715,noon,501-1000,united arab emirates,noon.com,internet
3716,skvare,1-10,united states,skvare.com,information technology and services


In [30]:


db_s.object_id = 'organization__c'
db_s.external_id = 'Domain__c'

In [31]:
# send data to Salesforce
db_s.send_data(df_sf)

[Success] Bulk job creation successful. Job ID = 7505w00000bmnaXAAQ
hello
[Success] CSV upload successful. Job ID = 7505w00000bmnaXAAQ
[Success] Closing job successful. Job ID = 7505w00000bmnaXAAQ


In [32]:
# check status
db_s.check_bulk_status()

{'id': '7505w00000bmnaXAAQ',
 'operation': 'upsert',
 'object': 'organization__c',
 'createdById': '0055w00000DM5bOAAT',
 'createdDate': '2021-12-02T17:02:37.000+0000',
 'systemModstamp': '2021-12-02T17:02:38.000+0000',
 'state': 'InProgress',
 'externalIdFieldName': 'Domain__c',
 'concurrencyMode': 'Parallel',
 'contentType': 'CSV',
 'apiVersion': 47.0,
 'jobType': 'V2Ingest',
 'lineEnding': 'LF',
 'columnDelimiter': 'COMMA',
 'numberRecordsProcessed': 0,
 'numberRecordsFailed': 0,
 'retries': 0,
 'totalProcessingTime': 0,
 'apiActiveProcessingTime': 0,
 'apexProcessingTime': 0}

In [33]:
# check status
from pprint import pprint

pprint(db_s.check_bulk_failed_results())

''


## Obtain newly updated organization IDs from Salesforce

In [34]:
# create DB2 to Salesforce API object


In [35]:
# get Salesforce ID for organizations
sf_org_ID_df = db_s.query_data('SELECT Id, Domain__c FROM organization__c')

# display
sf_org_ID_df.head(3)

[Success] Bulk job creation successful. Job ID = 7505w00000bmnZ0AAI
{"id":"7505w00000bmnZ0AAI","operation":"query","object":"organization__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-12-02T17:02:39.000+0000","systemModstamp":"2021-12-02T17:02:39.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":0,"retries":0,"totalProcessingTime":0}
{"id":"7505w00000bmnZ0AAI","operation":"query","object":"organization__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-12-02T17:02:39.000+0000","systemModstamp":"2021-12-02T17:02:41.000+0000","state":"JobComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":6508,"retries":0,"totalProcessingTime":548}
[Success] Bulk job completed successfully.


Unnamed: 0,Domain__c,Id
0,upenn.edu,a0r5w00000V42c0AAB
1,fer.hr,a0r5w00000V42c1AAB
2,wheatoncollege.edu,a0r5w00000V42c2AAB


In [36]:
# join salesforce ID back to contact DF
df = pd.merge(sf_org_ID_df, derived_df, how='right', left_on='Domain__c', right_on='domain_final').drop('Domain__c', axis=1).rename(columns={'Id':'Salesforce_org_ID'})

In [37]:
# display
display(df.head(2))

Unnamed: 0,Salesforce_org_ID,user_id,username,registerDate,lastvisitDate,raw_full_name,title,first_name,middle_name,last_name,...,email_host,block,sendEmail,org_cleaned_set,org_cleaned_hash,is_email_commerical,domain_by_email,domain_by_profile,domain_by_profile_infer,domain_final
0,,998,hubrepo,2014-11-13 21:09:09,NaT,hubrepo hubrepo,,hubrepo,,hubrepo,...,gmail.com,0,0,{none},none,False,,,,
1,a0r5w00000V42cCAAR,1683,support,2008-11-19 22:51:04,2008-11-19 23:55:30,nanoHUB support,,nanoHUB,,support,...,nanohub.org,0,0,"{university, purdue}",purdue-university,False,,purdue.edu,purdue.edu,purdue.edu


## Match data with Salesforce format

In [38]:
display(get_number_of_registered_users_for_yesterday(df))

[INFO] [1175540541 - task_user_basic_updates]: Number of users who registered on (2021-12-01) is: 73 [1175540541.get_number_of_registered_users_for:12]


73

In [39]:
# Make sure NaN and NaT values are taken care of here
df['raw_full_name'] = df.raw_full_name.str.strip()

display(df['raw_full_name'].notnull())

0         True
1         True
2         True
3         True
4         True
          ... 
267069    True
267070    True
267071    True
267072    True
267073    True
Name: raw_full_name, Length: 267074, dtype: bool

In [40]:
display(get_number_of_registered_users_for_yesterday(df))

[INFO] [1175540541 - task_user_basic_updates]: Number of users who registered on (2021-12-01) is: 73 [1175540541.get_number_of_registered_users_for:12]


73

In [41]:
df = df[df['raw_full_name'].notnull()]

display(df.head())
display(df.tail())

Unnamed: 0,Salesforce_org_ID,user_id,username,registerDate,lastvisitDate,raw_full_name,title,first_name,middle_name,last_name,...,email_host,block,sendEmail,org_cleaned_set,org_cleaned_hash,is_email_commerical,domain_by_email,domain_by_profile,domain_by_profile_infer,domain_final
0,,998,hubrepo,2014-11-13 21:09:09,NaT,hubrepo hubrepo,,hubrepo,,hubrepo,...,gmail.com,0,0,{none},none,False,,,,
1,a0r5w00000V42cCAAR,1683,support,2008-11-19 22:51:04,2008-11-19 23:55:30,nanoHUB support,,nanoHUB,,support,...,nanohub.org,0,0,"{university, purdue}",purdue-university,False,,purdue.edu,purdue.edu,purdue.edu
2,a0r5w00000V42cCAAR,1684,gridstat,2008-11-18 17:29:56,2020-02-14 18:50:14,Grid Statistics,,Grid,,Statistics,...,nanohub.org,0,0,"{university, purdue}",purdue-university,False,,purdue.edu,purdue.edu,purdue.edu
3,a0r5w00000V42cCAAR,1685,ncn,2008-11-11 19:17:04,NaT,NCN NCN,,NCN,,NCN,...,nanohub.org,0,0,"{university, purdue}",purdue-university,False,,purdue.edu,purdue.edu,purdue.edu
4,a0r5w00000V42cCAAR,1686,nanohub,2014-06-26 19:38:57,NaT,nanoHUB nanoHUB,,nanoHUB,,nanoHUB,...,nanohub.org,0,0,"{university, purdue}",purdue-university,False,,purdue.edu,purdue.edu,purdue.edu


Unnamed: 0,Salesforce_org_ID,user_id,username,registerDate,lastvisitDate,raw_full_name,title,first_name,middle_name,last_name,...,email_host,block,sendEmail,org_cleaned_set,org_cleaned_hash,is_email_commerical,domain_by_email,domain_by_profile,domain_by_profile_infer,domain_final
267069,a0r5w00000V42kEAAR,347898,richardwhite33,2021-12-02 15:07:51,2021-12-02 15:10:36,Richard White,,Richard,,White,...,gmail.com,0,0,"{university, harvard}",harvard-university,False,,harvard.edu,,harvard.edu
267070,a0r5w00000V42iaAAB,347899,tonhanks22,2021-12-02 15:10:46,2021-12-02 15:10:46,Tom Hanks,,Tom,,Hanks,...,gmail.com,0,0,"{chicago, university, state}",chicago-state-university,False,,csu.edu,,csu.edu
267071,a0r5w00000V42qYAAR,347900,dg88,2021-12-02 16:01:24,2021-12-02 16:02:18,dan green,,dan,,green,...,gmail.com,0,0,"{wright, state, university}",state-university-wright,False,,wright.edu,,wright.edu
267072,a0r5w00000V42cCAAR,347901,tripatr,2021-12-02 16:23:14,2021-12-02 16:23:14,Rahul Tripathi,,Rahul,,Tripathi,...,purdue.edu,0,0,"{university, purdue}",purdue-university,False,purdue.edu,purdue.edu,purdue.edu,purdue.edu
267073,,347903,-186875,2021-12-02 16:54:51,2021-12-02 16:54:51,Fazy khan,,Fazy,,khan,...,invalid,0,-1,{none},none,False,,,,


In [42]:
display(get_number_of_registered_users_for_yesterday(df))

[INFO] [1175540541 - task_user_basic_updates]: Number of users who registered on (2021-12-01) is: 73 [1175540541.get_number_of_registered_users_for:12]


73

In [43]:
df_sf = pd.DataFrame()

df_sf['firstname']         = df['first_name']
# df_sf['Middle_name__c']                = df['middle_name']
df_sf['middlename']        = df['middle_name']
df_sf['lastname']          = df['last_name']
df_sf['Salutation']        = df['title'].apply(lambda s: s + '.' if s != None else s) 

display(df_sf.head())
display(df_sf.tail())

Unnamed: 0,firstname,middlename,lastname,Salutation
0,hubrepo,,hubrepo,
1,nanoHUB,,support,
2,Grid,,Statistics,
3,NCN,,NCN,
4,nanoHUB,,nanoHUB,


Unnamed: 0,firstname,middlename,lastname,Salutation
267069,Richard,,White,
267070,Tom,,Hanks,
267071,dan,,green,
267072,Rahul,,Tripathi,
267073,Fazy,,khan,


In [44]:
df_sf['nanoHUB_user_ID__c']            = df['user_id']
df_sf['nanoHUB_username__c']           = df['username']
df_sf['Email']                         = df['email'].fillna('').apply(lambda x: '' if '@invalid' in x else x).apply(lambda x: '' if '@' not in x else x)

# for sendEmail: 0 = opt-out, 1 = receive email. For salesforce HasOptedOutOfEmail, it's exact opposite
df_sf['HasOptedOutOfEmail']            = df['sendEmail'].apply(lambda x: 0 if x==1 else 1)
df_sf['nanoHUB_account_BLOCKED__c']    = df['block'].fillna(0)

# solidify time-related columns from datetime to string
df_sf['nanoHUB_registration_date__c']  = df["registerDate"].fillna('').dt.strftime('%Y-%m-%d')
df_sf['nanoHUB_last_active_date__c']   = df["lastvisitDate"].fillna('').dt.strftime('%Y-%m-%d') 

display(df_sf.head())
display(df_sf.tail())

Unnamed: 0,firstname,middlename,lastname,Salutation,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c
0,hubrepo,,hubrepo,,998,hubrepo,nkissebe@gmail.com,1,0,2014-11-13,
1,nanoHUB,,support,,1683,support,support@nanohub.org,1,0,2008-11-19,2008-11-19
2,Grid,,Statistics,,1684,gridstat,gridstat@nanohub.org,1,0,2008-11-18,2020-02-14
3,NCN,,NCN,,1685,ncn,ncn@nanohub.org,1,0,2008-11-11,
4,nanoHUB,,nanoHUB,,1686,nanohub,apps@nanohub.org,1,0,2014-06-26,


Unnamed: 0,firstname,middlename,lastname,Salutation,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c
267069,Richard,,White,,347898,richardwhite33,stasvojtenko2@gmail.com,1,0,2021-12-02,2021-12-02
267070,Tom,,Hanks,,347899,tonhanks22,tonhanks22@gmail.com,1,0,2021-12-02,2021-12-02
267071,dan,,green,,347900,dg88,dangreen1011@gmail.com,1,0,2021-12-02,2021-12-02
267072,Rahul,,Tripathi,,347901,tripatr,tripatr@purdue.edu,1,0,2021-12-02,2021-12-02
267073,Fazy,,khan,,347903,-186875,,1,0,2021-12-02,2021-12-02


In [45]:
# sanity checks
display(df_sf[df_sf['nanoHUB_user_ID__c'].isnull()])
display(df_sf[df_sf['nanoHUB_username__c'].isnull()])
display(df_sf[df_sf['Email'].isnull()])
display(df_sf[df_sf['nanoHUB_registration_date__c'].isnull()])
display(df_sf[df_sf['nanoHUB_last_active_date__c'].isnull()])

Unnamed: 0,firstname,middlename,lastname,Salutation,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c


Unnamed: 0,firstname,middlename,lastname,Salutation,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c


Unnamed: 0,firstname,middlename,lastname,Salutation,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c


Unnamed: 0,firstname,middlename,lastname,Salutation,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c


Unnamed: 0,firstname,middlename,lastname,Salutation,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c
0,hubrepo,,hubrepo,,998,hubrepo,nkissebe@gmail.com,1,0,2014-11-13,
3,NCN,,NCN,,1685,ncn,ncn@nanohub.org,1,0,2008-11-11,
4,nanoHUB,,nanoHUB,,1686,nanohub,apps@nanohub.org,1,0,2014-06-26,
5,Repository,,Manager,,1687,repo,repoman@nanohub.org,1,0,2007-11-13,
6,Violin,,,,1688,violin,dxu@cs.purdue.edu,1,0,2007-08-30,
...,...,...,...,...,...,...,...,...,...,...,...
267049,Uniswap,,Exchange,,347874,uniexchange,exchangeuniswap@gmail.com,1,0,2021-12-02,
267051,Leo,,Tsui,,347877,hcltsui,drleotsui@gmail.com,1,0,2021-12-02,
267052,Leo,,Tsui,,347877,hcltsui,drleotsui@gmail.com,1,0,2021-12-02,
267053,Inwizards,,Software,,347880,inwizards,seo.inwizards@gmail.com,1,0,2021-12-02,


In [46]:
display(get_number_of_registered_users_for_yesterday(df_sf, 'nanoHUB_registration_date__c'))

[INFO] [1175540541 - task_user_basic_updates]: Number of users who registered on (2021-12-01) is: 73 [1175540541.get_number_of_registered_users_for:12]


73

In [47]:
# Tableau detailed view
df_sf['Detailed_user_timeline_to_Tableau__c'] = df_sf['Email'].apply(lambda x: 'https://tableauqa.itap.purdue.edu/views/profile/ProfileTimeline?Id%20Email='+x+'\
&:iframeSizedToWindow=true&:embed=y&:showAppBanner=false\
&:display_count=no&:showVizHome=no#6' if x != '' else '')

# nanoHUB user profile
df_sf['nanoHUB_user_page__c'] = df_sf['nanoHUB_user_ID__c'].apply(lambda x: 'https://nanohub.org/members/%d'%x if x != '' else '')
df_sf['Organization__c'] = df['organization'].fillna('')

df_sf['ORCID__c'] = df['orcid'].fillna('')

# derived information
df_sf['Organization_email_derived__c'] = df['Salesforce_org_ID'].fillna(' ')

sf_original_fields = df_sf.columns

# display
display(df_sf.head())
display(df_sf.tail())


Unnamed: 0,firstname,middlename,lastname,Salutation,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c,Detailed_user_timeline_to_Tableau__c,nanoHUB_user_page__c,Organization__c,ORCID__c,Organization_email_derived__c
0,hubrepo,,hubrepo,,998,hubrepo,nkissebe@gmail.com,1,0,2014-11-13,,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/998,none,,
1,nanoHUB,,support,,1683,support,support@nanohub.org,1,0,2008-11-19,2008-11-19,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/1683,purdue university,,a0r5w00000V42cCAAR
2,Grid,,Statistics,,1684,gridstat,gridstat@nanohub.org,1,0,2008-11-18,2020-02-14,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/1684,purdue university,,a0r5w00000V42cCAAR
3,NCN,,NCN,,1685,ncn,ncn@nanohub.org,1,0,2008-11-11,,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/1685,purdue university,,a0r5w00000V42cCAAR
4,nanoHUB,,nanoHUB,,1686,nanohub,apps@nanohub.org,1,0,2014-06-26,,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/1686,purdue university,,a0r5w00000V42cCAAR


Unnamed: 0,firstname,middlename,lastname,Salutation,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c,Detailed_user_timeline_to_Tableau__c,nanoHUB_user_page__c,Organization__c,ORCID__c,Organization_email_derived__c
267069,Richard,,White,,347898,richardwhite33,stasvojtenko2@gmail.com,1,0,2021-12-02,2021-12-02,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/347898,harvard university,,a0r5w00000V42kEAAR
267070,Tom,,Hanks,,347899,tonhanks22,tonhanks22@gmail.com,1,0,2021-12-02,2021-12-02,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/347899,chicago state university,,a0r5w00000V42iaAAB
267071,dan,,green,,347900,dg88,dangreen1011@gmail.com,1,0,2021-12-02,2021-12-02,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/347900,wright state university,,a0r5w00000V42qYAAR
267072,Rahul,,Tripathi,,347901,tripatr,tripatr@purdue.edu,1,0,2021-12-02,2021-12-02,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/347901,purdue university,,a0r5w00000V42cCAAR
267073,Fazy,,khan,,347903,-186875,,1,0,2021-12-02,2021-12-02,,https://nanohub.org/members/347903,none,,


In [48]:
display(get_number_of_registered_users_for_yesterday(df_sf, 'nanoHUB_registration_date__c'))

[INFO] [1175540541 - task_user_basic_updates]: Number of users who registered on (2021-12-01) is: 73 [1175540541.get_number_of_registered_users_for:12]


73

## To Salesforce Sales Cloud CRM

In [49]:

db_s.object_id = object_id
db_s.external_id = external_id

In [50]:
# send data to Salesforce
db_s.send_data(df_sf)

[Success] Bulk job creation successful. Job ID = 7505w00000bmnHnAAI
hello
[Success] CSV upload successful. Job ID = 7505w00000bmnHnAAI
[Success] Closing job successful. Job ID = 7505w00000bmnHnAAI


In [51]:
# check status
db_s.check_bulk_status()

{'id': '7505w00000bmnHnAAI',
 'operation': 'upsert',
 'object': 'Contact',
 'createdById': '0055w00000DM5bOAAT',
 'createdDate': '2021-12-02T17:02:58.000+0000',
 'systemModstamp': '2021-12-02T17:04:03.000+0000',
 'state': 'UploadComplete',
 'externalIdFieldName': 'nanoHUB_user_ID__c',
 'concurrencyMode': 'Parallel',
 'contentType': 'CSV',
 'apiVersion': 47.0,
 'jobType': 'V2Ingest',
 'lineEnding': 'LF',
 'columnDelimiter': 'COMMA',
 'numberRecordsProcessed': 0,
 'numberRecordsFailed': 0,
 'retries': 0,
 'totalProcessingTime': 0,
 'apiActiveProcessingTime': 0,
 'apexProcessingTime': 0}

In [52]:
# check status
from pprint import pprint

pprint(db_s.check_bulk_failed_results())

''


In [53]:
display(get_number_of_registered_users_for_yesterday(df_sf, 'nanoHUB_registration_date__c'))

[INFO] [1175540541 - task_user_basic_updates]: Number of users who registered on (2021-12-01) is: 73 [1175540541.get_number_of_registered_users_for:12]


73

In [54]:
display(df_sf.loc[df_sf['nanoHUB_user_ID__c'].isin(288227, 272933, 347639)])

Unnamed: 0,firstname,middlename,lastname,Salutation,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c,Detailed_user_timeline_to_Tableau__c,nanoHUB_user_page__c,Organization__c,ORCID__c,Organization_email_derived__c
218249,Anurag,Kumar,Tiwari,Dr.,288227,tiwaria,tiwaria@nitj.ac.in,0,0,2020-05-18,2021-03-17,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/288227,nitjalandhar,,


Unnamed: 0,firstname,middlename,lastname,Salutation,nanoHUB_user_ID__c,nanoHUB_username__c,Email,HasOptedOutOfEmail,nanoHUB_account_BLOCKED__c,nanoHUB_registration_date__c,nanoHUB_last_active_date__c,Detailed_user_timeline_to_Tableau__c,nanoHUB_user_page__c,Organization__c,ORCID__c,Organization_email_derived__c
206494,Dr.,Manoj,Kumar Tiwari,,272933,manojjiin,manojjiin@gmail.com,1,0,2019-12-04,2019-12-04,https://tableauqa.itap.purdue.edu/views/profil...,https://nanohub.org/members/272933,none,,
