# Reclassify user/contact organizations

In [1]:
# API settings
api_url = '/services/data/v43.0/sobjects'
object_id = 'Contact'
external_id = 'nanoHUB_user_ID__c'

# login parameters to be handled by Papermill
from nanoHUB.pipeline.application import Application

application = Application.get_instance()


salesforce = application.new_salesforce_engine()
db_s = salesforce


In [2]:
import pandas as pd
import datetime

## Obtain user org classifications from Salesforce

In [3]:
# get Salesforce ID for organizations
contact_org_df = db_s.query_data('SELECT Id, nanoHUB_user_ID__c, Email, Organization_composite__c,\
Organization_email_derived__c, Organization_Conflict__c, \
Organization__c FROM Contact Order by nanoHUB_user_ID__c DESC limit 5000') # limit 5000

#Organization_citation_derived__c

# display
contact_org_df.head(3)

Obtained Salesforce access token ...... True
[Success] Bulk job creation successful. Job ID = 7505w00000TmKWdAAN
{"id":"7505w00000TmKWdAAN","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2021-01-21T08:30:04.000+0000","systemModstamp":"2021-01-21T08:30:04.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","retries":0,"totalProcessingTime":0}
{"id":"7505w00000TmKWdAAN","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2021-01-21T08:30:04.000+0000","systemModstamp":"2021-01-21T08:30:15.000+0000","state":"JobComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":5000,"retries":0,"totalProcessingTime":5452}
[Success] Bulk job completed successfully.


Unnamed: 0,Email,Id,Organization_Conflict__c,Organization__c,Organization_composite__c,Organization_email_derived__c,nanoHUB_user_ID__c
0,lisagundy@arizona.edu,0035w00003EE0MoAAL,False,,,,
1,asdf1234@invalidaddress5.org,0035w00003DA2NpAAL,False,,,,
2,defnitelynotanemail7193@purdue.edu,0035w00003DA2MmAAL,False,,,,


In [4]:
# pull all the organizations from SF to get their unique tokens
all_org_df = db_s.query_data('SELECT Id, Name, Domain__c FROM Organization__c')

display(all_org_df.head(2))
print(all_org_df.shape)

[Success] Bulk job creation successful. Job ID = 7505w00000TmKINAA3
{"id":"7505w00000TmKINAA3","operation":"query","object":"organization__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-01-21T08:30:16.000+0000","systemModstamp":"2021-01-21T08:30:16.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","retries":0,"totalProcessingTime":0}
{"id":"7505w00000TmKINAA3","operation":"query","object":"organization__c","createdById":"0055w00000DM5bOAAT","createdDate":"2021-01-21T08:30:16.000+0000","systemModstamp":"2021-01-21T08:30:20.000+0000","state":"JobComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":6365,"retries":0,"totalProcessingTime":481}
[Success] Bulk job completed successfully.


Unnamed: 0,Domain__c,Id,Name
0,upenn.edu,a0r5w00000V42c0AAB,university of pennsylvania
1,fer.hr,a0r5w00000V42c1AAB,fer


(6365, 3)


In [5]:
## extract the domain from contact_org_df
user_domains = contact_org_df['Email'].fillna(' ').apply(lambda x: x.split('@')[-1])
display(user_domains.head(2))

0            arizona.edu
1    invalidaddress5.org
Name: Email, dtype: object

In [6]:
## if contact domain is a commercial email domain, then find the index and empty out the entry 
import commercial_domains as cdomains
user_domains = user_domains.apply(lambda x: ' ' if x in cdomains.domains else x)
display(user_domains.head(2))

0            arizona.edu
1    invalidaddress5.org
Name: Email, dtype: object

In [7]:
## finding indexes in contact_org_df and zero out Organization_email_derived__c
indices = [i for i, x in enumerate(user_domains.to_list()) if x == ' ']

contact_org_df['Organization_email_derived__c'][indices] = ' '

## list comprehension method
display(contact_org_df.head(5))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,Email,Id,Organization_Conflict__c,Organization__c,Organization_composite__c,Organization_email_derived__c,nanoHUB_user_ID__c
0,lisagundy@arizona.edu,0035w00003EE0MoAAL,False,,,,
1,asdf1234@invalidaddress5.org,0035w00003DA2NpAAL,False,,,,
2,defnitelynotanemail7193@purdue.edu,0035w00003DA2MmAAL,False,,,,
3,lbaker@kellpartners.com,0035w000034KEj2AAG,False,,,,
4,amy.morgan@northwestern.edu,0035w00003BSD5mAAH,False,,,,


In [8]:
## match/find all_org_df ID values
nh_orgs = contact_org_df['Organization__c'].fillna(' ').apply(lambda x: x.lower())
sf_org_strings = all_org_df['Name'].fillna(' ').apply(lambda x: x.lower()).to_list()
sf_org_ids = all_org_df['Id'].fillna(' ')

In [9]:
# continued from above snippet
indices = [i for i, x in enumerate(nh_orgs.to_list()) if x != ' ']

#check if organization already has a SF entry
for i in indices:
    if nh_orgs[i] in sf_org_strings: #if exact match
        t_index = sf_org_strings.index(nh_orgs[i]) #pull exact id
        nh_orgs[i] = sf_org_ids[t_index]
    else:
        nh_orgs[i] = ' '
        
#    else:
#        #calculate the damerau levenshtein distance
#        min_dl_dist,org_index = dl_dist(nh_orgs[i],sf_org_strings)
#        if min_dl_dist <= 4: #fit the org to the org_index 
#            t_index = org_index
#            nh_orgs[i] = sf_org_ids[t_index]
        ## create new entry is involved... create a new DF, populate with new org entries, send to SF
        ## pull SF assigned new org ID back, then repopulate the contacts SF DF with new org ID only for those new orgs 
        #else: #create new entry
        #    t_index = org_index

In [10]:
display(nh_orgs.head(3))

0     
1     
2     
Name: Organization__c, dtype: object

In [11]:
## Compare and readjust the organization_email_derived__c based on email domains
email_orgs = contact_org_df['Organization_email_derived__c']
emails = contact_org_df['Email'].fillna(' ').apply(lambda x: x.split('@')[-1])

In [12]:
## all sf domains
sf_domains = all_org_df['Domain__c'].to_list()

In [13]:
display(contact_org_df.head(2))
display(contact_org_df.tail(2))

Unnamed: 0,Email,Id,Organization_Conflict__c,Organization__c,Organization_composite__c,Organization_email_derived__c,nanoHUB_user_ID__c
0,lisagundy@arizona.edu,0035w00003EE0MoAAL,False,,,,
1,asdf1234@invalidaddress5.org,0035w00003DA2NpAAL,False,,,,


Unnamed: 0,Email,Id,Organization_Conflict__c,Organization__c,Organization_composite__c,Organization_email_derived__c,nanoHUB_user_ID__c
4998,wendy.barnard@asu.edu,0035w00003HVpRPAA1,False,,a0r5w00000V42jDAAR,a0r5w00000V42jDAAR,307095.0
4999,eliaskaroui@gmail.com,0035w00003HVpuvAAD,False,,,,307093.0


In [14]:
## verify that the email_org entry corresponds to the correct organizational domain
for i,j in enumerate(emails.to_list()):
    ## apply fixes for mail.usf.edu to usf.edu - i.e., use the simplest derivative of the domains    
    j_test = j.split('.')
    if len(j_test) >= 3:
        j = '.'.join(j_test[-2:])
    
    if j in sf_domains:
        # pull the corresponding org Id
        j_org = all_org_df['Id'][sf_domains.index(j)]
        contact_org_df['Organization_email_derived__c'][i] = j_org
        
display(contact_org_df.head(2))
display(contact_org_df.tail(2))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,Email,Id,Organization_Conflict__c,Organization__c,Organization_composite__c,Organization_email_derived__c,nanoHUB_user_ID__c
0,lisagundy@arizona.edu,0035w00003EE0MoAAL,False,,,a0r5w00000V433WAAR,
1,asdf1234@invalidaddress5.org,0035w00003DA2NpAAL,False,,,,


Unnamed: 0,Email,Id,Organization_Conflict__c,Organization__c,Organization_composite__c,Organization_email_derived__c,nanoHUB_user_ID__c
4998,wendy.barnard@asu.edu,0035w00003HVpRPAA1,False,,a0r5w00000V42jDAAR,a0r5w00000V42jDAAR,307095.0
4999,eliaskaroui@gmail.com,0035w00003HVpuvAAD,False,,,,307093.0


In [15]:
## Compare the SF indexes of NH determined organization against that of Organization_email_derived__c
email_orgs = contact_org_df['Organization_email_derived__c']

for i,j in enumerate(nh_orgs.to_list()):
    if j != ' ' and email_orgs[i] == ' ':
        # nanohub org overwrites all other org flags
        contact_org_df['Organization_composite__c'][i] = j
    elif j != ' ' and j != email_orgs[i]:
        contact_org_df['Organization_Conflict__c'][i] = True
        contact_org_df['Organization_composite__c'][i] = j

    elif j != ' ' and j == email_orgs[i]:
        contact_org_df['Organization_Conflict__c'][i] = False
        contact_org_df['Organization_composite__c'][i] = j

        
display(contact_org_df.head(4))
display(contact_org_df.tail(4))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://

Unnamed: 0,Email,Id,Organization_Conflict__c,Organization__c,Organization_composite__c,Organization_email_derived__c,nanoHUB_user_ID__c
0,lisagundy@arizona.edu,0035w00003EE0MoAAL,False,,,a0r5w00000V433WAAR,
1,asdf1234@invalidaddress5.org,0035w00003DA2NpAAL,False,,,,
2,defnitelynotanemail7193@purdue.edu,0035w00003DA2MmAAL,False,,,a0r5w00000V42cCAAR,
3,lbaker@kellpartners.com,0035w000034KEj2AAG,False,,,,


Unnamed: 0,Email,Id,Organization_Conflict__c,Organization__c,Organization_composite__c,Organization_email_derived__c,nanoHUB_user_ID__c
4996,subrata.niser@gmail.com,0035w00003HVpuxAAD,False,,,,307097.0
4997,,0035w00003HVpuwAAD,False,,,,307096.0
4998,wendy.barnard@asu.edu,0035w00003HVpRPAA1,False,,a0r5w00000V42jDAAR,a0r5w00000V42jDAAR,307095.0
4999,eliaskaroui@gmail.com,0035w00003HVpuvAAD,False,,,,307093.0


In [16]:
contact_org_df = contact_org_df.drop(columns='Id')

In [17]:
display(contact_org_df.head(2))

Unnamed: 0,Email,Organization_Conflict__c,Organization__c,Organization_composite__c,Organization_email_derived__c,nanoHUB_user_ID__c
0,lisagundy@arizona.edu,False,,,a0r5w00000V433WAAR,
1,asdf1234@invalidaddress5.org,False,,,,


In [18]:
## fixing commercial email hosts - zero out both Organization_composite__c and Organization_email_derived__c  
indices = [i for i, x in enumerate(user_domains.to_list()) if x == ' ']

#contact_org_df['Organization_composite__c'][indices] = ' '
contact_org_df['Organization_email_derived__c'][indices] = ' '


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [19]:
## all_org_df - find all commercial names that correspond to a commerical email domain
nh_domains = all_org_df['Domain__c'].to_list()
nh_cdi = [] #cdi = commercial domain indexes

for i,j in enumerate(nh_domains):
    if j in cdomains.domains:
        nh_cdi.append(i)


In [20]:
nh_c_all = all_org_df.iloc[nh_cdi,:]

display(nh_c_all.head(2))
nh_c_all = nh_c_all['Id'].to_list()

Unnamed: 0,Domain__c,Id,Name
13,yahoo.com,a0r5w00000V42iXAAR,yahoo
24,aol.com,a0r5w00000V42iiAAB,aol


In [21]:
## go through every single composite organization, and ensure that they are all non-commercial
composite_org_verify = contact_org_df['Organization_composite__c'][indices].to_list()

for i,j in enumerate(composite_org_verify):
    if j in nh_c_all:
        contact_org_df.iloc[i,3] = ' '


In [22]:
display(contact_org_df.head(100))

Unnamed: 0,Email,Organization_Conflict__c,Organization__c,Organization_composite__c,Organization_email_derived__c,nanoHUB_user_ID__c
0,lisagundy@arizona.edu,False,,,a0r5w00000V433WAAR,
1,asdf1234@invalidaddress5.org,False,,,,
2,defnitelynotanemail7193@purdue.edu,False,,,a0r5w00000V42cCAAR,
3,lbaker@kellpartners.com,False,,,,
4,amy.morgan@northwestern.edu,False,,,a0r5w00000V42ikAAB,
...,...,...,...,...,...,...
95,nada_kh98@hotmail.com,False,,,,313344.0
96,maldacena1984@gmail.com,False,,,,313342.0
97,sssd.john@gmail.com,False,,,,313341.0
98,,False,,,,313340.0


## To Salesforce Sales Cloud CRM

In [23]:

# create DB2 to Salesforce API object
db_s = salesforce

# specify Salesforce object ID and external ID
db_s.object_id = object_id
db_s.external_id = external_id

Obtained Salesforce access token ...... True


In [24]:
# send data to Salesforce
db_s.send_data(contact_org_df)

[Success] Bulk job creation successful. Job ID = 7505w00000TmKAWAA3
hello
[Success] CSV upload successful. Job ID = 7505w00000TmKAWAA3
[Success] Closing job successful. Job ID = 7505w00000TmKAWAA3


In [25]:
# check status
db_s.check_bulk_status()

{'id': '7505w00000TmKAWAA3',
 'operation': 'upsert',
 'object': 'Contact',
 'createdById': '0055w00000DM5bOAAT',
 'createdDate': '2021-01-21T08:30:30.000+0000',
 'systemModstamp': '2021-01-21T08:30:31.000+0000',
 'state': 'InProgress',
 'externalIdFieldName': 'nanoHUB_user_ID__c',
 'concurrencyMode': 'Parallel',
 'contentType': 'CSV',
 'apiVersion': 47.0,
 'jobType': 'V2Ingest',
 'lineEnding': 'LF',
 'columnDelimiter': 'COMMA',
 'numberRecordsProcessed': 0,
 'numberRecordsFailed': 0,
 'retries': 0,
 'totalProcessingTime': 0,
 'apiActiveProcessingTime': 0,
 'apexProcessingTime': 0}

In [26]:
from pprint import pprint
pprint(db_s.check_bulk_failed_results())

''
