# DB2-Salesforce connector: Organization information updates

In [1]:
# API settings
api_url = '/services/data/v43.0/sobjects'
object_id = 'organization__c'
external_id = 'Domain__c'

from nanoHUB.application import Application

application = Application.get_instance()
nanohub_db = application.new_db_engine('nanohub')
wang159_myrmekes_db = application.new_db_engine('wang159_myrmekes')

salesforce = application.new_salesforce_engine()
db_s = salesforce



[1mnanoHUB - Serving Students, Researchers & Instructors[0m


Obtained Salesforce access token ...... True


In [2]:
import sys
#sys.path.append('/home/users/wang2506/nanohub_salesforce_integ/salesforce')

import pandas as pd
import datetime

## Obtain organization information from DB2 

In [3]:
# isolate data
i_char_df = pd.read_sql_query('select * from institution_characteristics', wang159_myrmekes_db)
i_msi_df = pd.read_sql_query('select * from MSI_status',wang159_myrmekes_db)
i_cls_df = pd.read_sql_query('select * from institution_classification', wang159_myrmekes_db)
i_cls_label_df = pd.read_sql_query('select * from institution_classification_labels', wang159_myrmekes_db)
i_cls_var_df = pd.read_sql_query('select * from institution_variable_labels', wang159_myrmekes_db)
usn_ranks_df = pd.read_sql_query('select * from us_news_rankings',wang159_myrmekes_db)

## Getting institution domain information

In [4]:
# get domain address from URL
import re

def get_domain(this_domain_name):

    # Replace all non-alphanumeric characters with space
    this_domain_name = re.sub("[^0-9a-zA-Z-]+", ".", this_domain_name.lower())
    name_list = this_domain_name.split('.')

    # index of 'edu'
    try:
        edu_index = name_list.index('edu')
    except:
        return None
    
    if edu_index > 0:
        return name_list[edu_index-1]+'.edu'
    else:
        return None


unitid_url_df = i_char_df[['WEBADDR', 'UNITID']].copy()

unitid_url_df.loc[:, 'domain'] = unitid_url_df.WEBADDR.apply(get_domain)

# drop NaN rows
unitid_url_df.dropna(inplace=True)

In [5]:
display(unitid_url_df.head(1))
display(usn_ranks_df.head(1))

Unnamed: 0,WEBADDR,UNITID,domain
0,www.aamu.edu/,100654,aamu.edu


Unnamed: 0,index,UNITID,INSTNM,WEBADDR,usn_gr_ae,usn_gr_EE,chem_gr_anal,chem_gr_inorg,chem_gr_phy,chem_gr_bio,...,usn_gr_me,usn_gr_ce,usn_ug_eng_w_doct,usn_ug_EE_no_doct,usn_ug_EE_w_doct,phys_gr_con,usn_gr_eng,usn_natl_publ,usn_natl,Unnamed: 33
0,0,186131,Princeton University,www.princeton.edu,9.0,9.0,,13.0,11.0,,...,9.0,8.0,12.0,,13.0,8.0,21,,1,


In [6]:
## filtering institutional classification dataframe for 
## i_cls_df -> keep all because i_cls_labels_df may need them

usn_ranks_df = usn_ranks_df.drop(columns=['INSTNM','WEBADDR'])

In [7]:
# join domain into institution classification table by unitid
df = pd.merge(i_cls_df, unitid_url_df, how='inner', left_on='UNITID', right_on='UNITID')

# display
df.head(1)

Unnamed: 0,index,UNITID,NAME,CITY,STABBR,CC2000,BASIC2005,BASIC2010,BASIC2015,BASIC2018,...,SATV25,SATM25,SATCMB25,SATACTEQ25,ACTCMP25,ACTFINAL,Unnamed95,Unnamed96,WEBADDR,domain
0,0,177834,A T Still University of Health Sciences,Kirksville,MO,52,25,25,25,25,...,0.0,0.0,0.0,0.0,0.0,0.0,,,WWW.ATSU.EDU,atsu.edu


In [8]:
## factoring in information from us news and world report
#cols_to_use = ['UNITID','NAME','CITY','STABBR','BASIC2018','domain'] t_df = t_df.drop(columns=['index'])
t_df = pd.merge(df,usn_ranks_df,left_on='UNITID',right_on='UNITID') 

display(t_df.head(1))

Unnamed: 0,index_x,UNITID,NAME,CITY,STABBR,CC2000,BASIC2005,BASIC2010,BASIC2015,BASIC2018,...,usn_gr_me,usn_gr_ce,usn_ug_eng_w_doct,usn_ug_EE_no_doct,usn_ug_EE_w_doct,phys_gr_con,usn_gr_eng,usn_natl_publ,usn_natl,Unnamed: 33
0,0,177834,A T Still University of Health Sciences,Kirksville,MO,52,25,25,25,25,...,,,,,,,,,,


In [9]:
display(i_cls_label_df.head(1))
display(i_cls_label_df.groupby(by=['Variable']))
display(i_cls_label_df['Variable'].head(600))

Unnamed: 0,index,Variable,Value,Label
0,0,CC2000,-3.0,"Not classified, not in classification universe"


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ffa683a7b80>

0          CC2000
1          CC2000
2          CC2000
3          CC2000
4          CC2000
          ...    
265    DOCRESFLAG
266    DOCRESFLAG
267        ACTCAT
268        ACTCAT
269        ACTCAT
Name: Variable, Length: 270, dtype: object

In [10]:
def replace_with_label(this_df, c_df):
    #print(this_df.head(2))
    #print(c_df.head(2))
    
    #this_df = this_df['Variable'].to_frame()
    #print(this_df.head(2))
    
    this_var = this_df.Variable.iloc[0].strip()
    #this_var = this_df.iloc[0].strip()
    this_v_l = this_df[['Value', 'Label']]
    
    this_label = pd.merge(c_df[[this_var]], this_v_l, how='left', left_on=this_var, right_on='Value')
    c_df.loc[:, this_var] = this_label['Label']
    
    
i_cls_label_df.groupby(by=['Variable']).apply(lambda x: replace_with_label(x, t_df))

In [11]:
# display
i_cls_label_df.head(2)

Unnamed: 0,index,Variable,Value,Label
0,0,CC2000,-3.0,"Not classified, not in classification universe"
1,1,CC2000,15.0,Doctoral/Research Universities—Extensive


In [12]:
# On Salesforce side, domain is unique indentifier of each organization.
# For multiple institutions with same domain (univ. with satellite campuses), use entry with highest total enrollment (FALLENR17)

unique_domain_df = t_df.sort_values(by='FALLENR17', ascending=False).groupby('domain').head(1)

In [13]:
display(unique_domain_df.head(2))
print(unique_domain_df.loc[:,'NAME'])

Unnamed: 0,index_x,UNITID,NAME,CITY,STABBR,CC2000,BASIC2005,BASIC2010,BASIC2015,BASIC2018,...,usn_gr_me,usn_gr_ce,usn_ug_eng_w_doct,usn_ug_EE_no_doct,usn_ug_EE_w_doct,phys_gr_con,usn_gr_eng,usn_natl_publ,usn_natl,Unnamed: 33
3687,3849,484613,University of Phoenix-Arizona,Tempe,AZ,"Not classified, not in classification universe","Not classified, not in classification universe",,Doctoral Universities: Moderate Research Activity,Doctoral/Professional Universities,...,,,,,,,,,,
3996,4171,433387,Western Governors University,Salt Lake City,UT,"Not classified, not in classification universe",,Master's L: Master's Colleges and Universities...,Master's Colleges & Universities: Larger Programs,Master's Colleges & Universities: Larger Programs,...,,,,,,,,,,


3687                        University of Phoenix-Arizona
3996                         Western Governors University
3095                    Southern New Hampshire University
1399                              Grand Canyon University
1653                           Ivy Tech Community College
                              ...                        
501                          California Christian College
2160    Monteclaro Escuela de Hoteleria y Artes Culina...
1881                Los Angeles Academy of Figurative Art
3411    Theological Seminary of the Reformed Episcopal...
1451                         Harrington College of Design
Name: NAME, Length: 3342, dtype: object


In [14]:
## adding the MSI breakdown information
i_msi_df = i_msi_df.iloc[1:,1:3]

# split i_msi into columns with Yes or No indicators
# have MSI, HSI, HCBU, PBI, TCU, NASNTI, ANNH, AANAPISI
i_msi_df['MSI'] = 'Yes'
i_msi_df['TCU'] = 'No'
i_msi_df['AANAPISI'] = 'No'
i_msi_df['HBCU'] = 'No'
i_msi_df['ANNH'] = 'No'
i_msi_df['NASNTI'] = 'No'
i_msi_df['PBI'] = 'No'
i_msi_df['HSI'] = 'No'

i_msi_df['MSI2'] = i_msi_df['2019 List of Minority Serving Institutions (MSIs)']
i_msi_df = i_msi_df.drop(columns=['2019 List of Minority Serving Institutions (MSIs)'])

i_msi_df['Name'] = i_msi_df['Unnamed: 1']
i_msi_df = i_msi_df.drop(columns=['Unnamed: 1'])


display(i_msi_df.head(1))


Unnamed: 0,MSI,TCU,AANAPISI,HBCU,ANNH,NASNTI,PBI,HSI,MSI2,Name
1,Yes,No,No,No,No,No,No,No,TCU,Aaniiih Nakoda College


In [15]:
print(i_msi_df.iloc[682,:]) #685
print(i_msi_df.iloc[0,:]) #3

MSI                                      Yes
TCU                                       No
AANAPISI                                  No
HBCU                                      No
ANNH                                      No
NASNTI                                    No
PBI                                       No
HSI                                       No
MSI2                          AANAPISI & HSI
Name        University of California, Merced
Name: 683, dtype: object
MSI                            Yes
TCU                             No
AANAPISI                        No
HBCU                            No
ANNH                            No
NASNTI                          No
PBI                             No
HSI                             No
MSI2                           TCU
Name        Aaniiih Nakoda College
Name: 1, dtype: object


In [16]:
## extract msi subcategories
msi_cats = i_msi_df['MSI2'].apply(lambda x: x.split()).apply(lambda x: [i for i in x if i != '&'])

for index,vals in enumerate(msi_cats):
    for item in vals:
        i_msi_df.loc[index+1,item] = 'Yes'

In [17]:
msi_df = i_msi_df.drop(columns=['MSI2'])
display(msi_df.tail(2))

Unnamed: 0,MSI,TCU,AANAPISI,HBCU,ANNH,NASNTI,PBI,HSI,Name
773,Yes,No,No,No,No,No,No,Yes,Yakima Valley College
774,Yes,No,Yes,No,No,No,No,Yes,Yuba College


In [18]:
# regex determination of institutional names
## re-arrange names for both unique_domain_df and msi_df
import re
pattern = re.compile(r'\w+')

## msi_df
for index,item in enumerate(msi_df['Name']):
    msi_df['Name'][index+1] = ' '.join(pattern.findall(item))

## unique domain df
for item in unique_domain_df.index.values.tolist():
    unique_domain_df['NAME'][item] = ' '.join(pattern.findall(unique_domain_df['NAME'][item]))

    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_domain_df['NAME'][item] = ' '.join(pattern.findall(unique_domain_df['NAME'][item]))


In [19]:
unique_domain_df.head(2)

Unnamed: 0,index_x,UNITID,NAME,CITY,STABBR,CC2000,BASIC2005,BASIC2010,BASIC2015,BASIC2018,...,usn_gr_me,usn_gr_ce,usn_ug_eng_w_doct,usn_ug_EE_no_doct,usn_ug_EE_w_doct,phys_gr_con,usn_gr_eng,usn_natl_publ,usn_natl,Unnamed: 33
3687,3849,484613,University of Phoenix Arizona,Tempe,AZ,"Not classified, not in classification universe","Not classified, not in classification universe",,Doctoral Universities: Moderate Research Activity,Doctoral/Professional Universities,...,,,,,,,,,,
3996,4171,433387,Western Governors University,Salt Lake City,UT,"Not classified, not in classification universe",,Master's L: Master's Colleges and Universities...,Master's Colleges & Universities: Larger Programs,Master's Colleges & Universities: Larger Programs,...,,,,,,,,,,


In [20]:
print(msi_df.iloc[257,:])
print(msi_df.iloc[682,:])

MSI                                                Yes
TCU                                                 No
AANAPISI                                            No
HBCU                                               Yes
ANNH                                                No
NASNTI                                              No
PBI                                                 No
HSI                                                 No
Name        Florida Agricultural Mechanical University
Name: 258, dtype: object
MSI                                     Yes
TCU                                      No
AANAPISI                                Yes
HBCU                                     No
ANNH                                     No
NASNTI                                   No
PBI                                      No
HSI                                     Yes
Name        University of California Merced
Name: 683, dtype: object


In [21]:
# add msi information onto the unique_domain_df
unique_domain_df = unique_domain_df.drop(columns=['HBCU','MSI','HSI'])

unique_domain_df = unique_domain_df.set_index('NAME').join(msi_df.set_index('Name'))
### merge behaves weirdly, used join instead
#unique_domain_df = unique_domain_df.merge(msi_df,left_on='NAME',right_on='Name')

unique_domain_df['NAME'] = unique_domain_df.index.values
display(unique_domain_df.head(2))

Unnamed: 0,index_x,UNITID,CITY,STABBR,CC2000,BASIC2005,BASIC2010,BASIC2015,BASIC2018,IPUG2018,...,Unnamed: 33,MSI,TCU,AANAPISI,HBCU,ANNH,NASNTI,PBI,HSI,NAME
A T Still University of Health Sciences,0,177834,Kirksville,MO,Specialized Institutions—Medical schools and m...,,Spec/Med: Special Focus Institutions--Medical ...,Special Focus Four-Year: Medical Schools & Cen...,Special Focus Four-Year: Medical Schools & Cen...,Not Classified (Exclusively Graduate Programs),...,,,,,,,,,,A T Still University of Health Sciences
AOMA Graduate School of Integrative Medicine,152,429094,Austin,TX,Specialized Institutions—Other separate health...,,Spec/Health: Special Focus Institutions--Other...,Special Focus Four-Year: Other Health Professi...,Special Focus Four-Year: Other Health Professi...,Not Classified (Exclusively Graduate Programs),...,,,,,,,,,,AOMA Graduate School of Integrative Medicine


In [22]:
unique_domain_df.loc['University of California Merced',:]

index_x                                              3714
UNITID                                             445188
CITY                                               Merced
STABBR                                                 CA
CC2000     Not classified, not in classification universe
                                ...                      
ANNH                                                   No
NASNTI                                                 No
PBI                                                    No
HSI                                                   Yes
NAME                      University of California Merced
Name: University of California Merced, Length: 137, dtype: object

## Obtain organization metadata from Salesforce

In [23]:

# specify Salesforce object ID and external ID
org_metadata = db_s.get_obj_metadata('Organization__c')

In [24]:
import json

# get all fields
field_list = json.loads(org_metadata)['fields']

# list of available field API names
available_fields = list()

for this_field in field_list:
    this_name = this_field['name']
    
    if len(this_name) > 3:
        if this_name[-3:] == '__c':
            # this is a custom-made field
            available_fields.append(this_name[:-3])

# display
display(available_fields)

['Domain',
 'Country',
 'Industry',
 'Size_Range',
 'MSI',
 'is_US_institution',
 'HBCU',
 'TCU',
 'BASIC2018',
 'usn_gr_eng',
 'usn_gr_EE',
 'usn_gr_mse',
 'usn_gr_ce',
 'usn_gr_me',
 'usn_gr_ae',
 'usn_gr_IE',
 'chem_gr_anal',
 'chem_gr_inorg',
 'chem_gr_phy',
 'chem_gr_bio',
 'chem_gr_org',
 'chem_gr_theo',
 'phys_gr_atom',
 'phys_gr_con',
 'phys_gr_quan',
 'usn_ug_eng_w_doct',
 'usn_ug_eng_no_doctorate',
 'usn_ug_EE_w_doct',
 'usn_ug_EE_no_doct',
 'usn_ug_CE_w_doct',
 'usn_ug_CE_no_doct',
 'usn_ug_ME_w_doct',
 'usn_ug_ME_no_doct',
 'usn_ug_ChE_w_doct',
 'usn_ug_ChE_no_doct',
 'usn_ug_IE_w_doct',
 'usn_ug_IE_no_doct',
 'usn_ug_MSE_w_doct',
 'usn_natl',
 'usn_natl_publ',
 'usn_ug_MSE_no_doct',
 'ANNH',
 'AANAPISI',
 'PBI',
 'HSI',
 'NASNTI',
 'Max_Cluster_Size',
 'Median_Cluster_Size',
 'Avg_Cluster_Size',
 'Avg_Class_Length',
 'Median_Class_Length',
 'Total_Clustered_Users',
 'Avg_Week_Classes',
 'Max_Week_Classes',
 'Avg_Week_Users',
 'Max_Week_Users',
 'Avg_Sem_Classes',
 'Max_Sem

## Match data with Salesforce format

In [25]:
# find columns that have matched fields in Salesforce
matched_columns = set(unique_domain_df.columns).intersection(set(available_fields))

# display
display(matched_columns)

{'AANAPISI',
 'ANNH',
 'BASIC2018',
 'HBCU',
 'HSI',
 'MSI',
 'NASNTI',
 'PBI',
 'TCU',
 'chem_gr_anal',
 'chem_gr_bio',
 'chem_gr_inorg',
 'chem_gr_org',
 'chem_gr_phy',
 'chem_gr_theo',
 'phys_gr_atom',
 'phys_gr_con',
 'phys_gr_quan',
 'usn_gr_EE',
 'usn_gr_IE',
 'usn_gr_ae',
 'usn_gr_ce',
 'usn_gr_eng',
 'usn_gr_me',
 'usn_gr_mse',
 'usn_natl',
 'usn_natl_publ',
 'usn_ug_CE_w_doct',
 'usn_ug_ChE_no_doct',
 'usn_ug_ChE_w_doct',
 'usn_ug_EE_no_doct',
 'usn_ug_EE_w_doct',
 'usn_ug_IE_no_doct',
 'usn_ug_IE_w_doct',
 'usn_ug_ME_no_doct',
 'usn_ug_ME_w_doct',
 'usn_ug_MSE_w_doct',
 'usn_ug_eng_no_doctorate',
 'usn_ug_eng_w_doct'}

In [26]:
df_sf = pd.DataFrame()

# Make sure NaN and NaT values are taken care of here

df_sf['Domain__c'] = unique_domain_df['domain']

df_sf['is_US_institution__c'] = True
df_sf['Name'] = unique_domain_df['NAME']

# Transfer all matched columns to df_sf
for this_col in matched_columns:
    df_sf[this_col+'__c'] = unique_domain_df[this_col]

sf_original_fields = df_sf.columns

# display
df_sf.head(2).T

Unnamed: 0,A T Still University of Health Sciences,AOMA Graduate School of Integrative Medicine
Domain__c,atsu.edu,aoma.edu
is_US_institution__c,True,True
Name,A T Still University of Health Sciences,AOMA Graduate School of Integrative Medicine
usn_gr_ae__c,,
usn_gr_eng__c,,
usn_natl_publ__c,,
usn_ug_CE_w_doct__c,,
usn_ug_EE_w_doct__c,,
usn_gr_ce__c,,
usn_ug_ChE_w_doct__c,,


## To Salesforce Sales Cloud CRM

In [27]:
db_s = salesforce

# specify Salesforce object ID and external ID
db_s.object_id = object_id
db_s.external_id = external_id

In [28]:
display(df_sf.head(2))

Unnamed: 0,Domain__c,is_US_institution__c,Name,usn_gr_ae__c,usn_gr_eng__c,usn_natl_publ__c,usn_ug_CE_w_doct__c,usn_ug_EE_w_doct__c,usn_gr_ce__c,usn_ug_ChE_w_doct__c,...,AANAPISI__c,chem_gr_theo__c,usn_gr_EE__c,PBI__c,chem_gr_phy__c,usn_ug_ChE_no_doct__c,chem_gr_inorg__c,usn_ug_eng_w_doct__c,usn_gr_me__c,phys_gr_con__c
A T Still University of Health Sciences,atsu.edu,True,A T Still University of Health Sciences,,,,,,,,...,,,,,,,,,,
AOMA Graduate School of Integrative Medicine,aoma.edu,True,AOMA Graduate School of Integrative Medicine,,,,,,,,...,,,,,,,,,,


In [29]:
# send data to Salesforce
db_s.send_data(df_sf)

[Success] Bulk job creation successful. Job ID = 7505w00000b2D4mAAE
hello


[Success] CSV upload successful. Job ID = 7505w00000b2D4mAAE


[Success] Closing job successful. Job ID = 7505w00000b2D4mAAE


In [30]:
# check status
db_s.check_bulk_status()

{'id': '7505w00000b2D4mAAE',
 'operation': 'upsert',
 'object': 'organization__c',
 'createdById': '0055w00000DM5bOAAT',
 'createdDate': '2021-10-25T11:16:23.000+0000',
 'systemModstamp': '2021-10-25T11:16:25.000+0000',
 'state': 'InProgress',
 'externalIdFieldName': 'Domain__c',
 'concurrencyMode': 'Parallel',
 'contentType': 'CSV',
 'apiVersion': 47.0,
 'jobType': 'V2Ingest',
 'lineEnding': 'LF',
 'columnDelimiter': 'COMMA',
 'numberRecordsProcessed': 0,
 'numberRecordsFailed': 0,
 'retries': 0,
 'totalProcessingTime': 0,
 'apiActiveProcessingTime': 0,
 'apexProcessingTime': 0}

In [31]:
from pprint import pprint
pprint(db_s.check_bulk_failed_results())

''
