# DB2-Salesforce connector: Organization information updates

In [1]:
# API settings
# NOTE(review): api_url is not referenced by any cell visible in this notebook — confirm it is still needed.
api_url = '/services/data/v43.0/sobjects'
# Salesforce object name; assigned to db_s.object_id in the upload section
object_id = 'organization__c'
# Field assigned to db_s.external_id in the upload section
external_id = 'Domain__c'

# login parameters to be handled by Papermill
# Both default to None here. sql_login_params is later indexed with 'username' and
# 'password', and sf_login_params is passed to DB2SalesforceAPI, so both have to be
# overridden before the cells below can run.
sf_login_params = None 
sql_login_params = None

In [3]:
import sys
# NOTE(review): lib_dir is not assigned in any cell shown here; presumably it is
# injected by Papermill together with the login parameters — confirm, otherwise
# this line raises NameError on a fresh kernel.
sys.path.append(lib_dir)

import pandas as pd
import datetime

## Obtain institution information from DB2

In [4]:
# connect with DB2
import sqlalchemy as sql
from urllib.parse import quote

# Percent-encode the credentials before placing them in the URL: characters such
# as '@', '/', ':' or '%' in a username or password would otherwise be read as
# URL delimiters and the connection string would be parsed incorrectly.
# str() keeps the previous '%s' behaviour for non-string values.
engine = sql.create_engine(
    'mysql+pymysql://%s:%s@127.0.0.1/wang159_myrmekes'
    % (
        quote(str(sql_login_params['username']), safe=''),
        quote(str(sql_login_params['password']), safe=''),
    )
)

In [5]:
# Load each institution table in full through the DB2 engine, one DataFrame per table.
i_char_df = pd.read_sql_query(sql='select * from institution_characteristics', con=engine)
i_cls_df = pd.read_sql_query(sql='select * from institution_classification', con=engine)
i_cls_label_df = pd.read_sql_query(sql='select * from institution_classification_labels', con=engine)
i_cls_var_df = pd.read_sql_query(sql='select * from institution_variable_labels', con=engine)

In [6]:
# get domain address from URL
import re

def get_domain(this_domain_name):
    """Return the '<name>.edu' domain contained in a web address, or None.

    The address is lower-cased, every run of characters other than ASCII
    letters, digits and '-' is collapsed into a single '.', and the result is
    split on '.'.  The label directly in front of the first 'edu' label is
    returned with '.edu' appended.

    Parameters
    ----------
    this_domain_name : str
        Web address such as 'www.aamu.edu/'.  Non-string input (None, NaN)
        yields None instead of raising.

    Returns
    -------
    str or None
        For example 'aamu.edu'.  None when the address has no 'edu' label or
        when nothing usable precedes it.
    """
    # Missing addresses arrive as None/NaN and have no .lower()
    if not isinstance(this_domain_name, str):
        return None

    # Collapse every run of characters outside [0-9a-zA-Z-] into a single '.'
    this_domain_name = re.sub("[^0-9a-zA-Z-]+", ".", this_domain_name.lower())
    name_list = this_domain_name.split('.')

    # index of the first 'edu' label; list.index raises ValueError when absent
    try:
        edu_index = name_list.index('edu')
    except ValueError:
        return None

    # Require a non-empty label in front of 'edu' ('.edu' alone is not a domain)
    if edu_index > 0 and name_list[edu_index - 1]:
        return name_list[edu_index - 1] + '.edu'
    return None

    

# Pair each UNITID with the .edu domain parsed from its web address, then keep
# only the rows in which every field (including the parsed domain) is present.
unitid_url_df = (
    i_char_df[['WEBADDR', 'UNITID']]
    .assign(domain=lambda addr_df: addr_df['WEBADDR'].apply(get_domain))
    .dropna()
)

In [7]:
# display: preview the first three rows to check the parsed domains
unitid_url_df.head(3)

Unnamed: 0,WEBADDR,UNITID,domain
0,www.aamu.edu/,100654,aamu.edu
1,www.uab.edu,100663,uab.edu
2,www.amridgeuniversity.edu,100690,amridgeuniversity.edu


In [8]:
# Attach WEBADDR and domain to every classification row that shares a UNITID;
# the inner join keeps only institutions present in both tables.
df = i_cls_df.merge(
    right=unitid_url_df,
    how='inner',
    left_on='UNITID',
    right_on='UNITID',
)

# Preview the joined table
df.head(3)

Unnamed: 0,index,UNITID,NAME,CITY,STABBR,CC2000,BASIC2005,BASIC2010,BASIC2015,BASIC2018,...,SATV25,SATM25,SATCMB25,SATACTEQ25,ACTCMP25,ACTFINAL,Unnamed95,Unnamed96,WEBADDR,domain
0,0,177834,A T Still University of Health Sciences,Kirksville,MO,52,25,25,25,25,...,0.0,0.0,0.0,0.0,0.0,0.0,,,WWW.ATSU.EDU,atsu.edu
1,1,180203,Aaniiih Nakoda College,Harlem,MT,60,33,33,33,33,...,0.0,0.0,0.0,0.0,0.0,0.0,,,www.ancollege.edu,ancollege.edu
2,2,222178,Abilene Christian University,Abilene,TX,21,19,18,18,18,...,510.0,515.0,1025.0,19.0,21.0,20.054054,,,www.acu.edu,acu.edu


In [11]:
def replace_with_label(this_df, c_df):
    """Replace the coded values of one column of ``c_df`` with their labels, in place.

    Parameters
    ----------
    this_df : pandas.DataFrame
        Label rows for a single variable, with columns 'Variable', 'Value' and
        'Label'.  The target column name is taken from the first 'Variable'
        entry, stripped of surrounding whitespace.
    c_df : pandas.DataFrame
        Frame holding the coded column.  It is modified in place; values that
        have no matching 'Value' become NaN.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``c_df`` has no column named after the variable.

    Notes
    -----
    Not idempotent: a second call for the same variable looks the labels up as
    if they were codes.
    """
    this_var = this_df.Variable.iloc[0].strip()
    # Keep one label per code; a repeated Value would duplicate rows in the merge
    this_v_l = this_df[['Value', 'Label']].drop_duplicates(subset='Value')

    this_label = pd.merge(c_df[[this_var]], this_v_l, how='left', left_on=this_var, right_on='Value')
    # The merge result is renumbered 0..n-1 but keeps the row order of c_df, so
    # assign by position; assigning the Series itself would align on index
    # labels and scramble (or blank out) any frame without a default index.
    # Plain column assignment replaces the column, so a numeric code column can
    # become a text column without relying on implicit upcasting.
    c_df[this_var] = this_label['Label'].to_numpy()
    
    
# Apply the labels one variable at a time with a plain loop.
# replace_with_label rewrites df in place and returns nothing, so groupby.apply
# is the wrong tool: it is meant for functions that return a value, and older
# pandas releases documented that it may call the function twice on the first
# group, which is unsafe for a function that mutates its argument.
for this_variable, this_label_df in i_cls_label_df.groupby('Variable'):
    replace_with_label(this_label_df, df)

In [12]:
# display: check that the coded classification columns now hold labels
df.head(2)

Unnamed: 0,index,UNITID,NAME,CITY,STABBR,CC2000,BASIC2005,BASIC2010,BASIC2015,BASIC2018,...,SATV25,SATM25,SATCMB25,SATACTEQ25,ACTCMP25,ACTFINAL,Unnamed95,Unnamed96,WEBADDR,domain
0,0,177834,A T Still University of Health Sciences,Kirksville,MO,Specialized Institutions—Medical schools and m...,,Spec/Med: Special Focus Institutions--Medical ...,Special Focus Four-Year: Medical Schools & Cen...,Special Focus Four-Year: Medical Schools & Cen...,...,0.0,0.0,0.0,0.0,0.0,0.0,,,WWW.ATSU.EDU,atsu.edu
1,1,180203,Aaniiih Nakoda College,Harlem,MT,Tribal colleges and universities,,Tribal: Tribal Colleges,Tribal Colleges,Tribal Colleges,...,0.0,0.0,0.0,0.0,0.0,0.0,,,www.ancollege.edu,ancollege.edu


In [13]:
# On the Salesforce side, the domain is the unique identifier of each organization.
# When several institutions share a domain (e.g. a university with satellite
# campuses), keep only the entry with the highest total enrollment (FALLENR17).
unique_domain_df = (
    df
    .sort_values(by='FALLENR17', ascending=False)  # largest enrollment first
    .groupby('domain')
    .head(1)  # first row per domain after the sort
)

## Obtain organization metadata from Salesforce

In [25]:
from DB2SalesforceAPI import DB2SalesforceAPI

# create DB2 to Salesforce API object
db_s = DB2SalesforceAPI(sf_login_params)

# fetch the metadata of the Organization__c object; the next cell parses it as JSON
# NOTE(review): the object name is hard-coded here instead of using object_id
# ('organization__c') from the settings cell — confirm whether the difference in case matters.
org_metadata = db_s.get_obj_metadata('Organization__c')

Obtained Salesforce access token ...... True


In [26]:
import json

# Every field entry listed in the object metadata
field_list = json.loads(org_metadata)['fields']

# API names of the custom-made fields (names ending in '__c'), with that
# suffix removed.  A name has to be longer than the suffix itself to count.
available_fields = [
    field_info['name'][:-3]
    for field_info in field_list
    if len(field_info['name']) > 3 and field_info['name'].endswith('__c')
]

# display
display(available_fields)

['Domain',
 'Country',
 'Industry',
 'Size_Range',
 'MSI',
 'is_US_institution',
 'HBCU',
 'BASIC2018']

## Match data with Salesforce format

In [27]:
# Names that are both a column of the DB2 data and a custom field in Salesforce
matched_columns = set(available_fields) & set(unique_domain_df.columns)

# display
display(matched_columns)

{'BASIC2018', 'HBCU', 'MSI'}

In [28]:
# Build the frame that is sent to Salesforce.  Column names are Salesforce field
# API names; rows keep the index of unique_domain_df.
# TODO: NaN and NaT values are not handled yet — they are passed through unchanged.
df_sf = pd.DataFrame({
    'Domain__c': unique_domain_df['domain'],
    'is_US_institution__c': True,  # scalar, broadcast to every row
    'Name': unique_domain_df['NAME'],
})

# Transfer all matched columns to df_sf.
# matched_columns is a set, whose iteration order can differ between runs;
# sorting keeps the column order of the upload frame reproducible.
for this_col in sorted(matched_columns):
    df_sf[this_col+'__c'] = unique_domain_df[this_col]

sf_original_fields = df_sf.columns

# display
df_sf.head(2).T

Unnamed: 0,3687,3996
Domain__c,phoenix.edu,wgu.edu
is_US_institution__c,True,True
Name,University of Phoenix-Arizona,Western Governors University
HBCU__c,No,No
BASIC2018__c,Doctoral/Professional Universities,Master's Colleges & Universities: Larger Programs
MSI__c,No,No


## To Salesforce Sales Cloud CRM

In [29]:
# NOTE(review): DB2SalesforceAPI is already imported in the metadata section;
# this second import is redundant but harmless.
from DB2SalesforceAPI import DB2SalesforceAPI

# create a new DB2 to Salesforce API object for the upload
db_s = DB2SalesforceAPI(sf_login_params)

# specify Salesforce object ID and external ID (both defined in the settings cell)
db_s.object_id = object_id
db_s.external_id = external_id

Obtained Salesforce access token ...... True


In [30]:
# send data to Salesforce
# (the recorded run reports a bulk job being created, a CSV upload, and the job being closed)
db_s.send_data(df_sf)

[Success] Bulk job creation successful. Job ID = 7505w00000M3agEAAR
[Success] CSV upload successful. Job ID = 7505w00000M3agEAAR
[Success] Closing job successful. Job ID = 7505w00000M3agEAAR


In [31]:
# check status
# NOTE(review): the recorded status is still 'InProgress' with 0 records processed,
# so this cell presumably has to be re-run until the job finishes — confirm.
db_s.check_bulk_status()

{'id': '7505w00000M3agEAAR',
 'operation': 'upsert',
 'object': 'organization__c',
 'createdById': '0055w00000ArpYvAAJ',
 'createdDate': '2020-05-21T01:35:37.000+0000',
 'systemModstamp': '2020-05-21T01:35:38.000+0000',
 'state': 'InProgress',
 'externalIdFieldName': 'Domain__c',
 'concurrencyMode': 'Parallel',
 'contentType': 'CSV',
 'apiVersion': 47.0,
 'jobType': 'V2Ingest',
 'lineEnding': 'LF',
 'columnDelimiter': 'COMMA',
 'numberRecordsProcessed': 0,
 'numberRecordsFailed': 0,
 'retries': 0,
 'totalProcessingTime': 0,
 'apiActiveProcessingTime': 0,
 'apexProcessingTime': 0}

In [32]:
from pprint import pprint
# Print whatever the API reports for failed records; the recorded run printed an empty string.
# NOTE(review): the status recorded just before was still 'InProgress', so an empty
# result may not mean that no record failed — confirm after the job completes.
pprint(db_s.check_bulk_failed_results())

''
