## general_import.py transposed to a .ipynb file for debugging

In [1]:
import warnings
# Silence every warning category for the whole run. While debugging, the
# action can be switched to 'once' to see each distinct warning a single time.
warnings.filterwarnings(action='ignore')
# FutureWarning is additionally ignored through an explicit simple filter.
warnings.simplefilter('ignore', FutureWarning)

In [2]:
import numpy as np
import pandas as pd
from copy import deepcopy
import os, sys
from pathlib import Path
import platform
from copy import deepcopy
from nanoHUB.application import Application
import time
from googleapiclient.discovery import build
from apiclient import errors
from oauth2client.service_account import ServiceAccountCredentials
from httplib2 import Http
from nanoHUB.logger import logger
# Shared application object; the Salesforce engine below is created from it.
application = Application.get_instance()
# SQL engines that can be switched on when a cell needs them:
# nanohub_db = application.new_db_engine('nanohub')
# nanohub_metrics_db = application.new_db_engine('nanohub_metrics')
# wang159_myrmekes_db = application.new_db_engine('wang159_myrmekes')

# Salesforce client, also reachable through the short alias `db_s`.
db_s = salesforce = application.new_salesforce_engine()
log = logger('nanoHUB:google_imports')


# Final value of the flag (the earlier `False` assignment was immediately
# overwritten, so only this one ever took effect).
SHOULD_IMPORT_FRESH = True

nanoHUB - Serving Students, Researchers & Instructors
[INFO] [DB2SalesforceAPI - root]: Obtained Salesforce access token ...... True [DB2SalesforceAPI.obtain_token:54]


## Setup GDrive API

In [3]:
from __future__ import print_function
import os.path
import os
import errno
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google.oauth2 import service_account

## stuff that's rather hard to find from documentation
from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload

# Google API client built by the application object; the cells below use it
# for Drive `files()` list / update / get_media calls.
service = application.new_google_api_engine()

In [4]:
# Name of the root import folder; used both for the Drive lookup and for the
# local cache directory of the same name.
FOLDER_NAME = 'Salesforce_Imports'
# Subfolder holding the files that are waiting to be imported.
TO_IMPORT_FOLDER_NAME = 'To_Import'
# Subfolder that receives files whose import raised an error.
FAILURES_FOLDER_NAME = 'Import_Issues'
# Subfolder that receives files that were imported successfully.
SUCCESS_FOLDER_NAME = 'Imported'

In [5]:
# Local working directories live under $APP_DIR/.cache/<FOLDER_NAME>/.
app_dir = os.getenv('APP_DIR')
if app_dir is None:
    # Fail with an actionable message instead of the opaque
    # "unsupported operand type(s) for +: 'NoneType' and 'str'".
    raise RuntimeError(
        "The APP_DIR environment variable must be set; it locates the .cache directory."
    )
import_dir = Path(app_dir + '/.cache/')

import_dir_path = import_dir / FOLDER_NAME
to_import_dir_path = import_dir_path / TO_IMPORT_FOLDER_NAME
failures_import_dir_path = import_dir_path / FAILURES_FOLDER_NAME
success_dir_path = import_dir_path / SUCCESS_FOLDER_NAME

# Create the root folder and its three subfolders; existing ones are left alone.
for _directory in (
    import_dir_path,
    to_import_dir_path,
    failures_import_dir_path,
    success_dir_path,
):
    _directory.mkdir(parents=True, exist_ok=True)

In [6]:
def get_folder_id(service, folder_name: str):
    """Return the Drive id of the first folder named ``folder_name``.

    Args:
        service: Drive API client exposing ``files().list(...).execute()``.
        folder_name: Exact name of the folder to look up.

    Returns:
        The ``id`` field of the first folder in the listing.

    Raises:
        IndexError: If the listing contains no folder with that name.
    """
    # Backslashes and single quotes have to be escaped inside a quoted Drive
    # query value, otherwise a name such as "Bob's" produces a broken query.
    escaped_name = folder_name.replace("\\", "\\\\").replace("'", "\\'")
    response = service.files().list(
        q="mimeType = 'application/vnd.google-apps.folder' and name = '" + escaped_name + "'",
        spaces='drive',
        fields="files(id, name)"
    ).execute()
    matches = response.get('files', [])
    if not matches:
        # Same exception type as the former `[][0]`, but with a usable message.
        raise IndexError("No Google Drive folder named %r was found." % folder_name)
    return matches[0].get('id')

def get_subfolder_id(service, parent_folder_id: str, subfolder_name: str):
    """Return the Drive id of the first folder ``subfolder_name`` inside a parent.

    Args:
        service: Drive API client exposing ``files().list(...).execute()``.
        parent_folder_id: Drive id of the folder that must be a parent.
        subfolder_name: Exact name of the child folder to look up.

    Returns:
        The ``id`` field of the first matching folder in the listing.

    Raises:
        IndexError: If the parent has no folder with that name.
    """
    # Backslashes and single quotes have to be escaped inside a quoted Drive
    # query value, otherwise a name containing "'" produces a broken query.
    escaped_name = subfolder_name.replace("\\", "\\\\").replace("'", "\\'")
    query = (
        "mimeType = 'application/vnd.google-apps.folder' and name = '" + escaped_name + "'"
        + " and '" + parent_folder_id + "' in parents"
    )
    response = service.files().list(
        q=query,
        spaces='drive',
        fields="files(id, name)"
    ).execute()
    matches = response.get('files', [])
    if not matches:
        # Same exception type as the former `[][0]`, but with a usable message.
        raise IndexError(
            "No Google Drive folder named %r was found in parent %s."
            % (subfolder_name, parent_folder_id)
        )
    return matches[0].get('id')

def change_folder_for_file(file_id: str, old_folder_id: str, new_folder_id: str):
    """Re-parent a Drive file from ``old_folder_id`` to ``new_folder_id``.

    The request is issued through the module-level ``service`` client and asks
    for the ``id`` and ``parents`` fields; the executed response is returned.
    """
    move_request = service.files().update(
        fileId=file_id,
        addParents=new_folder_id,
        removeParents=old_folder_id,
        fields='id, parents',
    )
    return move_request.execute()

In [7]:
# Resolve the Drive id of the root import folder ...
folder_id = get_folder_id(service=service, folder_name=FOLDER_NAME)
log.debug('Found folder: %s (%s)' % (FOLDER_NAME, folder_id))

# ... and of the "to import" folder directly beneath it.
subfolder_id = get_subfolder_id(
    service=service,
    parent_folder_id=folder_id,
    subfolder_name=TO_IMPORT_FOLDER_NAME,
)
log.debug('Found subfolder: %s (%s)' % (TO_IMPORT_FOLDER_NAME, subfolder_id))

In [8]:
# MIME types accepted for import: .xlsx, legacy Excel and CSV.
mimetypes = """
    mimeType = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' or
    mimeType = 'application/vnd.ms-excel' or
    mimeType = 'text/csv'
    """
file_ids, file_names = [], []

# The query does not change between pages, so it is assembled once up front.
query = "".join(["(", mimetypes, ") and '", subfolder_id, "' in parents"])

# Walk every result page, collecting the id and name of each listed file.
page_token = None
while True:
    listing = service.files().list(
        q=query,
        spaces='drive',
        pageToken=page_token,
        fields="nextPageToken, files(id, name)",
    )
    response = listing.execute()
    log.debug(response)

    files = response.get('files', [])
    for file in files:
        log.info('Found file: %s (%s)' % (file.get('name'), file.get('id')))
        file_names.append(file['name'])
        file_ids.append(file['id'])

    # A missing token means this was the last page.
    page_token = response.get('nextPageToken', None)
    if page_token is None:
        break



[INFO] [3985917202 - nanoHUB:google_imports]: Found file: Chem group and Tool users.csv (1-3r0w-qig4FcwhaKTp51-WizDugcJDkI) [3985917202.<cell line: 11>:21]


In [9]:
import io 
import shutil 

In [10]:
list_files = []
to_import_file_paths = []

# Start from an empty local "to import" directory so files left over from a
# previous run are not processed again. Deletion problems are logged only.
for filename in os.listdir(to_import_dir_path):
    file_path = os.path.join(to_import_dir_path, filename)
    try:
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)
        log.debug("Removed existing file %s" % file_path)
    except Exception as e:
        log.error('Failed to delete %s. Reason: %s' % (file_path, e))

try:
    for f_tbd_id, f_tbd_name in zip(file_ids, file_names):
        # get_media fetches the stored bytes as-is (native Google documents
        # would need .export() instead).
        request = service.files().get_media(fileId=f_tbd_id)
        download_filepath = Path(to_import_dir_path, f_tbd_name)

        try:
            # Stream every chunk straight to disk rather than holding the
            # whole file in memory and copying it out afterwards.
            with open(download_filepath, 'wb') as f:
                downloader = MediaIoBaseDownload(f, request)
                done = False
                while not done:
                    status, done = downloader.next_chunk()
                    log.debug("Downloading %s %d%%." % (f_tbd_name, int(status.progress() * 100)))
        except Exception:
            # Never leave a truncated download behind for later cells.
            if download_filepath.exists():
                download_filepath.unlink()
            raise

        list_files.append(f_tbd_name)
        to_import_file_paths.append(download_filepath)

    log.debug("Finished downloading files")
except Exception as e:
    log.error("Error downloading file: " + str(e))
    raise

In [11]:
# Let pandas show up to 500 columns when a frame is displayed.
pd.set_option('display.max_columns', 500)
# Optional counterpart for rows, currently disabled:
# pd.set_option('display.max_rows', 500)


## Perform Sequential Imports

In [12]:
# Paths of the files that were imported cleanly / that raised during import.
success_files, fail_files = [], []

In [13]:
from chardet import detect
import cchardet
# get file encoding type
def get_encoding_type(file_path):
    """Return chardet's ``detect`` result for the raw bytes of ``file_path``."""
    with open(file_path, mode='rb') as handle:
        return detect(handle.read())

In [14]:
from charset_normalizer import CharsetNormalizerMatches as CnM

# Debug aid: log the encoding charset_normalizer picks for each downloaded file.
for file in to_import_file_paths:
    source_path = str(file)
    log.debug("Processing file: " + source_path + '---->')

    # chardet alternative, kept for reference:
    # from_chardet = get_encoding_type(source_path)
    # log.info('From chardet ----> ' + from_chardet['encoding'])

    best_guess = CnM.from_path(source_path).best().first()
    from_normalizer = best_guess.encoding
    log.debug('From Normalizer ----> ' + from_normalizer)

In [15]:
from charset_normalizer import from_path

def change_encoding(file_path: str):
    """Rewrite ``file_path`` in place as UTF-8 using charset_normalizer's best guess.

    The content is decoded *before* the file is reopened for writing, so a
    failed detection can no longer truncate the original file. Nothing is
    returned; I/O problems and an undetectable encoding are logged at debug
    level and the file is left untouched.

    Args:
        file_path: Path of the text file to normalise.
    """
    try:
        best_match = from_path(file_path).best()
        if best_match is None:
            log.debug('Sadly, we are unable to perform charset normalization. '
                      'No plausible encoding was detected.')
            return
        # str() of a match yields the decoded text.
        results = str(best_match)
        # newline='' keeps the line endings exactly as they were decoded.
        with open(file_path, 'w', encoding='utf-8', newline='') as filetowrite:
            filetowrite.write(results)
    except IOError as e:
        log.debug('Sadly, we are unable to perform charset normalization. ' + str(e))

In [16]:
def read_file_to_df(file_path: str) -> pd.DataFrame:
    """Read a delimited text file, guessing its encoding and separator.

    Encodings are tried in order. For the first encoding under which the file
    can be parsed at all, both separators are tried and the parse with the
    most columns is kept (comma wins ties). Previously the first parse that
    did not raise was returned, so a tab-separated file was silently read as a
    single comma-"separated" column.

    Args:
        file_path: Path of the file to read.

    Returns:
        The parsed frame, or an empty ``DataFrame`` when no combination of
        encoding, separator and engine could read the file.
    """
    possible_encodings = [
        'utf-8', 'cp1252', 'utf-16', 'cp1254', 'cp775', 'utf-8-sig', 'iso-8859-1', 'unicode_escape', 'gbk', 'latin1'
    ]
    possible_separators = [',', '\t']
    possible_engines = ['c', 'python']

    for encoding in possible_encodings:
        best = None  # (frame, separator, engine) of the widest parse so far
        for sep in possible_separators:
            for engine in possible_engines:
                try:
                    candidate = pd.read_csv(file_path, encoding=encoding, sep=sep, engine=engine)
                except Exception as e:
                    log.debug("Decoding failed with %s, %s, %s: %s" % (encoding, sep, engine, e))
                    continue
                if best is None or candidate.shape[1] > best[0].shape[1]:
                    best = (candidate, sep, engine)
                # This separator parsed; the second engine is only a fallback.
                break
        if best is not None:
            idf, sep, engine = best
            log.info("File with Encoding: %s and Separator: %s processed using engine %s" % (encoding, sep, engine))
            return idf

    log.info("Either the file (%s) is empty or unable to decode." % file_path)
    return pd.DataFrame()

In [17]:
def process_excel_file(file):
    """Read the first worksheet of an Excel workbook into a DataFrame.

    Args:
        file: Path (or file-like object) of the .xls/.xlsx workbook.

    Returns:
        A ``DataFrame`` built from the first sheet, using its first row as header.

    Raises:
        TypeError: If the workbook cannot be opened or parsed; the underlying
            error is chained as the cause.
    """
    try:
        # The context manager closes the workbook handle once parsing is done.
        with pd.ExcelFile(file) as xl:
            # sheet_names is a list, so it must be stringified before concatenation.
            log.debug("sheet names: " + str(xl.sheet_names))
            return xl.parse(sheet_name=xl.sheet_names[0], header=0)
    except Exception as err:
        log.error('error bad lines, csv/xls/xlsx import failed')
        raise TypeError("Unable to read Excel file %s" % file) from err

In [18]:
class EmptyOrEncodingIssue(TypeError):
    """Signals that an import file produced no data: it is empty or could not be decoded."""

# Byte sequence left behind when a UTF-8 dash is decoded with a single-byte codec.
_MOJIBAKE_DASH = 'â\x80\x93'


def _first_positions(values):
    """Map each value to the position of its first occurrence in ``values``."""
    positions = {}
    for pos, value in enumerate(values):
        positions.setdefault(value, pos)
    return positions


def _merge_venues(existing, added):
    """Append ``added`` to a ';'-separated venue string; use ``added`` alone if that fails."""
    try:
        parts = existing.split(';')
        parts.append(added)
        return ';'.join(parts)
    except (AttributeError, TypeError):
        # No usable existing venue (e.g. None/NaN) or a non-string addition.
        return added


def _clean_venues(value):
    """Drop repeated venues (keeping first-seen order) and repair mis-decoded dashes."""
    return ';'.join(dict.fromkeys(value.split(';'))).replace(_MOJIBAKE_DASH, '-')


def _split_by_known_email(emails, known_positions):
    """Split row positions of ``emails`` by whether the stripped email is already known.

    Returns ``(matched_rows, known_rows, unmatched_rows)`` where ``known_rows[i]``
    is the position in the Salesforce frame that matches ``matched_rows[i]``.
    """
    matched_rows, known_rows, unmatched_rows = [], [], []
    for ind, val in enumerate(emails):
        pos = known_positions.get(val.strip())
        if pos is None:
            unmatched_rows.append(ind)
        else:
            matched_rows.append(ind)
            known_rows.append(pos)
    return matched_rows, known_rows, unmatched_rows


# Import every downloaded file: existing Contacts get the venue appended,
# existing Leads likewise, and unknown emails are sent as new Leads.
# Each file ends up in success_files or fail_files.
for file in to_import_file_paths:
    idf = pd.DataFrame()
    # Reset per file. This flag used to be assigned only inside the fallback
    # branch below, so it was undefined for the first file that took the other
    # branch and stale (from the previous file) afterwards.
    name_flag = False
    log.info("Reading %s" % file)
    try:
        f_type = file.suffix
        log.info("File type %s" % f_type)
        if f_type == '.csv':
            log.debug("csv file type identified")
            idf = read_file_to_df(file)
        elif f_type == '.xlsx' or f_type == '.xls':
            # Keep the parsed sheet; the return value used to be discarded,
            # which left `idf` empty for every Excel file.
            excel_df = process_excel_file(file)
            if excel_df is not None:
                idf = excel_df
        if idf.empty:
            raise EmptyOrEncodingIssue("No rows could be read from %s" % file)

        ## remove leading and trailing spaces from the column names
        log.debug(idf.columns)
        idf = idf.rename(columns=lambda col: col.strip() if isinstance(col, str) else col)

        # Use the per-row venue when the sheet provides one (together with a
        # 'First Name' column); otherwise the file name becomes the venue and
        # a single 'Name' column is split into first/last name.
        if 'Engagement Venue' in idf.columns and 'First Name' in idf.columns:
            log.debug(idf['Engagement Venue'])
            log.debug(idf['First Name'])
            idf = idf.rename(columns={'Engagement Venue': 'Venue__c'})
        else:
            file_name = Path(file).name
            idf['Venue__c'] = file_name.split('.')[0]

            try:
                name_parts = [name.split(' ') for name in idf['Name'].to_list()]
                # NOTE(review): the last token becomes firstname and the first
                # token lastname (i.e. "Last First" input is assumed), while the
                # lead fallback further down splits the other way round —
                # confirm which order the source files use.
                idf['firstname'] = [parts[-1] for parts in name_parts]
                idf['lastname'] = [parts[0] for parts in name_parts]
                name_flag = True
            except (KeyError, AttributeError):
                # No 'Name' column, or it holds non-string values.
                name_flag = False
                log.info('no name')

        # Normalise the column names that differ between source files.
        idf = idf.rename(columns={
            'email': 'Email', 'EMAIL': 'Email', 'E-mail Address': 'Email',
            'Email Address': 'Email', 'Recipient Email': 'Email',
            'Engagement Venue': 'Venue__c',
            'First Name': 'firstname', 'Last Name': 'lastname',
            'FirstName': 'firstname', 'LastName': 'lastname',
            0: 'Email',
        })
        if name_flag:
            idf = idf.drop(columns='Name')
        idf = idf.dropna(subset=['Email'])

        # Keep only rows whose email is a string containing '@'.
        all_emails = idf['Email'].to_list()
        grows = [ind for ind, val in enumerate(all_emails) if isinstance(val, str) and '@' in val]
        idf = idf.iloc[grows, :].reset_index(drop=True)

        log.info("Number of good emails found: %d" % len(grows))
        log.info("Number of bad emails found: %d" % (len(all_emails) - len(grows)))

        ## Import in contacts
        sf_df = db_s.query_data('SELECT Id,nanoHUB_user_ID__c, Email, Venue__c FROM Contact')

        # Find all existing contacts; everything else is treated as a lead.
        # A dict lookup replaces the per-row list scan (`in` + `.index`).
        contact_positions = _first_positions(sf_df['Email'].to_list())
        grows, sf_grows, brows = _split_by_known_email(idf['Email'].to_list(), contact_positions)

        # Matching Salesforce rows and matching import rows, in the same order.
        sf_df_match = sf_df.iloc[sf_grows, :].reset_index(drop=True)
        idf_match = idf.iloc[grows, :].reset_index(drop=True)
        lead_df = idf.iloc[brows, :].reset_index(drop=True)  # used in the lead section

        # Row i of sf_df_match corresponds to row i of idf_match, so the new
        # venue must come from idf_match (it was read from idf before, which
        # is not aligned with the matches). The column is rebuilt in one go
        # instead of chained `df[col][i] = ...` assignments.
        sf_df_match['Venue__c'] = [
            _clean_venues(_merge_venues(existing, added))
            for existing, added in zip(
                sf_df_match['Venue__c'].to_list(), idf_match['Venue__c'].to_list()
            )
        ]

        sf_df_match = sf_df_match.drop_duplicates()
        sf_df_match = sf_df_match[['Email', 'Venue__c', 'Id']]

        ## send to SF
        # rebuild api object
        db_s_c = deepcopy(db_s)

        log.info("Number of contacts: %d" % len(sf_df_match.index))
        db_s_c.object_id = 'Contact'
        # db_s_c.external_id = 'nanoHUB_user_ID__c'
        db_s_c.external_id = 'Id'

        db_s_c.send_data(sf_df_match)

        ## find leads and send them to SF as well
        # pull all current leads
        sf_df = db_s.query_data('SELECT Id, Email, Venue__c, SF_indexer__c FROM Lead')
        # highest SF_indexer in use; 0 when there are no leads yet
        indexers = sf_df['SF_indexer__c'].fillna(0).to_list()
        max_ind = max(indexers, default=0)

        # find all existing leads
        lead_positions = _first_positions(sf_df['Email'].to_list())
        m_rows, sf_mrows, nm_rows = _split_by_known_email(lead_df['Email'].to_list(), lead_positions)

        # filter the matches
        sf_df_match = sf_df.iloc[sf_mrows, :].reset_index(drop=True)
        join_idf = lead_df.iloc[m_rows, :].reset_index(drop=True)
        new_idf = lead_df.iloc[nm_rows, :].reset_index(drop=True)

        # linear join since the sequence is matching
        sf_df_match['Venue__c'] = [
            _clean_venues(_merge_venues(existing, added))
            for existing, added in zip(
                sf_df_match['Venue__c'].to_list(), join_idf['Venue__c'].to_list()
            )
        ]

        # assign new SF_indexers for the new leads
        new_max_ind = int(max_ind + new_idf.shape[0])
        new_idf['SF_indexer__c'] = range(int(max_ind) + 1, new_max_ind + 1)

        new_idf['Venue__c'] = new_idf['Venue__c'].apply(lambda x: x.replace(_MOJIBAKE_DASH, '-'))

        # Salesforce needs a non-empty company field
        sf_df_match['Company'] = '-'
        new_idf['Company'] = '-'

        # populate the company fields for sf_df_match and new_idf by comparing email addresses
        if 'Company' in idf.columns:
            log.info("company exist, using it")
            idf_emails = idf['Email'].to_list()
            idf_companies = idf['Company'].to_list()

            # Existing leads were matched on the stripped email, so strip here too.
            match_positions = _first_positions(sf_df_match['Email'].to_list())
            sf_df_match_comp_ind = sf_df_match.columns.to_list().index('Company')
            for email, company in zip(idf_emails, idf_companies):
                pos = match_positions.get(email.strip())
                if pos is not None:
                    sf_df_match.iloc[pos, sf_df_match_comp_ind] = company

            # give existing leads an SF_indexer when theirs is not numeric
            sf_df_match_sf_ind = sf_df_match.columns.to_list().index('SF_indexer__c')
            new_max_ind = int(max_ind + new_idf.shape[0])
            for ind, val in enumerate(sf_df_match['SF_indexer__c'].to_list()):
                if not isinstance(val, (int, float)):
                    sf_df_match.iloc[ind, sf_df_match_sf_ind] = new_max_ind + 1
                    new_max_ind += 1

            # NOTE(review): the company written to new_idf here is overwritten
            # with '-' further down when the lead columns are selected —
            # confirm whether new leads are meant to keep their company.
            new_positions = _first_positions(new_idf['Email'].to_list())
            new_idf_comp_ind = new_idf.columns.to_list().index('Company')
            for email, company in zip(idf_emails, idf_companies):
                pos = new_positions.get(email)
                if pos is not None:
                    new_idf.iloc[pos, new_idf_comp_ind] = company
        else:
            log.info('no company in import list, set to -')

        if sf_df_match.shape[0] == 0 and new_idf.shape[0] == 0:
            log.info('no leads to import, they were all contacts')
        else:
            sf_df_match = sf_df_match.fillna('-')
            new_idf = new_idf.fillna('-')
            new_idf = new_idf.rename(columns={'First Name': 'firstname', 'Last Name': 'lastname'})

            lead_columns = ['Email', 'firstname', 'lastname', 'SF_indexer__c', 'Venue__c']
            try:
                new_idf = new_idf[lead_columns]
            except KeyError:
                # Name columns are missing: derive them from 'Faculty', else from 'Name'.
                try:
                    tempnames = new_idf['Faculty'].apply(lambda x: x.split(' '))
                    new_idf['firstname'] = [i[0] for i in tempnames]
                    new_idf['lastname'] = [i[-1] if len(i[-1]) > 0 else '-' for i in tempnames]
                except (KeyError, AttributeError):
                    tempnames = [t_val.split(' ') for t_val in new_idf['Name'].to_list()]
                    new_idf['firstname'] = [t_val[0] for t_val in tempnames]
                    new_idf['lastname'] = [t_val[-1] if len(t_val[-1]) > 0 else '-' for t_val in tempnames]
                new_idf = new_idf[lead_columns]
            new_idf['Company'] = '-'

            # drop duplicate rows
            # NOTE(review): after fillna('-') every existing lead without an
            # SF_indexer shares the value '-', so this keeps only the first of
            # them unless the Company branch above assigned indexers — confirm.
            sf_df_match = sf_df_match.drop_duplicates(subset='SF_indexer__c')
            new_idf = new_idf.drop_duplicates()

            sf_df_match['Company'] = sf_df_match['Company'].replace('  ', '-')
            sf_df_match = sf_df_match.drop(columns='SF_indexer__c')

            # send the matching ones, keyed by their Salesforce Id
            db_s_l1 = deepcopy(db_s)
            db_s_l1.object_id = 'Lead'
            # db_s_l1.external_id = 'SF_indexer__c'
            db_s_l1.external_id = 'Id'

            # Report the frame that is actually sent (new_idf was logged here before).
            log.info("Number of existing leads: %d" % len(sf_df_match.index))
            db_s_l1.send_data(sf_df_match)

            # send the new ones, keyed by the freshly assigned SF_indexer
            db_s_l2 = deepcopy(db_s)
            db_s_l2.object_id = 'Lead'
            db_s_l2.external_id = 'SF_indexer__c'

            log.info("Number of new leads: %d" % len(new_idf.index))
            db_s_l2.send_data(new_idf)

        success_files.append(file)

    except Exception as e:
        # Any failure marks the whole file as failed; the loop continues.
        log.error("Error with file: " + str(e))
        fail_files.append(file)

[INFO] [141495456 - nanoHUB:google_imports]: Reading /home/saxenap/nanoHUB/.cache/Salesforce_Imports/To_Import/Chem group and Tool users.csv [141495456.<cell line: 4>:6]
[INFO] [141495456 - nanoHUB:google_imports]: File type .csv [141495456.<cell line: 4>:9]
[INFO] [34983876 - nanoHUB:google_imports]: File with Encoding: utf-8 and Separator: , processed using engine c [34983876.read_file_to_df:13]
[INFO] [141495456 - nanoHUB:google_imports]: no name [141495456.<cell line: 4>:57]
[INFO] [141495456 - nanoHUB:google_imports]: Number of good emails found: 16513 [141495456.<cell line: 4>:80]
[INFO] [141495456 - nanoHUB:google_imports]: Number of bad emails found: 0 [141495456.<cell line: 4>:81]
[INFO] [DB2SalesforceAPI - root]: [Success] Bulk job creation successful. Job ID = 7508W00000ljcbrQAA [DB2SalesforceAPI.query_data:80]
[INFO] [DB2SalesforceAPI - root]: {"id":"7508W00000ljcbrQAA","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2022-08-30T19:47

In [19]:
# Aliases (same list objects) used below to map a local file name back to its Drive id.
tbd_imp_names, tbd_imp_ids = file_names, file_ids

In [20]:
imported_folder_id = get_subfolder_id(service, folder_id, SUCCESS_FOLDER_NAME)
failure_folder_id = get_subfolder_id(service, folder_id, FAILURES_FOLDER_NAME)

if success_files or fail_files:
    # One row per outcome: files, log template, Drive target folder, local target folder.
    outcomes = (
        (success_files, "Success in proccesing file: %s with id: %s",
         imported_folder_id, success_dir_path),
        (fail_files, "Failure in proccesing file: %s with id: %s",
         failure_folder_id, failures_import_dir_path),
    )
    for processed_files, message, drive_folder_id, local_dir in outcomes:
        for i in processed_files:
            # Map the local file name back to the Drive id it was downloaded from.
            file_name = Path(i).name
            t_index = tbd_imp_names.index(file_name)
            file_id = tbd_imp_ids[t_index]
            log.info(message % (file_name, file_id))
            # Move the file on Drive first, then mirror the move locally.
            change_folder_for_file(file_id, subfolder_id, drive_folder_id)
            i.rename(Path(local_dir) / i.name)
else:
    log.info("No files found to process.")
    exit(0)

[INFO] [739382289 - nanoHUB:google_imports]: Success in proccesing file: Chem group and Tool users.csv with id: 1-3r0w-qig4FcwhaKTp51-WizDugcJDkI [739382289.<cell line: 4>:13]
