## general_import.py converted to a .ipynb notebook for debugging

In [5]:
# Marker text for custom import columns: the import loop selects columns whose
# name contains this text (case-insensitively) and strips it from the name.
prefix = 'IMPORT AS'

import warnings
# Suppress all warnings in this notebook session.
warnings.filterwarnings('ignore')
# warnings.filterwarnings(action='once')
# Explicitly ignore FutureWarning as well (the blanket filter above already covers it).
warnings.simplefilter(action='ignore', category=FutureWarning)

import io 
import shutil 
import re
import numpy as np
import pandas as pd
from copy import deepcopy
import os, sys
from pathlib import Path
import platform

from nanoHUB.application import Application
from nanoHUB.pipeline.SF_dataimports.file_handling import map_file
from nanoHUB.pipeline.SF_dataimports.data_handling import split_names, filter_bad_emails, strip_spaces_cols, rename_columns, add_venue
from nanoHUB.pipeline.SF_dataimports.google_downloads import DefaultGoogleFactory

import time
from nanoHUB.logger import logger

# Logger used by the cells below.
log = logger('nanoHUB:google_imports')
# Display up to 1000 columns and do not truncate cell contents when showing DataFrames.
pd.set_option('display.max_columns', 1000)
pd.set_option('max_colwidth', None)

# Salesforce engine created from the Application instance; `db_s` is a second
# name for the same object and is the one the query/import cells use.
application = Application.get_instance()
salesforce = application.new_salesforce_engine()
db_s = salesforce


from __future__ import print_function
import os.path
import os
import errno
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google.oauth2 import service_account
from apiclient import errors
from oauth2client.service_account import ServiceAccountCredentials
from httplib2 import Http

## stuff that's rather hard to find from documentation


Obtained Salesforce access token ...... True


## Setup GDrive API

In [6]:
# Build the Google downloader and the folder configuration from the factory.
downloader_factory = DefaultGoogleFactory(application, log)
downloader = downloader_factory.get_downloader()
dir_config = downloader_factory.get_folder_config()

# process() returns three values; presumably parallel collections of local
# paths, file names and Drive file ids for the files to import -- TODO confirm.
to_import_file_paths, file_names, file_ids = downloader.process()

# Show what was found so the run can be checked by eye.
print(file_names)
print(file_ids)

[INFO] [google_downloads - nanoHUB:google_imports]: Found file: nH_CustSatSurvey2022_import.csv (1TYhD3qESgpgyaWLhgd3hPKz3outo0zC9) [google_downloads.process:99]
['nH_CustSatSurvey2022_import.csv']
['1TYhD3qESgpgyaWLhgd3hPKz3outo0zC9']


## Perform Sequential Imports

In [7]:
# Per-outcome file lists; the import loop appends to `fail_files` when a file errors.
success_files = []
fail_files = []

# Fetch the contacts once up front; the import loop matches each file's rows
# against this frame on the Email column.
sf_df = db_s.query_data('SELECT Id,nanoHUB_user_ID__c, Email, Venue__c FROM Contact') 
display(sf_df)

[Success] Bulk job creation successful. Job ID = 7505w00000cydEJAAY
{"id":"7505w00000cydEJAAY","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2022-02-01T00:26:33.000+0000","systemModstamp":"2022-02-01T00:26:33.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","retries":0,"totalProcessingTime":0}
{"id":"7505w00000cydEJAAY","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2022-02-01T00:26:33.000+0000","systemModstamp":"2022-02-01T00:26:33.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","retries":0,"totalProcessingTime":0}
{"id":"7505w00000cydEJAAY","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2022-02-01T00:26:33.000+0000","systemModst

Unnamed: 0,Email,Id,Venue__c,nanoHUB_user_ID__c
0,nkissebe@gmail.com,0035w000031Vsp1AAC,2022 Customer Satisfaction Survey recipient;nH_CustSatSurvey2022_import,998.0
1,support@nanohub.org,0035w000031Vsp2AAC,,1683.0
2,gridstat@nanohub.org,0035w000031Vsp3AAC,,1684.0
3,ncn@nanohub.org,0035w000031Vsp4AAC,,1685.0
4,apps@nanohub.org,0035w000031Vsp5AAC,,1686.0
...,...,...,...,...
252090,mkhan118@fiu.edu,0035w00003YXslPAAT,2022 Customer Satisfaction Survey recipient;nH_CustSatSurvey2022_import,345960.0
252091,asu970602@naver.com,0035w00003YXslQAAT,2022 Customer Satisfaction Survey recipient;nH_CustSatSurvey2022_import,345961.0
252092,kelsey7@purdue.edu,0035w00003YXslRAAT,2022 Customer Satisfaction Survey recipient;nH_CustSatSurvey2022_import,345962.0
252093,anand50@purdue.edu,0035w00003YXslSAAT,,345963.0


In [9]:
# Import each downloaded file in turn: load and clean it, match its rows to the
# Salesforce contacts in `sf_df` by Email, append the file's venue to each
# matched contact and send the update. Any exception raised for a file is
# logged and the file is recorded in `fail_files`, so later files still run.
# NOTE(review): `success_files` is never appended to in this loop -- confirm
# whether that is intentional for this debugging notebook.
for file in to_import_file_paths:
    idf = pd.DataFrame()
    log.info("Reading %s" % file)
    try:
        idf = map_file(file, logger('nanoHUB:google_imports'))

        # Remove leading and trailing spaces from the columns before renaming.
        log.debug(idf.columns)
        idf = strip_spaces_cols(idf)

        idf, has_name = split_names(idf)
        if not has_name:
            log.info('No name in column `Name` needing to be split found')

        # Rename columns; once a `Name` column has been split it is dropped.
        idf = rename_columns(idf)
        if has_name:
            idf = idf.drop(columns='Name')
        # Rows without an e-mail cannot be matched to a contact.
        idf = idf.dropna(subset=['Email'])

        idf = filter_bad_emails(idf, log)

        # Record which identifying columns the file provides. These flags are
        # not read inside this loop; they are kept as notebook globals.
        import_df_cols = deepcopy(idf.columns)
        nh_id_flag = 'nanoHUB_user_ID__c' in import_df_cols
        email_flag = 'Email' in import_df_cols

        # Keep only the rows whose e-mail already exists as a Salesforce contact.
        merged_df = idf.merge(sf_df, how='inner', on=['Email'])
        log.info("Rows of data after merge: %d" % len(merged_df))

        # Columns whose name contains the prefix (case-insensitive) are custom
        # import fields; strip the prefix and the spaces it leaves behind.
        prefixed_df = merged_df.filter(regex=re.compile(prefix, re.IGNORECASE))
        prefixed_df.columns = prefixed_df.columns.str.replace(prefix, "", case=False)
        prefixed_df.columns = prefixed_df.columns.str.lstrip(' ')
        log.info("Number of custom prefixed columns: %d" % len(prefixed_df.columns))
        log.debug(prefixed_df.columns)

        #merged_df = add_venue(merged_df)
        # Append the imported venue to the contact's existing venue list.
        # A missing value (NaN) on either side would make the whole
        # concatenation NaN and lose the venue, so missing values are treated
        # as empty strings and any leading/trailing ';' is trimmed afterwards.
        existing_venue = merged_df["Venue__c"].fillna('').astype(str)
        imported_venue = merged_df["Import as Venue__c"].fillna('').astype(str)
        merged_df["Venue__c"] = (existing_venue + ';' + imported_venue).str.strip(';')

        merged_df = merged_df[['Email', 'Id', 'Venue__c']]
        # display(merged_df)

        # Add the remaining custom columns that are not already present,
        # aligning on the row index shared with `prefixed_df`.
        cols_to_use = prefixed_df.columns.difference(merged_df.columns)
        merged_df = pd.merge(merged_df, prefixed_df[cols_to_use], left_index=True, right_index=True, how='outer')

        log.info("Number of contacts: %d" % len(merged_df.index))
        log.info(merged_df.head())
        log.info(merged_df.tail())

        # Work on a copy of the engine so the shared `db_s` is not retargeted.
        db_s_c = deepcopy(db_s)
        db_s_c.object_id = 'Contact'
        db_s_c.external_id = 'Id'
        db_s_c.send_data(merged_df)

        # Rows whose e-mail is not an existing contact are only counted here.
        leads_df = idf[~idf['Email'].isin(sf_df['Email'])]
        log.info("Number of leads: %d" % len(leads_df))

    except Exception as e:
        log.error("Error with file: " + str(e))
        fail_files.append(file)
        

[INFO] [486459681 - nanoHUB:google_imports]: Reading /home/saxenap/nanoHUB/.cache/Salesforce_Imports/To_Import/nH_CustSatSurvey2022_import.csv [486459681.<module>:3]
[INFO] [file_handling - nanoHUB:google_imports]: File type .csv [file_handling.map_file:106]
[INFO] [file_handling - nanoHUB:google_imports]: Encoding is of type: utf_8 [file_handling.map_file:109]
[INFO] [file_handling - nanoHUB:google_imports]: File with Encoding: utf-8 and Separator: , processed using engine c [file_handling.read_file_to_df:53]
[INFO] [486459681 - nanoHUB:google_imports]: No name in column `Name` needing to be split found [486459681.<module>:13]
[INFO] [data_handling - nanoHUB:google_imports]: Number of good emails found: 18164 [data_handling.filter_bad_emails:37]
[INFO] [data_handling - nanoHUB:google_imports]: Number of bad emails found: 0 [data_handling.filter_bad_emails:38]
[INFO] [486459681 - nanoHUB:google_imports]: Rows of data after merge: 17018 [486459681.<module>:36]
[INFO] [486459681 - nanoHU