## general_import.py transposed to a .ipynb file for debugging

In [1]:
prefix = 'IMPORT AS'

import warnings
warnings.filterwarnings('ignore')
# warnings.filterwarnings(action='once')
warnings.simplefilter(action='ignore', category=FutureWarning)

import io 
import shutil 
import re
import numpy as np
import pandas as pd
from copy import deepcopy
import os, sys
from pathlib import Path
import platform

from nanoHUB.application import Application
from nanoHUB.pipeline.SF_dataimports.file_handling import map_file
from nanoHUB.pipeline.SF_dataimports.data_handling import split_names, filter_bad_emails, strip_spaces_cols, rename_columns, add_venue
from nanoHUB.pipeline.SF_dataimports.google_downloads import DefaultGoogleFactory

import time
from nanoHUB.logger import logger

log = logger('nanoHUB:google_imports')
pd.set_option('display.max_columns', 500)

application = Application.get_instance()
salesforce = application.new_salesforce_engine()
db_s = salesforce


from __future__ import print_function
import os.path
import os
import errno
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google.oauth2 import service_account
from apiclient import errors
from oauth2client.service_account import ServiceAccountCredentials
from httplib2 import Http

## stuff that's rather hard to find from documentation


[1mnanoHUB - Serving Students, Researchers & Instructors[0m
Obtained Salesforce access token ...... True


## Setup GDrive API

In [2]:
downloader_factory = DefaultGoogleFactory(application, log)
downloader = downloader_factory.get_downloader()
dir_config = downloader_factory.get_folder_config()

to_import_file_paths, file_names, file_ids = downloader.process()

print(file_names)
print(file_ids)

[INFO] [google_downloads - nanoHUB:google_imports]: Found file: nH_CustSatSurvey2022_import.csv (19kHg2RKb2ZE0TRma3FUf9fuRyZ4RSiFs) [google_downloads.process:99]
['nH_CustSatSurvey2022_import.csv']
['19kHg2RKb2ZE0TRma3FUf9fuRyZ4RSiFs']


## Perform Sequential Imports

In [3]:
success_files = []
fail_files = []

sf_df = db_s.query_data('SELECT Id,nanoHUB_user_ID__c, Email, Venue__c FROM Contact') 
print(sf_df)

[Success] Bulk job creation successful. Job ID = 7505w00000cyCkNAAU
{"id":"7505w00000cyCkNAAU","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2022-01-28T03:27:46.000+0000","systemModstamp":"2022-01-28T03:27:46.000+0000","state":"UploadComplete","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","retries":0,"totalProcessingTime":0}
{"id":"7505w00000cyCkNAAU","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2022-01-28T03:27:46.000+0000","systemModstamp":"2022-01-28T03:27:47.000+0000","state":"InProgress","concurrencyMode":"Parallel","contentType":"CSV","apiVersion":47.0,"jobType":"V2Query","lineEnding":"LF","columnDelimiter":"COMMA","numberRecordsProcessed":19608,"retries":0,"totalProcessingTime":1376}
{"id":"7505w00000cyCkNAAU","operation":"query","object":"Contact","createdById":"0055w00000DM5bOAAT","createdDate":"2022-01-28T0

In [4]:
for file in to_import_file_paths:
    idf = pd.DataFrame()
    log.info("Reading %s" % file)
    try:
        idf = map_file(file, logger('nanoHUB:google_imports'))

        ## remove leading and trailing spaces ## add str.strip spaces for all columns and rename
        log.debug(idf.columns)
        idf = strip_spaces_cols(idf)

        idf, has_name = split_names(idf)
        if not has_name:
            log.info('No name in column `Name` needing to be split found')

        #rename columns
        idf = rename_columns(idf)
        if has_name == True:
            idf = idf.drop(columns='Name')#['NAME','LAST NAME','FIRST NAME'])            
        idf = idf.dropna(subset=['Email'])

        idf = filter_bad_emails(idf, log)

        # salesforce queries for contact data
        # deciding the queries
        import_df_cols = deepcopy(idf.columns)
        nh_id_flag = False
        email_flag = False
        if 'nanoHUB_user_ID__c' in import_df_cols:
            nh_id_flag = True

        if 'Email' in import_df_cols:
            email_flag = True    
        
        merged_df = idf.merge(sf_df, how = 'inner', on = ['Email'])
        
        prefixed_df = merged_df.filter(regex=re.compile(prefix, re.IGNORECASE))
        prefixed_df.columns = prefixed_df.columns.str.replace(prefix, "", case = False)
        prefixed_df.columns = prefixed_df.columns.str.lstrip(' ')
        log.info("Number of custom prefixed columns: %d" % len(prefixed_df.columns))
        log.debug(prefixed_df.columns)
        
        merged_df = add_venue(merged_df)
        display(merged_df)
        
        merged_df = merged_df[['Email','Id', 'Venue__c']]
        merged_df = merged_df.merge(prefixed_df, how = 'inner', left_index=True, right_index=True)
        display(merged_df)
        log.info("Number of contacts: %d" % len(merged_df.index))
        log.debug(merged_df.head())
        log.debug(merged_df.tail())
        
        # db_s_c = deepcopy(db_s)
        # db_s_c.object_id = 'Contact'
        # db_s_c.external_id = 'Id'
        # db_s_c.send_data(merged_df)
        
        leads_df = idf[~idf['Email'].isin(sf_df['Email'])]
        log.info("Number of leads: %d" % len(leads_df))
        
    except Exception as e:
        log.error("Error with file: " + str(e))
        fail_files.append(file)
        

[INFO] [291349747 - nanoHUB:google_imports]: Reading /home/saxenap/nanoHUB/.cache/Salesforce_Imports/To_Import/nH_CustSatSurvey2022_import.csv [291349747.<module>:3]
[INFO] [file_handling - nanoHUB:google_imports]: File type .csv [file_handling.map_file:106]
[INFO] [api - charset_normalizer]: Detected a SIG or BOM mark on first 3 byte(s). Priority +1 given for utf_8. [api.from_bytes:159]
[INFO] [api - charset_normalizer]: Code page utf_8 is a multi byte encoding table and it appear that at least one character was encoded using n-bytes. [api.from_bytes:257]
[INFO] [api - charset_normalizer]: utf_8 passed initial chaos probing. Mean measured chaos is 0.000000 % [api.from_bytes:352]
[INFO] [api - charset_normalizer]: We detected language [('English', 1.0), ('Indonesian', 1.0), ('Simple English', 1.0)] using utf_8 [api.from_bytes:382]
[INFO] [api - charset_normalizer]: utf_8 is most likely the one. Stopping the process. [api.from_bytes:417]
[INFO] [file_handling - nanoHUB:google_imports]: 

Unnamed: 0,lastname,firstname,Email,Import as CS_Qualtrics2022__c,Import as Venue__c,Import as X2022_test__c,Id,Venue__c,nanoHUB_user_ID__c
0,Abd el-Jawad,Ashraf,1700265@eng.asu.edu.eg,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True,0035w00003HY407AAD,2022 Customer Satisfaction Survey recipient;te...,309933.0
1,Hodge,Keymoi,keymoihodge@gmail.com,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True,0035w00003FVt6OAAT,2022 Customer Satisfaction Survey recipient;te...,303943.0
2,Mummareddy,Bhargavi,mummareddybhargavi@gmail.com,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True,0035w00003WQgdOAAT,2022 Customer Satisfaction Survey recipient;te...,343104.0
3,Kumar,Kundan,152002011@smail.iitpkd.ac.in,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True,0035w00003RRx24AAD,2022 Customer Satisfaction Survey recipient;te...,330024.0
4,Khan,Hassaan,hassaankhan271@gmail.com,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True,0035w00003WQgdPAAT,2022 Customer Satisfaction Survey recipient;te...,343105.0
...,...,...,...,...,...,...,...,...,...
17013,Millicety,Millicety,lucastaorianaky@gmail.com,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True,0035w00003WQeu7AAD,2022 Customer Satisfaction Survey recipient;te...,343040.0
17014,Wan,Hao,s3655236@student.rmit.edu.au,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True,0035w00003WQeu8AAD,2022 Customer Satisfaction Survey recipient;te...,343041.0
17015,Emdin,Dave,demdin1@student.ccp.edu,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True,0035w00003WQeuAAAT,2022 Customer Satisfaction Survey recipient;te...,343043.0
17016,Adhikary,Pankaj,pankajadhikary16@gmail.com,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True,0035w00003FVjVDAA1,2022 Customer Satisfaction Survey recipient;te...,303734.0


Unnamed: 0,Email,Id,Venue__c_x,CS_Qualtrics2022__c,Venue__c_y,X2022_test__c
0,1700265@eng.asu.edu.eg,0035w00003HY407AAD,2022 Customer Satisfaction Survey recipient;te...,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True
1,keymoihodge@gmail.com,0035w00003FVt6OAAT,2022 Customer Satisfaction Survey recipient;te...,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True
2,mummareddybhargavi@gmail.com,0035w00003WQgdOAAT,2022 Customer Satisfaction Survey recipient;te...,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True
3,152002011@smail.iitpkd.ac.in,0035w00003RRx24AAD,2022 Customer Satisfaction Survey recipient;te...,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True
4,hassaankhan271@gmail.com,0035w00003WQgdPAAT,2022 Customer Satisfaction Survey recipient;te...,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True
...,...,...,...,...,...,...
17013,lucastaorianaky@gmail.com,0035w00003WQeu7AAD,2022 Customer Satisfaction Survey recipient;te...,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True
17014,s3655236@student.rmit.edu.au,0035w00003WQeu8AAD,2022 Customer Satisfaction Survey recipient;te...,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True
17015,demdin1@student.ccp.edu,0035w00003WQeuAAAT,2022 Customer Satisfaction Survey recipient;te...,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True
17016,pankajadhikary16@gmail.com,0035w00003FVjVDAA1,2022 Customer Satisfaction Survey recipient;te...,https://purdue.ca1.qualtrics.com/jfe/form/SV_3...,test_venue,True


[INFO] [291349747 - nanoHUB:google_imports]: Number of contacts: 17018 [291349747.<module>:48]
[INFO] [291349747 - nanoHUB:google_imports]: Number of leads: 1352 [291349747.<module>:58]
