In [1]:
import os
import glob
import time
import json
import concurrent.futures
import subprocess

In [2]:
input_folder = r"./Data/"
file_type = ".tsv.gz"

NUM_WORKERS = int(os.environ['NUMBER_OF_PROCESSORS'])
print("There are {} number of processors".format(NUM_WORKERS))

files = glob.glob(input_folder+"*.tsv.gz")
print("Total Number of files to upload: {}".format(len(files)))

There are 8 number of processors
Total Number of files to upload: 4


In [3]:
def load_configuration(file):
    try:
        conf = open(file).read()
        conf = json.loads(conf)
        values = list(conf.values())
        if None in values:
            print("Please confirm all fields are mentioned in the credential files! Try again!")
        return conf
    except:
        print("Errro Occurred! Please check if file is available!")

In [4]:
conf = load_configuration("./Configuration/conf.json")
project_id = conf["project_id"]
project_name = conf["project_name"]
dataset_name = conf["dataset_name"]
table_name = conf["table_name"]

In [5]:
def uploadFileToBigQueryTable(filename):
    path, file = os.path.split(filename)
    print("Uploading file {} on big query \n".format(file))
    command = "bq --location=US load --null_marker=NULL --skip_leading_rows 1 --quote \"\" "
    command += "-E UTF-8 --source_format=CSV --autodetect --field_delimiter \\t {}.{} {}".format(dataset_name, table_name, filename)
    
    setup_command = "gcloud config set project {}".format(project_id)
    result = subprocess.check_output(setup_command, shell=True)
    result = subprocess.check_output(command, shell=True)
    
    return result

In [6]:
start = time.time()

command = " gcloud auth login"
result = os.system(command)

if int(result) == 0:
    
    if NUM_WORKERS > len(files):
        NUM_WORKERS = len(files)
    
    print("Using {} workers to upload files for parallel processing".format(NUM_WORKERS))
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        futures = {executor.submit(uploadFileToBigQueryTable, file) for file in files}
        concurrent.futures.wait(futures)

    executor.shutdown()
    end = time.time()
    print("Total Time for uploading of files is {} secs!".format(end-start))

else:
    print("Please login to your GCP Account!")

Using 4 workers to upload files for parallel processing
Uploading file amazon_reviews_multilingual_FR_v1_00.tsv.gz on big query 
Uploading file amazon_reviews_multilingual_JP_v1_00.tsv.gz on big query 


Uploading file amazon_reviews_us_Gift_Card_v1_00.tsv.gz on big query 

Uploading file amazon_reviews_us_Personal_Care_Appliances_v1_00.tsv.gz on big query 

Total Time for uploading of files is 111.0043694972992 secs!
