### Configure Spark-GCS and Spark-BigQuery Connector

In [1]:
# Silence every Python-level warning for the rest of the kernel session so the
# notebook output stays readable. Note that the "ignore" filter is global: it
# also hides deprecation warnings raised by any library imported below.
import warnings
warnings.filterwarnings("ignore")

import os
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

In [2]:
# Path to the service-account JSON key, read from the standard Google
# credentials environment variable.
creds_location = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

# Fail fast: this value is handed to SparkConf and to the Hadoop configuration
# in later cells, where a missing (None) value is much harder to diagnose than
# an explicit error raised here.
if not creds_location:
    raise EnvironmentError(
        "GOOGLE_APPLICATION_CREDENTIALS is not set; export it with the path to "
        "the service-account JSON key file before running this notebook."
    )

In [3]:
# Build the Spark configuration: cluster master, application name, the GCS and
# BigQuery connector jars, and service-account authentication.
#
# The authentication keys carry the "spark.hadoop." prefix because Spark only
# copies properties with that prefix into the Hadoop configuration; without it
# the two google.cloud.auth.* entries are never seen by the GCS connector.
conf = (
    SparkConf()
    .setMaster("spark://vm-instance-nyc-taxi.asia-southeast1-a.c.de-project-nyc-taxi.internal:7077")
    .setAppName("gcs_to_bigquery")
    .set(
        "spark.jars",
        # Comma-separated list of connector jars (adjacent literals are joined).
        "/home/salacjamesrhode23/connectors/gcs-connector-hadoop3-2.2.5.jar,"
        "/home/salacjamesrhode23/connectors/spark-bigquery-with-dependencies_2.12-0.42.4.jar"
    )
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", creds_location)
)

In [4]:
# Start the SparkContext from the prepared configuration and grab the Hadoop
# configuration object that backs it.
sc = SparkContext(conf=conf)
hadoop_conf = sc._jsc.hadoopConfiguration()

# Register the GCS file-system implementations for the gs:// scheme and point
# the connector at the service-account key file.
gcs_hadoop_settings = {
    "fs.AbstractFileSystem.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
    "fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    "fs.gs.auth.service.account.json.keyfile": creds_location,
    "fs.gs.auth.service.account.enable": "true",
}
for setting_name, setting_value in gcs_hadoop_settings.items():
    hadoop_conf.set(setting_name, setting_value)



25/11/13 10:34:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Create (or reuse) the SparkSession, seeded with the configuration of the
# SparkContext that is already running.
session_builder = SparkSession.builder.config(conf=sc.getConf())
spark = session_builder.getOrCreate()

In [6]:
# Session-level staging bucket setting for the BigQuery connector.
# NOTE(review): every write in this notebook also passes its own
# "temporaryGcsBucket" option with a different bucket name — confirm which
# bucket is actually intended.
session_temp_bucket = "temporary_bucket_001"
spark.conf.set("temporaryGcsBucket", session_temp_bucket)

In [15]:
# Show the active SparkSession as the cell output (quick sanity check that the
# session was created).
spark

### Data Transformation and Ingestion Logic + Write to BigQuery in parallel

In [16]:
# Load the customers and products CSV exports from GCS. The first row is
# treated as the header and column types are inferred from the data.
df_customers = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("gs://ecomm_bucket001/output_files/from_api/customers.csv")
)
df_products = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("gs://ecomm_bucket001/output_files/from_api/products.csv")
)

In [17]:
# Preview the first 5 product rows to check the parsed columns.
df_products.show(5)

+--------------------+--------------------+--------------------+-----------+--------------------+----------+-------+
|           Image Src|    Product Category| Product Description|Product SKU|               Title|Unit Price| Vendor|
+--------------------+--------------------+--------------------+-----------+--------------------+----------+-------+
|https://cdn.shopi...|Building & Model ...|Build the power t...|     TOY500|Mega Construx Pok...|     33.55|POKEMON|
|https://cdn.shopi...|Building & Model ...|Create action-pac...|     TOY499|Mega Construx Pok...|     14.99|POKEMON|
|https://cdn.shopi...|Building & Model ...|This limited-edit...|     TOY498|Limited Edition S...|      40.0|POKEMON|
|https://cdn.shopi...|Building & Model ...|Pokemon trainers ...|     TOY497|Mega Construx Pok...|     23.98|POKEMON|
|https://cdn.shopi...|Building & Model ...|Create classic ba...|     TOY496|Mega Construx Pok...|     14.99|POKEMON|
+--------------------+--------------------+--------------------+-----------+--------------------+----------+-------+

In [18]:
# Append the customers and products frames to their BigQuery staging tables,
# one after the other, staging the data through the same GCS bucket.
staging_loads = (
    (df_customers, "de-project-nyc-taxi.ecomm_staging.raw_customers"),
    (df_products, "de-project-nyc-taxi.ecomm_staging.raw_products"),
)
for frame, target_table in staging_loads:
    (
        frame.write
        .format("bigquery")
        .option("table", target_table)
        .option("temporaryGcsBucket", "ecomm_bucket001")
        .mode("append")
        .save()
    )

                                                                                

In [None]:
# Deliberate stop for "Run All": the cells below append more data to BigQuery
# and are meant to be run by hand. A bare `break` outside a loop is a
# SyntaxError, so raise an explicit, self-explaining error instead.
raise RuntimeError(
    "Intentional stop: run the remaining cells manually to load the order files."
)

### Data Transformation and Ingestion Logic + Read multiple CSV and Parquet files and write them to BigQuery in parallel

In [19]:
# Load every order file from the three sources. The glob patterns make Spark
# read all matching files in each folder into a single DataFrame.
df_email_orders = spark.read.csv("gs://ecomm_bucket001/output_files/from_emails/*.csv", header=True, inferSchema=True)
df_database_orders = spark.read.csv("gs://ecomm_bucket001/output_files/from_database/*.csv", header=True, inferSchema=True)
# Parquet files carry their own schema, so the CSV-only `header` and
# `inferSchema` options do not apply here and are omitted.
df_faker_orders = spark.read.parquet("gs://ecomm_bucket001/output_files/from_faker/*.parquet")

                                                                                

In [None]:
# Preview the first 5 rows of the Parquet-sourced orders.
df_faker_orders.show(5)

In [20]:
# Append the email-sourced orders to the raw_orders_emails staging table.
email_orders_writer = df_email_orders.write.format("bigquery").options(
    table="de-project-nyc-taxi.ecomm_staging.raw_orders_emails",
    temporaryGcsBucket="ecomm_bucket001",
)
email_orders_writer.mode("append").save()

                                                                                

In [21]:
# Append the database-sourced orders to the raw_orders_postgres staging table.
database_orders_writer = df_database_orders.write.format("bigquery").options(
    table="de-project-nyc-taxi.ecomm_staging.raw_orders_postgres",
    temporaryGcsBucket="ecomm_bucket001",
)
database_orders_writer.mode("append").save()

                                                                                

In [22]:
# Append the Parquet-sourced orders to the raw_orders_faker staging table,
# after redistributing the data into 15 partitions.
faker_orders_repartitioned = df_faker_orders.repartition(15)
faker_orders_writer = faker_orders_repartitioned.write.format("bigquery").options(
    table="de-project-nyc-taxi.ecomm_staging.raw_orders_faker",
    temporaryGcsBucket="ecomm_bucket001",
)
faker_orders_writer.mode("append").save()

                                                                                