In [None]:
import os
import json
# Point PySpark at the local JDK install. A raw string keeps the Windows
# backslashes literal instead of relying on unrecognised escape sequences
# (\P, \J, \j), which trigger a SyntaxWarning on Python 3.12+.
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-1.8"
# Python executable PySpark should launch.
os.environ['PYSPARK_PYTHON'] = 'python'


In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext


In [None]:
# Spark configuration: "local" master, and every jar under C:/pyspark/ is put
# on the driver classpath (presumably where the PostgreSQL JDBC driver jar
# lives -- TODO confirm).
conf = (
    SparkConf()
    .setAppName("Spark Sample - ETL")
    .setMaster("local")
    .set("spark.driver.extraClassPath", "C:/pyspark/*")
)

# getOrCreate() reuses an already-running context when this cell is re-run.
sc = SparkContext.getOrCreate(conf=conf)
# SparkSession.builder is the documented way to obtain a session; it attaches
# to the context created above instead of invoking the constructor directly.
spark = SparkSession.builder.getOrCreate()

### Initialize Data Extraction

In [None]:
# Load the generated sales CSV; the first row supplies the column names.
df = (
    spark.read
    .option("delimiter", ",")
    .option("header", True)
    .csv("./sales_generated.csv")
)

In [None]:
# Register the DataFrame as a temporary view so the following cells can query
# it by name ("national_sales") through spark.sql(); re-running this cell
# replaces the existing view.
df.createOrReplaceTempView("national_sales")

In [None]:
""" Extracting individual date """

# Distinct calendar dates present in the sales data, oldest first.
# The column is aliased explicitly because a later cell reads it back from
# JSON by the key 'date'; without the alias that key depends on Spark's
# auto-generated column name for the cast.
filtered_date = spark.sql(
    "SELECT DISTINCT DATE(date) AS date FROM national_sales ORDER BY 1 ASC"
)
filtered_date.show()

In [None]:
# Bring the distinct dates back to the driver as a list of JSON strings,
# one string per row of filtered_date.
evaluate_json_date = filtered_date.toJSON().collect()

In [None]:
""" Iterate Transformation by each date and Load the transformed data into destionation """

# DATABASE TARGET CONFIG
# Connection settings are read from environment variables when present, so
# real credentials do not have to be stored in the notebook; the fallbacks are
# the original local-development values.
dest_tbl = 'public."etl_sales_revenue_daily"'
database = os.environ.get("ETL_PG_DATABASE", "postgres")
password = os.environ.get("ETL_PG_PASSWORD", "password")
user = os.environ.get("ETL_PG_USER", "postgres")

# Number of leading dates processed per run.
MAX_DATES = 5

# toJSON() omits null fields by default, so a row whose date is NULL has no
# 'date' key; such rows are dropped here instead of raising KeyError.
dates_to_load = [
    parsed["date"]
    for parsed in map(json.loads, evaluate_json_date)
    if parsed.get("date") is not None
]

for current_date in dates_to_load[:MAX_DATES]:
    # Filter with the same DATE(date) expression that produced the list of
    # dates and that is used for grouping, so each row lands in exactly the
    # day it is aggregated under (a regex match on the raw string can
    # disagree with the cast).
    result_by_date = spark.sql(f"""
    SELECT
        DATE(date) as date,
        name,
        SUM(number_of_sales) as number_of_sales,
        SUM(number_of_sales * pricing_unit) as revenue
    FROM national_sales
    WHERE DATE(date) = DATE('{current_date}')
    GROUP BY DATE(date), name
    """)
    # NOTE: mode("append") means re-running this cell inserts the same rows
    # into the destination table again.
    (
        result_by_date.write.mode("append")
        .format("jdbc")
        .option("url", f"jdbc:postgresql://localhost:5432/{database}")
        .option("dbtable", dest_tbl)
        .option("user", user)
        .option("password", password)
        .option("driver", "org.postgresql.Driver")
        .save()
    )