In [None]:
# for general use
import os

# define spark parameters (version info can be found at http://www.apache.org/dist/spark/)
spark_version = 'spark-3.2.2'
os.environ['SPARK_VERSION'] = spark_version

# install spark and java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# set the environment variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# initialize spark
import findspark
findspark.init()

In [None]:
# install jdbc
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

In [None]:
# create (or reuse) the Spark session, with the PostgreSQL JDBC jar placed on
# the driver classpath so the JDBC writes further down can load the driver
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("CloudETL")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

In [None]:
# download the Home Improvement review file from S3 and load it as a
# tab-separated file whose first line holds the column names
from pyspark import SparkFiles

url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Home_Improvement_v1_00.tsv.gz"
spark.sparkContext.addFile(url)

# SparkFiles.get resolves the local path of the file registered above
df = spark.read.csv(
    SparkFiles.get("amazon_reviews_us_Home_Improvement_v1_00.tsv.gz"),
    sep="\t",
    header=True,
)

# show the dataframe
df.show()

In [None]:
# size of the raw dataframe, before any table is derived from it
row_count = df.count()
print(f"There are {row_count} rows in the raw dataframe.")
# output recorded from an earlier run:
# There are 2634781 rows in the raw dataframe.

In [None]:
# review-level table: the identifiers attached to each review plus its date
review_id_table_df = df.select(
    "review_id",
    "customer_id",
    "product_id",
    "product_parent",
    "review_date",
)
review_id_table_df.show()

In [None]:
# product lookup table: one row per product_id.
# The raw dataframe has one row per review, so a product shows up once for
# every review it received; dropDuplicates collapses those to a single row.
# NOTE(review): if the same product_id occurs with different titles, which
# title is kept is not specified -- confirm that is acceptable.
products_df = df.select("product_id", "product_title").dropDuplicates(["product_id"])
products_df.show()

In [None]:
# customer table: one row per customer_id with the number of rows (reviews)
# that customer has in the raw dataframe, stored as customer_count
customers_df = (
    df.groupBy("customer_id")
    .count()
    .withColumnRenamed("count", "customer_count")
)
customers_df.show()

In [None]:
# vine table: rating, vote counts and the vine flag for each review
vine_table_df = df.select(
    "review_id",
    "star_rating",
    "helpful_votes",
    "total_votes",
    "vine",
)
vine_table_df.show()

In [None]:
# RDS connection settings.
# The password is never stored in the notebook: it is read from the
# RDS_PASSWORD environment variable or, when that is unset or empty, requested
# interactively without echoing. The other values keep their previous defaults
# and can be overridden through RDS_ENDPOINT / RDS_DBNAME / RDS_USERNAME /
# RDS_PORT.
# NOTE(review): the password that used to be committed in this cell should be
# treated as leaked and rotated.
import os
from getpass import getpass

rds_endpoint = os.environ.get("RDS_ENDPOINT", "big-data-challenge-db.coojj4dzvalz.us-east-2.rds.amazonaws.com")
rds_password = os.environ.get("RDS_PASSWORD") or getpass("RDS password: ")
rds_dbname = os.environ.get("RDS_DBNAME", "reviews_db")
rds_username = os.environ.get("RDS_USERNAME", "sneubauer")
# kept as an int, as before
rds_port = int(os.environ.get("RDS_PORT", "5432"))

In [None]:
# JDBC connection string for the PostgreSQL database on RDS
jdbc_url = f"jdbc:postgresql://{rds_endpoint}:{rds_port}/{rds_dbname}"

# connection properties handed to the JDBC writer: credentials and driver class
config = dict(
    user=f"{rds_username}",
    password=f"{rds_password}",
    driver="org.postgresql.Driver",
)

# save mode used for every table write; "append" is the alternative
my_mode = "overwrite"

In [None]:
# write each dataframe to its PostgreSQL table over JDBC, one after another,
# all with the same url, save mode and connection properties
tables_to_write = [
    ("review_id_table", review_id_table_df),
    ("products", products_df),
    ("customers", customers_df),
    ("vine_table", vine_table_df),
]
for table_name, table_df in tables_to_write:
    table_df.write.jdbc(url=jdbc_url, table=table_name, mode=my_mode, properties=config)