In [1]:
import os
# Spark release to install. The value must match a release directory name under
# http://www.apache.org/dist/spark/ because the wget URL below is built from it.
spark_version = 'spark-3.2.0'
# Export the version so the shell commands below can expand $SPARK_VERSION.
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
# (apt/wget/tar/pip run as shell commands; the Spark tarball is unpacked into the current directory)
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
# NOTE(review): JAVA_HOME presumably matches where the openjdk-8 package installs — confirm.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# NOTE(review): assumes the tarball above was extracted under /content (Colab working dir) — confirm.
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
# (findspark is initialised here; the session itself is created in a later cell)
import findspark

findspark.init()

0% [Working]            Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Conn                                                                               Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Wait                                                                               Hit:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com] [Conn                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 http://security.ubun

In [2]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2021-12-03 21:10:22--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar.10’


2021-12-03 21:10:23 (5.51 MB/s) - ‘postgresql-42.2.9.jar.10’ saved [914037/914037]



In [3]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session with the PostgreSQL JDBC jar on the driver classpath.
spark = (
    SparkSession.builder
    .appName("Outdoor_Product_Reviews")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

In [4]:
# Load the Outdoors review dump (tab-separated, with header row) from S3
from pyspark import SparkFiles

url="https://aws-reviews-dwc.s3.amazonaws.com/amazon_reviews_us_Outdoors_v1_00.tsv"

# Register the remote file with Spark, then read it from the path SparkFiles resolves.
spark.sparkContext.addFile(url)
outdoor_tsv_path = SparkFiles.get("amazon_reviews_us_Outdoors_v1_00.tsv")
outdoor_data_df = spark.read.csv(
    outdoor_tsv_path,
    sep="\t",
    header=True,
    inferSchema=True,
)

# Preview a few rows, then report the total row count.
outdoor_data_df.show(5)
outdoor_data_df.count()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   18446823|R35T75OLUGHL5C|B000NV6H94|     110804376|Stearns Youth Boa...|        Outdoors|          4|            0|          0|   N|                Y|          Four Stars|          GOOD VALUE| 2015-08-31|
|         US|   13724367|R2BV735O46BN33|B000IN0W3Y|     624096774|Primal Wear Men's...|        Outdoors|          5|    

2302401

In [5]:
# Remove every row that has a null in any column

dropna_df = outdoor_data_df.na.drop()

# Preview, then count the rows that remain.
dropna_df.show(5)
dropna_df.count()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   18446823|R35T75OLUGHL5C|B000NV6H94|     110804376|Stearns Youth Boa...|        Outdoors|          4|            0|          0|   N|                Y|          Four Stars|          GOOD VALUE| 2015-08-31|
|         US|   13724367|R2BV735O46BN33|B000IN0W3Y|     624096774|Primal Wear Men's...|        Outdoors|          5|    

2302174

In [6]:
# Load in a sql function to use columns
from pyspark.sql.functions import col

# Keep only the columns the downstream tables are built from
# (this is a column projection; no rows are filtered here).
kept_columns = [
    'customer_id',
    'review_id',
    'product_id',
    'product_parent',
    'product_title',
    'star_rating',
    'helpful_votes',
    'total_votes',
    'vine',
    'review_date',
]
cleaned_df = dropna_df.select(kept_columns)
cleaned_df.show(5)

+-----------+--------------+----------+--------------+--------------------+-----------+-------------+-----------+----+-----------+
|customer_id|     review_id|product_id|product_parent|       product_title|star_rating|helpful_votes|total_votes|vine|review_date|
+-----------+--------------+----------+--------------+--------------------+-----------+-------------+-----------+----+-----------+
|   18446823|R35T75OLUGHL5C|B000NV6H94|     110804376|Stearns Youth Boa...|          4|            0|          0|   N| 2015-08-31|
|   13724367|R2BV735O46BN33|B000IN0W3Y|     624096774|Primal Wear Men's...|          5|            0|          0|   N| 2015-08-31|
|   51001958|R2NBEUGPQQGXP1|B008RBJXFM|     278970944|Osprey Hydraulics...|          4|            0|          0|   N| 2015-08-31|
|   32866903|R17LLAOJ8ITK0S|B00FK8WUQY|     312877650|CamelBak eddy .75...|          3|            1|          1|   N| 2015-08-31|
|   30907790|R39PEQBT5ISEF4|B00EZA3VW0|     305567912|Children Black Re...|        

In [7]:
# Number of reviews per customer (one output row per customer_id)

reviews_by_customer = cleaned_df.groupBy('customer_id')
count_df = reviews_by_customer.count()

# Preview, then report how many distinct customers there are.
count_df.show(5)
count_df.count()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|   43679767|    1|
|   32024654|    1|
|   52913169|    1|
|   24297214|    1|
|   26096454|    1|
+-----------+-----+
only showing top 5 rows



1516273

In [8]:
# Attach each customer's review count to every one of their reviews

joined_df = count_df.join(cleaned_df, 'customer_id', 'inner')

# Preview, then confirm the row count after the join.
joined_df.show(5)
joined_df.count()

+-----------+-----+--------------+----------+--------------+--------------------+-----------+-------------+-----------+----+-----------+
|customer_id|count|     review_id|product_id|product_parent|       product_title|star_rating|helpful_votes|total_votes|vine|review_date|
+-----------+-----+--------------+----------+--------------+--------------------+-----------+-------------+-----------+----+-----------+
|      10015|    1| R5UXI05724HJW|B00C2MHNJK|     237930913|SecurityIng Water...|          4|            2|          3|   N| 2013-11-21|
|      10124|    1|R2ZWSOB8LVE9QL|B003V4OHEC|     273193187|McNett Tactical U...|          4|            0|          0|   N| 2013-08-03|
|      10227|    1| RBCTEJGE7Q7RF|B001Q70A0G|     218826182|LEDwholesalers 51...|          5|            0|          0|   N| 2013-07-09|
|      10293|    1| RGI07F2LZ59K0|B00HK9DQPK|     807801717|Hot Popular Women...|          5|            0|          0|   N| 2014-12-10|
|      10348|    1|R28ATIU912MC9S|B00P5ZX

2302174

In [9]:
# Import only what is used: a wildcard import from pyspark.sql.functions
# shadows Python builtins such as sum, max, min, round and abs.
from pyspark.sql.functions import to_date

# Build a (review_id, review_date) frame with review_date converted by to_date.
# Aliasing the converted column directly gives the same two columns as selecting
# the original, dropping it, and renaming the converted one.
dt_df = joined_df.select('review_id', to_date('review_date').alias('review_date'))
dt_df.show(5)

# Last expression of the cell: displays the column names and types.
# (The row count is not computed here: only the last expression is displayed,
# so a mid-cell count() would run a full Spark job and discard the result.)
dt_df.dtypes

+--------------+-----------+
|     review_id|review_date|
+--------------+-----------+
| R5UXI05724HJW| 2013-11-21|
|R2PEH4J9FWDFGX| 2013-09-22|
|R2ZWSOB8LVE9QL| 2013-08-03|
|R3UOV0UDMJVBXA| 2015-03-01|
|R2LQ5ZH5TV67ZC| 2015-08-25|
+--------------+-----------+
only showing top 5 rows



[('review_id', 'string'), ('review_date', 'date')]

In [10]:
# Row count of the date-converted frame. dt_df is a column projection of
# joined_df, so this is expected to match joined_df's row count.
dt_df.count()

2302174

In [11]:
# drop string date column
# NOTE(review): this rebinds joined_df without 'review_date'. Any cell that reads
# 'review_date' from joined_df (the to_date conversion that builds dt_df) must run
# before this one; re-running it afterwards will not find the column.

joined_df = joined_df.drop('review_date')
# Show a single row to confirm the column is gone.
joined_df.show(1)

+-----------+-----+-------------+----------+--------------+--------------------+-----------+-------------+-----------+----+
|customer_id|count|    review_id|product_id|product_parent|       product_title|star_rating|helpful_votes|total_votes|vine|
+-----------+-----+-------------+----------+--------------+--------------------+-----------+-------------+-----------+----+
|      10015|    1|R5UXI05724HJW|B00C2MHNJK|     237930913|SecurityIng Water...|          4|            2|          3|   N|
+-----------+-----+-------------+----------+--------------+--------------------+-----------+-------------+-----------+----+
only showing top 1 row



In [12]:
# Re-attach the converted review_date to the review rows, matching on review_id

joined_dt_df = dt_df.join(joined_df, 'review_id', 'inner')

# Preview, then confirm the row count after the join.
joined_dt_df.show(5)
joined_dt_df.count()

+--------------+-----------+-----------+-----+----------+--------------+--------------------+-----------+-------------+-----------+----+
|     review_id|review_date|customer_id|count|product_id|product_parent|       product_title|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-----------+-----+----------+--------------+--------------------+-----------+-------------+-----------+----+
|R10000R3GJ9K4W| 2014-11-23|   33191302|    1|B000QSXMME|     731839458|Planet Bike Blink...|          4|            0|          0|   N|
|R1000EAOW8MDYK| 2015-04-28|   37876758|    1|B00FAW4O0A|     863230011|SoundAsleep Dream...|          5|            0|          0|   N|
|R1000MO731G969| 2015-01-01|   37664296|    3|B000FK7C2E|     907264026|Razor A Kick Scooter|          4|            0|          0|   N|
|R1001AUTMLT6LQ| 2014-10-20|   19545598|   11|B00CYTNWLE|     605045985|Dakine Kainui Coi...|          5|            0|          0|   N|
|R1001RHSXTMO9I| 2014-08-28|   25177100| 

2302174

In [13]:
# Keep a single row per review_id

joined_dt_df = joined_dt_df.drop_duplicates(subset=['review_id'])

# Preview, then report how many rows remain.
joined_dt_df.show(5)
joined_dt_df.count()

+--------------+-----------+-----------+-----+----------+--------------+--------------------+-----------+-------------+-----------+----+
|     review_id|review_date|customer_id|count|product_id|product_parent|       product_title|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-----------+-----+----------+--------------+--------------------+-----------+-------------+-----------+----+
|R10000R3GJ9K4W| 2014-11-23|   33191302|    1|B000QSXMME|     731839458|Planet Bike Blink...|          4|            0|          0|   N|
|R1000EAOW8MDYK| 2015-04-28|   37876758|    1|B00FAW4O0A|     863230011|SoundAsleep Dream...|          5|            0|          0|   N|
|R1000MO731G969| 2015-01-01|   37664296|    3|B000FK7C2E|     907264026|Razor A Kick Scooter|          4|            0|          0|   N|
|R1001AUTMLT6LQ| 2014-10-20|   19545598|   11|B00CYTNWLE|     605045985|Dakine Kainui Coi...|          5|            0|          0|   N|
|R1001RHSXTMO9I| 2014-08-28|   25177100| 

2302174

In [14]:
# Row count of the de-duplicated, date-converted review frame.
joined_dt_df.count()

2302174

In [15]:
# review id table: the review key plus its customer, product and date columns

review_id_columns = [
    'review_id',
    'customer_id',
    'product_id',
    'product_parent',
    'review_date',
]
review_id_df = joined_dt_df.select(review_id_columns)
review_id_df.show(5)

+--------------+-----------+----------+--------------+-----------+
|     review_id|customer_id|product_id|product_parent|review_date|
+--------------+-----------+----------+--------------+-----------+
|R10000R3GJ9K4W|   33191302|B000QSXMME|     731839458| 2014-11-23|
|R1000EAOW8MDYK|   37876758|B00FAW4O0A|     863230011| 2015-04-28|
|R1000L9ACY9T7U|   29734993|B00OCEBZ86|     515483530| 2015-04-17|
|R1000MO731G969|   37664296|B000FK7C2E|     907264026| 2015-01-01|
|R1001AUTMLT6LQ|   19545598|B00CYTNWLE|     605045985| 2014-10-20|
+--------------+-----------+----------+--------------+-----------+
only showing top 5 rows



In [24]:
# products table: one row per product_id, keeping only the id and title

products_df = (
    joined_dt_df
    .dropDuplicates(['product_id'])
    .select(['product_id', 'product_title'])
)

# Preview, then report the number of distinct products.
products_df.show(5)
products_df.count()

+----------+--------------------+
|product_id|       product_title|
+----------+--------------------+
|0316287229|Bike Pedals for K...|
|059445039X|Blackwatch Plaid ...|
|0743609972|Maptech ChartKit®...|
|078341384X|Three Sisters Wil...|
|097794350X|Voyageur Map Numb...|
+----------+--------------------+
only showing top 5 rows



391700

In [30]:
# customers table: one row per customer_id with that customer's review count

# Rename the groupBy 'count' column, then keep a single row per customer.
# (No count() is run on cust_df here: only the last expression of a cell is
# displayed, so a mid-cell count() would run a full Spark job and discard the result.)
cust_df = (
    joined_dt_df
    .withColumnRenamed('count', 'customer_count')
    .dropDuplicates(['customer_id'])
)
customers_df = cust_df.select(['customer_id', 'customer_count'])
customers_df.show(5)

+-----------+--------------+
|customer_id|customer_count|
+-----------+--------------+
|   50429951|             1|
|    8382766|             1|
|   20368791|             1|
|   46350505|             1|
|   15000890|             2|
+-----------+--------------+
only showing top 5 rows



In [37]:
# vine table: rating and vote columns for reviews flagged vine == "Y"

vine_columns = ['review_id', 'star_rating', 'helpful_votes', 'total_votes', 'vine']
vine_df = (
    joined_dt_df
    .where('vine == "Y"')
    .select(vine_columns)
)
vine_df.show(5)

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R1017MH588Q8HW|          5|            0|          2|   Y|
|R102Y50B81GJV2|          5|           10|         11|   Y|
|R106EPOYHY5FBL|          3|            0|          1|   Y|
|R10CAGCBCBAYGA|          4|            0|          0|   Y|
|R10CMYF15NQZXU|          4|            2|          3|   Y|
+--------------+-----------+-------------+-----------+----+
only showing top 5 rows



Postgres Setup

In [41]:
# Configure settings for RDS
# upload config.py with db function containing db_pass variable to colab
# The module is imported under an alias so the `config` dict defined below does
# not rebind the module's name; `config` then always means the JDBC properties.
import config as db_config

# Save mode handed to the JDBC writer: "append" adds rows to the existing table.
mode = "append"
jdbc_url="jdbc:postgresql://mypostgresdb.cva7mvzdbfbz.us-east-2.rds.amazonaws.com:5432/aws_reviews"
# JDBC connection properties; the password is obtained from config.py's db()
# so it is not hard-coded in the notebook.
config = {"user":"root", 
          "password": db_config.db(), 
          "driver":"org.postgresql.Driver"}

In [20]:
# Write DataFrame to review_id_table in RDS
# NOTE(review): mode is set elsewhere in the notebook; if it is "append",
# re-running this cell presumably inserts the same rows again — confirm.

review_id_writer = review_id_df.write.mode(mode)
review_id_writer.jdbc(url=jdbc_url, table='review_id_table', properties=config)

In [25]:
# Write DataFrame to products table in RDS
# NOTE(review): mode is set elsewhere in the notebook; if it is "append",
# re-running this cell presumably inserts the same rows again — confirm.

products_writer = products_df.write.mode(mode)
products_writer.jdbc(url=jdbc_url, table='products', properties=config)

In [31]:
# Write DataFrame to customers table in RDS
# NOTE(review): mode is set elsewhere in the notebook; if it is "append",
# re-running this cell presumably inserts the same rows again — confirm.

customers_writer = customers_df.write.mode(mode)
customers_writer.jdbc(url=jdbc_url, table='customers', properties=config)

In [43]:
# Write DataFrame to vine_table in RDS
# NOTE(review): mode is set elsewhere in the notebook; if it is "append",
# re-running this cell presumably inserts the same rows again — confirm.

vine_writer = vine_df.write.mode(mode)
vine_writer.jdbc(url=jdbc_url, table='vine_table', properties=config)