In [1]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.6/spark-2.4.6-bin-hadoop2.7.tgz
!tar xf spark-2.4.6-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.6-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# The JDBC writes later in this notebook name "org.postgresql.Driver", which is
# not bundled with Spark. Have Spark resolve the PostgreSQL JDBC driver from
# Maven Central when the session starts. This setting only takes effect when no
# Spark session/JVM is already running in the kernel.
spark = (
    SparkSession.builder
    .appName("AmazonReviewsAnalysis1")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.16")
    .getOrCreate()
)

In [3]:
from pyspark import SparkFiles

# Register the gzipped Video Games review TSV on S3 with the SparkContext,
# then read the fetched copy as a tab-separated file whose first row is the header.
url = "https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Video_Games_v1_00.tsv.gz"
spark.sparkContext.addFile(url)

video_games_df = (
    spark.read
    .option("sep", "\t")
    .option("header", True)
    .csv(SparkFiles.get("amazon_reviews_us_Video_Games_v1_00.tsv.gz"))
)
video_games_df.show()

+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|marketplace|customer_id|     review_id|product_id|product_parent|       product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|     review_headline|         review_body|review_date|
+-----------+-----------+--------------+----------+--------------+--------------------+----------------+-----------+-------------+-----------+----+-----------------+--------------------+--------------------+-----------+
|         US|   12039526| RTIS3L2M1F5SM|B001CXYMFS|     737716809|Thrustmaster T-Fl...|     Video Games|          5|            0|          0|   N|                Y|an amazing joysti...|Used this for Eli...| 2015-08-31|
|         US|    9636577| R1ZV7R40OLHKD|B00M920ND6|     569686175|Tonsee 6 buttons ...|     Video Games|          5|    

In [4]:
# Number of records in the data set as loaded (the cleaning step comes after this cell).
video_games_df.count()

1785997

In [5]:
# Clean the reviews: discard rows containing a null, then discard duplicate rows.
video_games_df = video_games_df.dropna().dropDuplicates()

In [6]:
# Keep only the columns that make up the review_id_table schema.
review_id_columns = (
    "review_id",
    "customer_id",
    "product_id",
    "product_parent",
    "review_date",
)
review_id_df = video_games_df.select(*review_id_columns)
review_id_df.show()

+--------------+-----------+----------+--------------+-----------+
|     review_id|customer_id|product_id|product_parent|review_date|
+--------------+-----------+----------+--------------+-----------+
|R1004PYTPK6ELD|   38281029|B00004WHWF|      89143877| 2002-01-23|
|R100EZDMO39LBZ|    1386752|B00OZBFUBY|     872540442| 2015-06-10|
|R1011I65X7RSKT|   41907229|B00006ISBU|     654790631| 2003-12-23|
|R101V84BKDOR1I|   12034223|B001KX5042|     279727821| 2014-08-08|
|R101VJUP2TFB3Y|   31532612|B009DL2TBA|     586138868| 2015-01-05|
|R1026A0A5F4D42|   30645349|B00FEKQZE6|     681125586| 2014-04-21|
|R102X78AGOZJY3|   20191662|B000ZK9QCS|     699333646| 2010-03-26|
|R104THW8CUUZXC|   20910843|B00DHF39EO|     100570084| 2015-06-15|
|R105YSA8EGSJ8D|    2435449|B0053B5RGI|     413365293| 2015-02-26|
|R106QTPT8YYZ1P|    4159081|B000TGB4UA|      17513844| 2015-04-22|
|R106UDOJ2QSWOR|   11343413|B00KVP78FE|     177244653| 2014-11-14|
|R109G94NZM2I7R|   22043357|B004XACA60|     432872195| 2012-03

In [7]:
# Inspect the column types of review_id_df before any casting is applied.
review_id_df.dtypes

[('review_id', 'string'),
 ('customer_id', 'string'),
 ('product_id', 'string'),
 ('product_parent', 'string'),
 ('review_date', 'string')]

In [8]:
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DateType

# Cast the id columns to integers and the review date to a date, one column at
# a time, replacing each string column with its typed version.
for column_name, target_type in (
    ("customer_id", IntegerType()),
    ("product_parent", IntegerType()),
    ("review_date", DateType()),
):
    review_id_df = review_id_df.withColumn(
        column_name, review_id_df[column_name].cast(target_type)
    )
review_id_df.dtypes

[('review_id', 'string'),
 ('customer_id', 'int'),
 ('product_id', 'string'),
 ('product_parent', 'int'),
 ('review_date', 'date')]

In [9]:
# Keep only the columns that make up the products schema.
product_columns = ("product_id", "product_title")
products_df = video_games_df.select(*product_columns)
products_df.show()

+----------+--------------------+
|product_id|       product_title|
+----------+--------------------+
|B00004WHWF|         Red Faction|
|B00OZBFUBY|USPRO&reg; PS3 Bl...|
|B00006ISBU|       NBA Live 2003|
|B001KX5042|Imagine:  Cheerle...|
|B009DL2TBA|PlayStation 3 500...|
|B00FEKQZE6|Skylanders SWAP F...|
|B000ZK9QCS|      God of War III|
|B00DHF39EO|The Elder Scrolls...|
|B0053B5RGI|           Pokemon X|
|B000TGB4UA|Tony Hawk's Provi...|
|B00KVP78FE|Sony PlayStation ...|
|B004XACA60|Battlefield 3 - L...|
|B000059TC8|PlayStation 2 DVD...|
|B00NNU07RU|    Xbox 360 Console|
|B0056BE0ZY|Jillian Michaels ...|
|B00BMFIXZG|  Assassin's Creed 4|
|B000MD3HQ4|Playstation 2 Mag...|
|B0016B28Y8|Call of Duty 4: M...|
|B0053BQU4G|             WWE '12|
|B00JUFSH9M|       Madden NFL 15|
+----------+--------------------+
only showing top 20 rows



In [10]:
# Inspect the column types of products_df.
products_df.dtypes

[('product_id', 'string'), ('product_title', 'string')]

In [11]:
# Collapse repeated (product_id, product_title) rows into a single row each.
# NOTE(review): rows that share a product_id but differ in product_title would
# both survive this step — confirm whether the target table requires unique product_id.
products_df = products_df.dropDuplicates()

In [12]:
# Build the customers frame: one row per customer_id with the number of
# review rows recorded for that customer.
reviews_by_customer = video_games_df.groupBy("customer_id")
customers_df = reviews_by_customer.count()
customers_df.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|    5401088|    5|
|   37964777|   16|
|   20872523|    1|
|   44777937|    4|
|    2384511|    1|
|   41500251|    1|
|   34029693|    1|
|   40005173|    2|
|   12125016|    2|
|   21226328|    3|
|    6004768|    2|
|   14445274|    4|
|   41665035|    1|
|   16995171|    1|
|   25044130|    1|
|   11073960|    2|
|   38642776|    8|
|    6199838|    6|
|   19207466|    3|
|    1760318|    1|
+-----------+-----+
only showing top 20 rows



In [13]:
# Rename the aggregate column produced by count() to customer_count.
customers_df = customers_df.withColumnRenamed("count", "customer_count")
customers_df.show()

+-----------+--------------+
|customer_id|customer_count|
+-----------+--------------+
|    5401088|             5|
|   37964777|            16|
|   20872523|             1|
|   44777937|             4|
|    2384511|             1|
|   41500251|             1|
|   34029693|             1|
|   40005173|             2|
|   12125016|             2|
|   21226328|             3|
|    6004768|             2|
|   14445274|             4|
|   41665035|             1|
|   16995171|             1|
|   25044130|             1|
|   11073960|             2|
|   38642776|             8|
|    6199838|             6|
|   19207466|             3|
|    1760318|             1|
+-----------+--------------+
only showing top 20 rows



In [14]:
# Inspect the column types of customers_df before casting.
customers_df.dtypes

[('customer_id', 'string'), ('customer_count', 'bigint')]

In [15]:
# Cast both customers columns to integers, one column at a time.
for column_name in ("customer_id", "customer_count"):
    customers_df = customers_df.withColumn(
        column_name, customers_df[column_name].cast(IntegerType())
    )
customers_df.dtypes

[('customer_id', 'int'), ('customer_count', 'int')]

In [16]:
# Keep only the columns that make up the vine table schema.
vine_columns = (
    "review_id",
    "star_rating",
    "helpful_votes",
    "total_votes",
    "vine",
)
vine_df = video_games_df.select(*vine_columns)
vine_df.show()

+--------------+-----------+-------------+-----------+----+
|     review_id|star_rating|helpful_votes|total_votes|vine|
+--------------+-----------+-------------+-----------+----+
|R1004PYTPK6ELD|          5|            0|          0|   N|
|R100EZDMO39LBZ|          4|            0|          0|   N|
|R1011I65X7RSKT|          5|            2|          2|   N|
|R101V84BKDOR1I|          5|            0|          0|   N|
|R101VJUP2TFB3Y|          5|            0|          0|   N|
|R1026A0A5F4D42|          5|            5|          6|   N|
|R102X78AGOZJY3|          5|            0|          0|   N|
|R104THW8CUUZXC|          1|            2|          6|   N|
|R105YSA8EGSJ8D|          4|            0|          0|   N|
|R106QTPT8YYZ1P|          5|            1|          1|   N|
|R106UDOJ2QSWOR|          5|            0|          0|   N|
|R109G94NZM2I7R|          5|            0|          1|   N|
|R10A0VLQRM6JIA|          3|            3|          3|   N|
|R10ADC7LCXO3V2|          1|            

In [17]:
# Inspect the column types of vine_df before casting.
vine_df.dtypes

[('review_id', 'string'),
 ('star_rating', 'string'),
 ('helpful_votes', 'string'),
 ('total_votes', 'string'),
 ('vine', 'string')]

In [19]:
# Cast the rating and vote-count columns to integers, one column at a time;
# review_id and vine stay as strings.
for column_name in ("star_rating", "helpful_votes", "total_votes"):
    vine_df = vine_df.withColumn(
        column_name, vine_df[column_name].cast(IntegerType())
    )
vine_df.dtypes

[('review_id', 'string'),
 ('star_rating', 'int'),
 ('helpful_votes', 'int'),
 ('total_votes', 'int'),
 ('vine', 'string')]

In [21]:
# Configure settings for RDS.
# Credentials are taken from the environment (RDS_USER / RDS_PASSWORD), with an
# interactive prompt as the fallback for the password, so that no secret is
# stored in the notebook itself.
# NOTE(review): a password was previously committed in this cell; treat it as
# compromised and rotate it.
import os
from getpass import getpass

mode = "append"
jdbc_url = "jdbc:postgresql://database-1.cskyuknpbcn2.us-east-2.rds.amazonaws.com:5432/amazon_reviews_db1"
config = {
    "user": os.environ.get("RDS_USER", "root"),
    # `or` also covers an empty RDS_PASSWORD value, not just a missing one.
    "password": os.environ.get("RDS_PASSWORD") or getpass("RDS password: "),
    "driver": "org.postgresql.Driver",
}

In [22]:
# Send review_id_df to the review_id_table table in RDS over JDBC,
# using the save mode and connection properties configured earlier.
review_id_writer = review_id_df.write.mode(mode)
review_id_writer.jdbc(jdbc_url, 'review_id_table', properties=config)

Py4JJavaError: ignored

In [None]:
# Send products_df to the products table in RDS over JDBC,
# using the save mode and connection properties configured earlier.
products_writer = products_df.write.mode(mode)
products_writer.jdbc(jdbc_url, 'products', properties=config)

Py4JJavaError: ignored

In [None]:
# Send customers_df to the customers table in RDS over JDBC,
# using the save mode and connection properties configured earlier.
customers_writer = customers_df.write.mode(mode)
customers_writer.jdbc(jdbc_url, 'customers', properties=config)

Py4JJavaError: ignored

In [23]:
# Send vine_df to the vine_table table in RDS over JDBC,
# using the save mode and connection properties configured earlier.
vine_writer = vine_df.write.mode(mode)
vine_writer.jdbc(jdbc_url, 'vine_table', properties=config)

Py4JJavaError: ignored