In [2]:
import pandas as pd
import io

import os
# Find the latest version of spark 2.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
spark_version = 'spark-3.0.2'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu bionic-security InRelease
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:12 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:14 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:15 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic 

In [3]:
# Download the Postgres driver that will allow Spark to interact with Postgres.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2021-05-02 09:20:27--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2021-05-02 09:20:29 (1.21 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [4]:
from pyspark.sql import SparkSession

# Build (or reuse) the session with the downloaded Postgres JDBC jar on the
# driver classpath, which the JDBC reads and writes below rely on.
spark = (
    SparkSession.builder
    .appName("Wine-Project")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

**Load Data into Spark DataFrame**

In [22]:
from pyspark import SparkFiles

# Register the remote CSV with Spark, then resolve the fetched copy by file name.
url = "https://green-team-wine.s3.us-east-2.amazonaws.com/US_wine_dataset.csv"
spark.sparkContext.addFile(url)
wine_csv_path = SparkFiles.get("US_wine_dataset.csv")

# The first row holds the column names; column types are inferred from the data.
wine_df = spark.read.csv(
    wine_csv_path,
    sep=",",
    header=True,
    inferSchema=True,
)
wine_df.show()

+------+----------+-------------------+-----------------+--------------------+------------------+--------------------+
|points|  province|           region_1|         region_2|               title|           variety|              winery|
+------+----------+-------------------+-----------------+--------------------+------------------+--------------------+
|    87|    Oregon|  Willamette Valley|Willamette Valley|Rainstorm 2013 Pi...|        Pinot Gris|           Rainstorm|
|    87|  Michigan|Lake Michigan Shore|             null|St. Julian 2013 R...|          Riesling|          St. Julian|
|    87|    Oregon|  Willamette Valley|Willamette Valley|Sweet Cheeks 2012...|        Pinot Noir|        Sweet Cheeks|
|    87|California|        Napa Valley|             Napa|Kirkland Signatur...|Cabernet Sauvignon|  Kirkland Signature|
|    87|California|   Alexander Valley|           Sonoma|Louis M. Martini ...|Cabernet Sauvignon|    Louis M. Martini|
|    87|California|      Central Coast|    Centr

In [23]:
# Number of rows loaded from the wine CSV.
wine_df.count()

54503

In [134]:
# Register the remote weather CSV with Spark, then resolve the fetched copy by file name.
url = "https://green-team-wine.s3.us-east-2.amazonaws.com/US_AL_cseat13t19.csv"
spark.sparkContext.addFile(url)
weather_csv_path = SparkFiles.get("US_AL_cseat13t19.csv")

# The first row holds the column names; column types are inferred from the data.
weather_df = spark.read.csv(
    weather_csv_path,
    sep=",",
    header=True,
    inferSchema=True,
)
weather_df.show()

+-----------+-------------------+--------+---------+---------+----------+----+----+----+----+----+----+----+----+
|    STATION|               NAME|LATITUDE|LONGITUDE|ELEVATION|      DATE|AWND|PRCP|SN32|SNOW|SX32|TAVG|TMAX|TMIN|
+-----------+-------------------+--------+---------+---------+----------+----+----+----+----+----+----+----+----+
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|    167.6|2013-01-01|null|1.73|null|null|null|null|  51|  33|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|    167.6|2013-01-02|null| 0.0|null|null|null|null|  51|  33|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|    167.6|2013-01-03|null| 0.0|null|null|null|null|  51|  31|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|    167.6|2013-01-04|null| 0.0|null|null|null|null|  45|  28|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|    167.6|2013-01-05|null| 0.0|null|null|null|null|  45|  28|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|    167.6|2013-01-06|null| 0.0|null|

In [135]:
# Number of rows in the raw weather CSV, before any columns or rows are dropped.
weather_df.count()

116849

**Create DataFrames to match tables**

In [144]:
# The wine data is written as loaded: no columns are renamed, dropped or
# filtered, so the table DataFrame is simply the loaded DataFrame.
wine_table_df = wine_df
wine_table_df.show(n=20, truncate=True)

+------+----------+-------------------+-----------------+--------------------+------------------+--------------------+
|points|  province|           region_1|         region_2|               title|           variety|              winery|
+------+----------+-------------------+-----------------+--------------------+------------------+--------------------+
|    87|    Oregon|  Willamette Valley|Willamette Valley|Rainstorm 2013 Pi...|        Pinot Gris|           Rainstorm|
|    87|  Michigan|Lake Michigan Shore|             null|St. Julian 2013 R...|          Riesling|          St. Julian|
|    87|    Oregon|  Willamette Valley|Willamette Valley|Sweet Cheeks 2012...|        Pinot Noir|        Sweet Cheeks|
|    87|California|        Napa Valley|             Napa|Kirkland Signatur...|Cabernet Sauvignon|  Kirkland Signature|
|    87|California|   Alexander Valley|           Sonoma|Louis M. Martini ...|Cabernet Sauvignon|    Louis M. Martini|
|    87|California|      Central Coast|    Centr

In [146]:
# Column names and inferred types of the DataFrame that is written to the wine table.
wine_table_df.printSchema()

root
 |-- points: integer (nullable = true)
 |-- province: string (nullable = true)
 |-- region_1: string (nullable = true)
 |-- region_2: string (nullable = true)
 |-- title: string (nullable = true)
 |-- variety: string (nullable = true)
 |-- winery: string (nullable = true)



In [136]:
# Column names and inferred types of the raw weather CSV (DATE is read as a string).
weather_df.printSchema()

root
 |-- STATION: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- LATITUDE: double (nullable = true)
 |-- LONGITUDE: double (nullable = true)
 |-- ELEVATION: double (nullable = true)
 |-- DATE: string (nullable = true)
 |-- AWND: double (nullable = true)
 |-- PRCP: double (nullable = true)
 |-- SN32: integer (nullable = true)
 |-- SNOW: double (nullable = true)
 |-- SX32: integer (nullable = true)
 |-- TAVG: integer (nullable = true)
 |-- TMAX: integer (nullable = true)
 |-- TMIN: integer (nullable = true)



In [137]:
from pyspark.sql.functions import col, to_date

# Columns kept for the weather table, in table order and with lower-case names.
# DATE is parsed from its string form into a date; the remaining measurement
# columns of the CSV (AWND, PRCP, ...) are not selected.
weather_table_columns = [
    col("STATION").alias("station"),
    col("NAME").alias("countyname"),
    col("LATITUDE").alias("latitude"),
    col("LONGITUDE").alias("longitude"),
    to_date("DATE", 'yyyy-MM-dd').alias("date"),
    col("TMAX").alias("tmax"),
    col("TMIN").alias("tmin"),
]
weather_table_df = weather_df.select(*weather_table_columns)
weather_table_df.show()

+-----------+-------------------+--------+---------+----------+----+----+
|    station|         countyname|latitude|longitude|      date|tmax|tmin|
+-----------+-------------------+--------+---------+----------+----+----+
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-01|  51|  33|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-02|  51|  33|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-03|  51|  31|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-04|  45|  28|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-05|  45|  28|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-06|  48|  28|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-07|  48|  28|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-08|  61|  34|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-09|  59|  33|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-10|  59|  50|
|USC00012209|DECATUR 4 SE, AL US|34.57

In [138]:
# Schema after the select/rename step; the date column is produced by to_date.
weather_table_df.printSchema()

root
 |-- station: string (nullable = true)
 |-- countyname: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- date: date (nullable = true)
 |-- tmax: integer (nullable = true)
 |-- tmin: integer (nullable = true)



In [139]:
# Row count of the selected weather columns, for comparison with the count
# taken after rows containing nulls are removed.
weather_table_df.count()

116849

In [143]:
# Keep only rows in which every selected column is populated
# (how="any": a single null value is enough to drop the row).
weather_table_df = weather_table_df.na.drop(how="any")
# Rows remaining after the null filter.
weather_table_df.count()

82094

**Connect to the AWS RDS instance and write each DataFrame to its table**

In [131]:
# Configure settings for RDS
import os
from getpass import getpass

# Save mode used by the JDBC writes below. "append" adds rows to the existing
# tables, so re-running a write cell inserts the same rows again.
mode = "append"
# URL format: jdbc:postgresql://<connection string>:5432/<database-name>
jdbc_url="jdbc:postgresql://wine-final-project.czqkltznl3rl.us-east-2.rds.amazonaws.com/winedb"
# Keep the database password out of the notebook: take it from the
# RDS_PASSWORD environment variable, or prompt for it (input is not echoed)
# when the variable is unset or empty.
rds_password = os.environ.get("RDS_PASSWORD") or getpass("RDS password for wineuser: ")
config = {"user":"wineuser",
          "password": rds_password,
          "driver":"org.postgresql.Driver"}

In [147]:
# Write wine_table_df to the wine table in RDS using the configured save mode.
# NOTE: with mode "append", every run of this cell inserts the rows again.
wine_writer = wine_table_df.write.mode(mode)
wine_writer.jdbc(url=jdbc_url, table="wine", properties=config)

In [132]:
# Write weather_table_df to the weather table in RDS using the configured save mode.
# NOTE: with mode "append", every run of this cell inserts the rows again.
weather_writer = weather_table_df.write.mode(mode)
weather_writer.jdbc(url=jdbc_url, table="weather", properties=config)

In [150]:
# Read the wine table back from RDS and preview at most ten rows.
wine_from_rds_df = spark.read.jdbc(url=jdbc_url, table="wine", properties=config)
wine_from_rds_df.limit(10).show()

+------+----------+--------------------+-----------------+--------------------+--------------------+---------------+
|points|  province|            region_1|         region_2|               title|             variety|         winery|
+------+----------+--------------------+-----------------+--------------------+--------------------+---------------+
|    96|California|Russian River Valley|           Sonoma|Rochioli 2014 Sou...|          Chardonnay|       Rochioli|
|    96|California|Russian River Valley|           Sonoma|Rochioli 2014 Swe...|          Chardonnay|       Rochioli|
|    95|California|Diamond Mountain ...|             Napa|J. Davies 2012 Ja...|  Cabernet Sauvignon|      J. Davies|
|    95|    Oregon|        Dundee Hills|Willamette Valley|Winderlea 2014 We...|          Pinot Noir|      Winderlea|
|    99|Washington|Columbia Valley (WA)|  Columbia Valley|Quilceda Creek 20...|  Cabernet Sauvignon| Quilceda Creek|
|    99|California|        Sonoma Coast|           Sonoma|Willia

In [151]:
# Read the weather table back from RDS and preview at most ten rows.
weather_from_rds_df = spark.read.jdbc(url=jdbc_url, table="weather", properties=config)
weather_from_rds_df.limit(10).show()

+-----------+-------------------+--------+---------+----------+----+----+
|    station|         countyname|latitude|longitude|      date|tmax|tmin|
+-----------+-------------------+--------+---------+----------+----+----+
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-01|51.0|33.0|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-02|51.0|33.0|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-03|51.0|31.0|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-04|45.0|28.0|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-05|45.0|28.0|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-01|51.0|33.0|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-02|51.0|33.0|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-03|51.0|31.0|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-04|45.0|28.0|
|USC00012209|DECATUR 4 SE, AL US|34.57556|-86.93389|2013-01-05|45.0|28.0|
+-----------+-------------------+-----