In [1]:
import pandas as pd
import io

# Import RDS password
from config import rds_pwd

import os
# Find the latest version of spark 2.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
spark_version = 'spark-3.0.2'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:6 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:7 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:8 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Hit:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:10 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:12 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:13 http://ppa.launchpad.ne

In [2]:
# Download the Postgres driver that will allow Spark to interact with Postgres.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2021-05-13 00:55:28--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar.1’


2021-05-13 00:55:28 (10.3 MB/s) - ‘postgresql-42.2.16.jar.1’ saved [1002883/1002883]



In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the session with the Postgres JDBC jar on the driver classpath.
spark = (
    SparkSession.builder
    .appName("Wine-Project")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

**Load Data into Spark DataFrame**

In [4]:
from pyspark import SparkFiles

# Register the cleaned wine reviews CSV with Spark, then read the fetched copy
# using the header row for column names and an inferred schema.
url = "https://green-team-wine.s3.us-east-2.amazonaws.com/wine_reviews_cleaned.csv"
spark.sparkContext.addFile(url)
wine_df = spark.read.csv(
    SparkFiles.get("wine_reviews_cleaned.csv"),
    sep=",",
    header=True,
    inferSchema=True,
)
wine_df.show()

+---+-------+------+------------------+-------------------+--------------------+
|_c0|country|points|           variety|             winery|       winery_search|
+---+-------+------+------------------+-------------------+--------------------+
|  3|     US|    87|          Riesling|         St. Julian|St. Julian winery US|
|  4|     US|    87|        Pinot Noir|       Sweet Cheeks|Sweet Cheeks wine...|
| 10|     US|    87|Cabernet Sauvignon| Kirkland Signature|Kirkland Signatur...|
| 12|     US|    87|Cabernet Sauvignon|   Louis M. Martini|Louis M. Martini ...|
| 14|     US|    87|        Chardonnay|           Mirassou|  Mirassou winery US|
| 15|Germany|    87|          Riesling|    Richard Böcking|Richard Böcking w...|
| 21|     US|    87|        Pinot Noir|            Acrobat|   Acrobat winery US|
| 23|     US|    87|            Merlot|            Bianchi|   Bianchi winery US|
| 25|     US|    87|        Pinot Noir|Castello di Amorosa|Castello di Amoro...|
| 34|     US|    86|   Sauvi

In [5]:
# Number of rows in the raw wine reviews DataFrame
wine_df.count()

55461

In [6]:
# Register the winery weather CSV with Spark, then read the fetched copy
# using the header row for column names and an inferred schema.
url = "https://green-team-wine.s3.us-east-2.amazonaws.com/winery_weather.csv"
spark.sparkContext.addFile(url)
weather_df = spark.read.csv(
    SparkFiles.get("winery_weather.csv"),
    sep=",",
    header=True,
    inferSchema=True,
)
weather_df.show()

+---+--------------------+----------------------------+--------------------------+--------------------+--------------------------------+
|_c0|              Winery|Average Temperature (Kelvin)|Average Air Pressure (hPa)|Average Humidity (%)|Average Daily Precipitation (mm)|
+---+--------------------+----------------------------+--------------------------+--------------------+--------------------------------+
|  0|St. Julian winery US|          269.63750000000005|        1018.1500000000001|   79.24000000000001|                          0.0725|
|  1|Sweet Cheeks wine...|                     276.025|                 1019.8875|   85.45249999999999|            0.057499999999999996|
|  2|Kirkland Signatur...|                     277.375|                 1020.0125|   79.19749999999999|             0.23500000000000001|
|  3|Louis M. Martini ...|          281.03249999999997|                    1018.5|              79.155|                          0.0775|
|  4|Richard Böcking w...|               

In [7]:
# Number of rows in the raw winery weather DataFrame
weather_df.count()

7892

In [8]:
# Register the US county weather CSV with Spark, then read the fetched copy
# using the header row for column names and an inferred schema.
url = "https://green-team-wine.s3.us-east-2.amazonaws.com/uscounty_weather.csv"
spark.sparkContext.addFile(url)
uscounty_weather_df = spark.read.csv(
    SparkFiles.get("uscounty_weather.csv"),
    sep=",",
    header=True,
    inferSchema=True,
)
uscounty_weather_df.show()

+---+----+--------+---------+------------+--------+-------------+------------+----------------------------+--------------------------+--------------------+--------------------------------+
|_c0| zip|     lat|      lng|        city|state_id|   state_name| county_name|Average Temperature (Kelvin)|Average Air Pressure (hPa)|Average Humidity (%)|Average Daily Precipitation (mm)|
+---+----+--------+---------+------------+--------+-------------+------------+----------------------------+--------------------------+--------------------+--------------------------------+
|  0|1001|42.06259|-72.62589|      Agawam|      MA|Massachusetts|     Hampden|                     270.805|                 1016.1225|              72.215|                           0.125|
|  1|1002|42.37492| -72.4621|     Amherst|      MA|Massachusetts|   Hampshire|                     270.805|                 1016.1225|              72.215|                           0.125|
|  2|1005|42.42017|-72.10615|       Barre|      MA|Mass

In [9]:
# Number of rows in the raw US county weather DataFrame
uscounty_weather_df.count()

1797

In [6]:
from pyspark import SparkFiles

# Register the map-scaled weather CSV with Spark, then read the fetched copy
# using the header row for column names and an inferred schema.
url = "https://green-team-wine.s3.us-east-2.amazonaws.com/weather_mapscaled.csv"
spark.sparkContext.addFile(url)
weather_mapscaled_df = spark.read.csv(
    SparkFiles.get("weather_mapscaled.csv"),
    sep=",",
    header=True,
    inferSchema=True,
)
weather_mapscaled_df.show()

+--------------------+----------+------------+----------------------------+--------------------------+--------------------+--------------------------------+-------+------+------------------+------------------+
|  winery_search_term|  latitude|   longitude|Average Temperature (Kelvin)|Average Air Pressure (hPa)|Average Humidity (%)|Average Daily Precipitation (mm)|country|points|           variety|            winery|
+--------------------+----------+------------+----------------------------+--------------------------+--------------------+--------------------------------+-------+------+------------------+------------------+
|St. Julian winery US|42.2122513| -85.8917127|          269.63750000000005|                   1018.15|   79.24000000000002|                          0.0725|     US|    87|          Riesling|        St. Julian|
|St. Julian winery US|42.2122513| -85.8917127|          269.63750000000005|                   1018.15|   79.24000000000002|                          0.0725|    

In [7]:
# Number of rows in the map-scaled weather DataFrame
weather_mapscaled_df.count()

52906

**Create DataFrames to match tables**

In [10]:
# Keep only the columns stored in the RDS wine table, renaming winery_search
# to winery_country and dropping rows that have no country.

from pyspark.sql.functions import col

wine_table_df = wine_df.select(
    col("country"),
    col("points"),
    col("variety"),
    col("winery"),
    col("winery_search").alias("winery_country"),
).where(col("country").isNotNull())
wine_table_df.show()

+-------+------+------------------+-------------------+--------------------+
|country|points|           variety|             winery|      winery_country|
+-------+------+------------------+-------------------+--------------------+
|     US|    87|          Riesling|         St. Julian|St. Julian winery US|
|     US|    87|        Pinot Noir|       Sweet Cheeks|Sweet Cheeks wine...|
|     US|    87|Cabernet Sauvignon| Kirkland Signature|Kirkland Signatur...|
|     US|    87|Cabernet Sauvignon|   Louis M. Martini|Louis M. Martini ...|
|     US|    87|        Chardonnay|           Mirassou|  Mirassou winery US|
|Germany|    87|          Riesling|    Richard Böcking|Richard Böcking w...|
|     US|    87|        Pinot Noir|            Acrobat|   Acrobat winery US|
|     US|    87|            Merlot|            Bianchi|   Bianchi winery US|
|     US|    87|        Pinot Noir|Castello di Amorosa|Castello di Amoro...|
|     US|    86|   Sauvignon Blanc|            Envolve|   Envolve winery US|

In [11]:
# Column names and types of the DataFrame that will be written to the wine table
wine_table_df.printSchema()

root
 |-- country: string (nullable = true)
 |-- points: integer (nullable = true)
 |-- variety: string (nullable = true)
 |-- winery: string (nullable = true)
 |-- winery_country: string (nullable = true)



In [12]:
# Rows remaining after the null-country filter applied when building wine_table_df
wine_table_df.count()

55446

In [13]:
# Column names and inferred types of the raw winery weather DataFrame
weather_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Winery: string (nullable = true)
 |-- Average Temperature (Kelvin): double (nullable = true)
 |-- Average Air Pressure (hPa): double (nullable = true)
 |-- Average Humidity (%): double (nullable = true)
 |-- Average Daily Precipitation (mm): double (nullable = true)



In [14]:
# Keep only the columns stored in the RDS weather table, giving each one
# its database column name.

from pyspark.sql.functions import col

weather_table_df = weather_df.select(
    col("Winery").alias("winery_search"),
    col("Average Temperature (Kelvin)").alias("avg_temp"),
    col("Average Air Pressure (hPa)").alias("avg_airp"),
    col("Average Humidity (%)").alias("avg_humid"),
    col("Average Daily Precipitation (mm)").alias("avg_precip"),
)
weather_table_df.show()

+--------------------+------------------+------------------+-----------------+--------------------+
|       winery_search|          avg_temp|          avg_airp|        avg_humid|          avg_precip|
+--------------------+------------------+------------------+-----------------+--------------------+
|St. Julian winery US|269.63750000000005|1018.1500000000001|79.24000000000001|              0.0725|
|Sweet Cheeks wine...|           276.025|         1019.8875|85.45249999999999|0.057499999999999996|
|Kirkland Signatur...|           277.375|         1020.0125|79.19749999999999| 0.23500000000000001|
|Louis M. Martini ...|281.03249999999997|            1018.5|           79.155|              0.0775|
|Richard Böcking w...|            275.71|         1014.9425|           92.215|               0.135|
|   Bianchi winery US|           279.685|           1018.23|            73.42|               0.025|
|Castello di Amoro...|281.03249999999997|            1018.5|           79.155|              0.0775|


In [15]:
# Number of rows in the DataFrame that will be written to the weather table
weather_table_df.count()

7892

In [16]:
# Keep only the columns stored in the RDS uscounty table; the weather
# measurements are renamed to their short database column names.

from pyspark.sql.functions import col

uscounty_weather_table_df = uscounty_weather_df.select(
    col("zip"),
    col("lat"),
    col("lng"),
    col("city"),
    col("state_id"),
    col("state_name"),
    col("county_name"),
    col("Average Temperature (Kelvin)").alias("avg_temp"),
    col("Average Air Pressure (hPa)").alias("avg_airp"),
    col("Average Humidity (%)").alias("avg_humid"),
    col("Average Daily Precipitation (mm)").alias("avg_precip"),
)
uscounty_weather_table_df.show()

+----+--------+---------+------------+--------+-------------+------------+--------+---------+---------+----------+
| zip|     lat|      lng|        city|state_id|   state_name| county_name|avg_temp| avg_airp|avg_humid|avg_precip|
+----+--------+---------+------------+--------+-------------+------------+--------+---------+---------+----------+
|1001|42.06259|-72.62589|      Agawam|      MA|Massachusetts|     Hampden| 270.805|1016.1225|   72.215|     0.125|
|1002|42.37492| -72.4621|     Amherst|      MA|Massachusetts|   Hampshire| 270.805|1016.1225|   72.215|     0.125|
|1005|42.42017|-72.10615|       Barre|      MA|Massachusetts|   Worcester| 270.725|  1014.96|  71.7525|     0.135|
|1029|42.19632|-73.04836|   East Otis|      MA|Massachusetts|   Berkshire| 270.805|1016.1225|   72.215|     0.125|
|1054|42.47363|-72.48746|    Leverett|      MA|Massachusetts|    Franklin| 270.805|1016.1225|   72.215|     0.125|
|1431|42.67652|-71.82322|       Ashby|      MA|Massachusetts|   Middlesex| 270.6

In [17]:
# Column names and types of the DataFrame that will be written to the uscounty table
uscounty_weather_table_df.printSchema()

root
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- lng: double (nullable = true)
 |-- city: string (nullable = true)
 |-- state_id: string (nullable = true)
 |-- state_name: string (nullable = true)
 |-- county_name: string (nullable = true)
 |-- avg_temp: double (nullable = true)
 |-- avg_airp: double (nullable = true)
 |-- avg_humid: double (nullable = true)
 |-- avg_precip: double (nullable = true)



In [22]:
# Number of rows in the DataFrame that will be written to the uscounty table
uscounty_weather_table_df.count()

1797

In [8]:
# Rename the columns to the names stored in the RDS weather_mapscaled table.
#
# This cell reassigns weather_mapscaled_df, so it must be safe to run more than
# once. Selecting the source columns by name fails on a second run because they
# have already been renamed; withColumnRenamed is a no-op when the source column
# is absent, which makes the cell idempotent.

from pyspark.sql.functions import col

# Source CSV column -> database column. Columns not listed keep their name.
mapscaled_renames = {
    "latitude": "lat",
    "longitude": "lng",
    "Average Temperature (Kelvin)": "avg_temp",
    "Average Air Pressure (hPa)": "avg_airp",
    "Average Humidity (%)": "avg_humid",
    "Average Daily Precipitation (mm)": "avg_precip",
}

# Final column order of the table, identical to the order of the original select.
mapscaled_table_columns = [
    "winery_search_term",
    "lat",
    "lng",
    "avg_temp",
    "avg_airp",
    "avg_humid",
    "avg_precip",
    "country",
    "points",
    "variety",
    "winery",
]

for source_name, table_name in mapscaled_renames.items():
    weather_mapscaled_df = weather_mapscaled_df.withColumnRenamed(source_name, table_name)

weather_mapscaled_df = weather_mapscaled_df.select(
    *[col(name) for name in mapscaled_table_columns]
)
weather_mapscaled_df.show()

+--------------------+----------+------------+------------------+---------+-----------------+----------+-------+------+------------------+------------------+
|  winery_search_term|       lat|         lng|          avg_temp| avg_airp|        avg_humid|avg_precip|country|points|           variety|            winery|
+--------------------+----------+------------+------------------+---------+-----------------+----------+-------+------+------------------+------------------+
|St. Julian winery US|42.2122513| -85.8917127|269.63750000000005|  1018.15|79.24000000000002|    0.0725|     US|    87|          Riesling|        St. Julian|
|St. Julian winery US|42.2122513| -85.8917127|269.63750000000005|  1018.15|79.24000000000002|    0.0725|     US|    86|          Riesling|        St. Julian|
|St. Julian winery US|42.2122513| -85.8917127|269.63750000000005|  1018.15|79.24000000000002|    0.0725|     US|    86|          Riesling|        St. Julian|
|Sweet Cheeks wine...|43.9566698|-123.2791353|      

In [9]:
# Row count after the column rename (renaming/selecting columns does not drop rows)
weather_mapscaled_df.count()

52906

**Connect to the AWS RDS instance and write each DataFrame to its table**

In [11]:
# Connection settings for the AWS RDS Postgres instance.
# URL template: jdbc:postgresql://<connection string>:5432/<database-name>
jdbc_url = "jdbc:postgresql://wine-final-project.czqkltznl3rl.us-east-2.rds.amazonaws.com/winedb"

# Save mode used by every write below. "append" adds rows to an existing table,
# so re-running a write cell inserts the same rows again.
mode = "append"

# JDBC connection properties; the password comes from the imported rds_pwd
# rather than being written in the notebook.
config = dict(
    user="wineuser",
    password=rds_pwd,
    driver="org.postgresql.Driver",
)

In [20]:
# Bare expression: echoes the DataFrame's repr (column names and types) before it is written
wine_table_df

DataFrame[country: string, points: int, variety: string, winery: string, winery_country: string]

In [21]:
# Save wine_table_df into the RDS table "wine" using the configured save mode
wine_table_df.write.jdbc(
    url=jdbc_url,
    table='wine',
    mode=mode,
    properties=config,
)

In [23]:
# Save weather_table_df into the RDS table "weather" using the configured save mode
weather_table_df.write.jdbc(
    url=jdbc_url,
    table='weather',
    mode=mode,
    properties=config,
)

In [24]:
# Save uscounty_weather_table_df into the RDS table "uscounty" using the configured save mode
uscounty_weather_table_df.write.jdbc(
    url=jdbc_url,
    table='uscounty',
    mode=mode,
    properties=config,
)

In [12]:
# Save weather_mapscaled_df into the RDS table "weather_mapscaled" using the configured save mode
weather_mapscaled_df.write.jdbc(
    url=jdbc_url,
    table='weather_mapscaled',
    mode=mode,
    properties=config,
)

In [8]:
# Preview 10 rows of the wine table as stored in RDS
(
    spark.read
    .jdbc(url=jdbc_url, table='wine', properties=config)
    .limit(10)
    .show()
)

+-------+------+------------------+-------------------+--------------------+
|country|points|           variety|             winery|      winery_country|
+-------+------+------------------+-------------------+--------------------+
|     US|    87|          Riesling|         St. Julian|St. Julian winery US|
|     US|    87|        Pinot Noir|       Sweet Cheeks|Sweet Cheeks wine...|
|     US|    87|Cabernet Sauvignon| Kirkland Signature|Kirkland Signatur...|
|     US|    87|Cabernet Sauvignon|   Louis M. Martini|Louis M. Martini ...|
|     US|    87|        Chardonnay|           Mirassou|  Mirassou winery US|
|Germany|    87|          Riesling|    Richard Böcking|Richard Böcking w...|
|     US|    87|        Pinot Noir|            Acrobat|   Acrobat winery US|
|     US|    87|            Merlot|            Bianchi|   Bianchi winery US|
|     US|    87|        Pinot Noir|Castello di Amorosa|Castello di Amoro...|
|     US|    86|   Sauvignon Blanc|            Envolve|   Envolve winery US|

In [26]:
# Preview 10 rows of the weather table as stored in RDS
(
    spark.read
    .jdbc(url=jdbc_url, table='weather', properties=config)
    .limit(10)
    .show()
)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|       winery_search|            avg_temp|            avg_airp|           avg_humid|          avg_precip|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|St. Julian winery US|269.6375000000000...|1018.150000000000...|79.24000000000000...|0.072500000000000000|
|Sweet Cheeks wine...|276.0250000000000...|1019.887500000000...|85.45250000000000...|0.057500000000000000|
|Kirkland Signatur...|277.3750000000000...|1020.012500000000...|79.19750000000000...|0.235000000000000000|
|Louis M. Martini ...|281.0325000000000...|1018.500000000000...|79.15500000000000...|0.077500000000000000|
|Richard Böcking w...|275.7100000000000...|1014.942500000000...|92.21500000000000...|0.135000000000000000|
|   Bianchi winery US|279.6850000000000...|1018.230000000000...|73.42000000000000...|0.025000000000000000|
|Castello di Amoro...|281.03250000000

In [27]:
# Preview 10 rows of the uscounty table as stored in RDS
# NOTE(review): the saved output shows lat/lng as whole numbers, which suggests
# the RDS columns are integer-typed and the decimals were lost - confirm the table DDL.
(
    spark.read
    .jdbc(url=jdbc_url, table='uscounty', properties=config)
    .limit(10)
    .show()
)

+----+---+---+----------+--------+-------------+-----------+--------------------+--------------------+--------------------+--------------------+
| zip|lat|lng|      city|state_id|   state_name|county_name|            avg_temp|            avg_airp|           avg_humid|          avg_precip|
+----+---+---+----------+--------+-------------+-----------+--------------------+--------------------+--------------------+--------------------+
|1001| 42|-73|    Agawam|      MA|Massachusetts|    Hampden|270.8050000000000...|1016.122500000000...|72.21500000000000...|0.125000000000000000|
|1002| 42|-72|   Amherst|      MA|Massachusetts|  Hampshire|270.8050000000000...|1016.122500000000...|72.21500000000000...|0.125000000000000000|
|1005| 42|-72|     Barre|      MA|Massachusetts|  Worcester|270.7250000000000...|1014.960000000000...|71.75250000000000...|0.135000000000000000|
|1029| 42|-73| East Otis|      MA|Massachusetts|  Berkshire|270.8050000000000...|1016.122500000000...|72.21500000000000...|0.12500

In [13]:
# Preview 10 rows of the weather_mapscaled table as stored in RDS
# NOTE(review): the saved output shows lat/lng as whole numbers, which suggests
# the RDS columns are integer-typed and the decimals were lost - confirm the table DDL.
(
    spark.read
    .jdbc(url=jdbc_url, table='weather_mapscaled', properties=config)
    .limit(10)
    .show()
)

+--------------------+---+----+--------------------+--------------------+--------------------+--------------------+-------+------+---------------+-------------+
|  winery_search_term|lat| lng|            avg_temp|            avg_airp|           avg_humid|          avg_precip|country|points|        variety|       winery|
+--------------------+---+----+--------------------+--------------------+--------------------+--------------------+-------+------+---------------+-------------+
|   LaZarre winery US| 36|-121|279.6850000000000...|1018.230000000000...|73.42000000000000...|0.025000000000000000|     US|    88|     Chardonnay|      LaZarre|
|Negretti winery I...| 45|   8|275.6025000000000...|1016.337500000000...|79.04000000000000...|0.070000000000000000|  Italy|    88|     Chardonnay|     Negretti|
|Pico Maccario win...| 45|   8|280.6950000000000...|1017.880000000000...|70.50250000000000...|0.147500000000000000|  Italy|    85|     Chardonnay|Pico Maccario|
|Farnese winery Italy| 43|  12|278

In [28]:
# Run a subquery on the database side: Spark's JDBC reader accepts a
# parenthesised, aliased SELECT in place of a table name.
select_sql = "(SELECT winery FROM wine) AS wineryalias"
(
    spark.read
    .jdbc(url=jdbc_url, table=select_sql, properties=config)
    .show()
)

+-------------------+
|             winery|
+-------------------+
|         St. Julian|
|       Sweet Cheeks|
| Kirkland Signature|
|   Louis M. Martini|
|           Mirassou|
|    Richard Böcking|
|            Acrobat|
|            Bianchi|
|Castello di Amorosa|
|            Envolve|
|              Erath|
|Feudi del Pisciotto|
|    Hawkins Cellars|
|        Robert Hall|
|           Sundance|
|   The White Knight|
|              Trump|
|          RustRidge|
|          Souverain|
|      Tres Palacios|
+-------------------+
only showing top 20 rows



In [29]:
# Count the distinct winery names in the wine table, computed by Postgres
select_sql = "(SELECT count(DISTINCT winery) FROM wine) AS wineryalias"
(
    spark.read
    .jdbc(url=jdbc_url, table=select_sql, properties=config)
    .show()
)

+-----+
|count|
+-----+
| 8614|
+-----+



In [28]:
# Join wine to weather with a comma join plus WHERE condition,
# matching wine.winery_country against weather.winery_search.
select_sql = (
    "(SELECT winery, country, points, variety, winery_search, "
    "avg_temp, avg_airp, avg_humid, avg_precip "
    "FROM wine AS wi, weather AS we "
    "WHERE wi.winery_country = we.winery_search) AS wineryalias"
)
(
    spark.read
    .jdbc(url=jdbc_url, table=select_sql, properties=config)
    .show()
)

+-------------------+-------+------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             winery|country|points|           variety|       winery_search|            avg_temp|            avg_airp|           avg_humid|          avg_precip|
+-------------------+-------+------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         St. Julian|     US|    87|          Riesling|St. Julian winery US|269.6375000000000...|1018.150000000000...|79.24000000000000...|0.072500000000000000|
|       Sweet Cheeks|     US|    87|        Pinot Noir|Sweet Cheeks wine...|276.0250000000000...|1019.887500000000...|85.45250000000000...|0.057500000000000000|
| Kirkland Signature|     US|    87|Cabernet Sauvignon|Kirkland Signatur...|277.3750000000000...|1020.012500000000...|79.19750000000000...|0.235000000000000000|
|   Louis M. Martini|     US|    8

In [None]:
#select_sql = "(SELECT winery, country, points, variety, winery_search, avg_temp, avg_airp, avg_humid, avg_precip FROM wine LEFT JOIN weather ON (wine.winery_country = weather.winery_search)) AS wineryalias"
#spark.read.jdbc(url=jdbc_url, table=select_sql, properties=config).show()

In [33]:
# Join tables wine and weather with an explicit INNER JOIN on the winery search key
select_sql = (
    "(SELECT winery, country, points, variety, winery_search, "
    "avg_temp, avg_airp, avg_humid, avg_precip "
    "FROM wine INNER JOIN weather "
    "ON (wine.winery_country = weather.winery_search)) AS wineryalias"
)
(
    spark.read
    .jdbc(url=jdbc_url, table=select_sql, properties=config)
    .show()
)

+-------------------+-------+------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             winery|country|points|           variety|       winery_search|            avg_temp|            avg_airp|           avg_humid|          avg_precip|
+-------------------+-------+------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         St. Julian|     US|    87|          Riesling|St. Julian winery US|269.6375000000000...|1018.150000000000...|79.24000000000000...|0.072500000000000000|
|       Sweet Cheeks|     US|    87|        Pinot Noir|Sweet Cheeks wine...|276.0250000000000...|1019.887500000000...|85.45250000000000...|0.057500000000000000|
| Kirkland Signature|     US|    87|Cabernet Sauvignon|Kirkland Signatur...|277.3750000000000...|1020.012500000000...|79.19750000000000...|0.235000000000000000|
|   Louis M. Martini|     US|    8

In [34]:
# Same wine/weather INNER JOIN, kept as a DataFrame instead of being displayed
select_sql = (
    "(SELECT winery, country, points, variety, winery_search, "
    "avg_temp, avg_airp, avg_humid, avg_precip "
    "FROM wine INNER JOIN weather "
    "ON (wine.winery_country = weather.winery_search)) AS wineryalias"
)
machine_learning_table_df = spark.read.jdbc(
    url=jdbc_url,
    table=select_sql,
    properties=config,
)

In [35]:
# Save machine_learning_table_df into the RDS table "machine" using the configured save mode
machine_learning_table_df.write.jdbc(
    url=jdbc_url,
    table='machine',
    mode=mode,
    properties=config,
)

In [36]:
# Preview 10 rows of the machine table as stored in RDS
(
    spark.read
    .jdbc(url=jdbc_url, table='machine', properties=config)
    .limit(10)
    .show()
)

+-------------------+-------+------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|             winery|country|points|           variety|       winery_search|            avg_temp|            avg_airp|           avg_humid|          avg_precip|
+-------------------+-------+------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         St. Julian|     US|    87|          Riesling|St. Julian winery US|269.6375000000000...|1018.150000000000...|79.24000000000000...|0.072500000000000000|
|       Sweet Cheeks|     US|    87|        Pinot Noir|Sweet Cheeks wine...|276.0250000000000...|1019.887500000000...|85.45250000000000...|0.057500000000000000|
| Kirkland Signature|     US|    87|Cabernet Sauvignon|Kirkland Signatur...|277.3750000000000...|1020.012500000000...|79.19750000000000...|0.235000000000000000|
|   Louis M. Martini|     US|    8