# Convert parquet file to postgres table

In [None]:
# get driver to postgres
#!spark-shell --packages org.postgresql:postgresql:42.2.18 

In [1]:
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pyspark.sql.types as T
import psycopg2

In [2]:
spark = SparkSession \
    .builder \
    .appName("spark_to_postgres") \
    .getOrCreate()

In [3]:
locations = spark.read.parquet("OUT_DATA/nyt_locations_geography/")

In [4]:
locations.printSchema()

root
 |-- fips: string (nullable = true)
 |-- county: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- location_id: long (nullable = true)
 |-- state: string (nullable = true)



In [5]:
locations.count()

3272

In [6]:
# create and write table nyt_locations_geography in postgres
locations.write\
    .format("jdbc")\
    .option("url", "jdbc:postgresql:capstone")\
    .option("dbtable", "nyt_locations_geography")\
    .option("user","postgres")\
    .option("password", "postgres")\
    .save()

AnalysisException: "Table or view 'nyt_locations_geography' already exists. SaveMode: ErrorIfExists.;"

In [None]:
nyt  = spark.read.parquet("OUT_DATA/covid_per_county/")

In [None]:
nyt.printSchema()

In [None]:
nyt.write\
    .format("jdbc")\
    .option("url", "jdbc:postgresql:capstone")\
    .option("dbtable", "covid_per_county")\
    .option("user","postgres")\
    .option("password", "postgres")\
    .save()

In [3]:
map_stations_locations = spark.read.parquet("OUT_DATA/map_locations_stations/")

In [4]:
map_stations_locations.printSchema()

root
 |-- location_id: long (nullable = true)
 |-- station_id: string (nullable = true)
 |-- distance: double (nullable = true)
 |-- measured: string (nullable = true)



In [6]:
map_stations_locations.write\
    .format("jdbc")\
    .option("url", "jdbc:postgresql:capstone")\
    .option("dbtable", "map_locations_stations")\
    .option("user","postgres")\
    .option("password", "postgres")\
    .save()

In [7]:
# unique measured element, station_id
weatherelem_station = map_stations_locations.select("measured", "station_id").distinct()

In [8]:
weatherelem_station.count()

8391

In [9]:
weatherelem_station.write\
    .format("jdbc")\
    .option("url", "jdbc:postgresql:capstone")\
    .option("dbtable", "weatherelem_station")\
    .option("user","postgres")\
    .option("password", "postgres")\
    .save()    

In [8]:
weather_data = spark.read.parquet("OUT_DATA/weather_data")

In [13]:
weather_data = weather_data.withColumn("value", col("value").cast(T.IntegerType()))

In [14]:
weather_data.printSchema()

root
 |-- measured: string (nullable = true)
 |-- station_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- value: integer (nullable = true)



In [15]:
weather_data.write\
    .format("jdbc")\
    .option("url", "jdbc:postgresql:capstone")\
    .option("dbtable", "weather_data")\
    .option("user","postgres")\
    .option("password", "postgres")\
    .save()