In [1]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date, timedelta

In [7]:
config = configparser.ConfigParser()
config.read("capstone.cfg")

['capstone.cfg']

In [3]:
# add driver postgress to enable to load from existing postgres DB
spark = SparkSession \
    .builder \
    .appName("filter_weather") \
    .config("spark.jars", "postgresql-42.2.18.jar")\
    .config( "spark.driver.extraClassPath", "postgresql-42.2.18.jar")\
    .getOrCreate()

In [8]:
project_path = config["PATH"]["PROJECT"]

In [9]:
nyt_path = os.path.join(project_path, "DATA", "us-counties.txt")

In [10]:
nyt = pd.read_csv(nyt_path,  sep = ",")

In [11]:
nyt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1124916 entries, 0 to 1124915
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   date    1124916 non-null  object 
 1   county  1124916 non-null  object 
 2   state   1124916 non-null  object 
 3   fips    1114603 non-null  float64
 4   cases   1124916 non-null  int64  
 5   deaths  1100383 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 51.5+ MB


In [12]:
nyt.sort_values("date", ascending = True, inplace = True)

In [13]:
l_elements = ["TMIN", "TMAX", "TAVG", "PRCP", "SNOW", "SNWD"]

In [14]:
l_dates = nyt["date"].unique()

In [15]:
#nyt[l_elemnts] = np.nan

In [16]:
nyt

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0
1,2020-01-22,Snohomish,Washington,53061.0,1,0.0
2,2020-01-23,Snohomish,Washington,53061.0,1,0.0
3,2020-01-24,Cook,Illinois,17031.0,1,0.0
4,2020-01-24,Snohomish,Washington,53061.0,1,0.0
...,...,...,...,...,...,...
1122755,2021-03-15,Metcalfe,Kentucky,21169.0,986,21.0
1122756,2021-03-15,Monroe,Kentucky,21171.0,1254,38.0
1122757,2021-03-15,Montgomery,Kentucky,21173.0,2355,27.0
1122747,2021-03-15,Martin,Kentucky,21159.0,962,14.0


In [28]:
weather_path = os.path.join(project_path, "OUT_DATA", "filtered_weather")
d_nyt_elem ={}
for elem in l_elements:
    print(elem)
    elem_path = os.path.join(weather_path, f"element={elem}")
    if not os.path.exists(weather_path):
        print(f"path {weather_path} does not exist")
    if not os.path.exists(elem_path):
        print(f"path {elem_path} does not exist")
    l_df = []
    prev_month = 0
    for date in l_dates :
        pd_date = pd.to_datetime(date)
        if pd_date.month != prev_month:
            print(date)
        prev_month = pd_date.month
        #print(pd_date.year, pd_date.month, pd_date.day)
        date_path = os.path.join(elem_path, f"date={pd_date.year}{pd_date.month:02d}{pd_date.day:02d}")
        if not os.path.exists(date_path):
            print(f"path {date_path} does not exist")
            continue
        #nyt.loc[ nyt["date"] == date, elem ]
        #sp_weather = spark.read.load(date_path, format = "parquet")
        #sp_weather.printSchema()
        weather = pd.read_parquet(date_path)
        l_df.append( pd.merge( nyt[nyt["date"] == date], weather, how = "left", 
                                    left_on = ["county", "state", "fips"],
                                    right_on = ["county", "state_gazeeter", "fips"])
                   )
    print("CONCATENATE")
    d_nyt_elem[elem] = pd.concat(l_df)    
        

TMIN
2020-01-21
2020-02-01
2020-03-01
2020-04-01
2020-05-01
2020-06-01
2020-07-01
2020-08-01
2020-09-01
2020-10-01
2020-11-01
2020-12-01
2021-01-01
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TMIN/date=20210101 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TMIN/date=20210102 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TMIN/date=20210103 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TMIN/date=20210104 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TMIN/date=20210105 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TMIN/date=20210106 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weathe

TMAX
2020-01-21
2020-02-01
2020-03-01
2020-04-01
2020-05-01
2020-06-01
2020-07-01
2020-08-01
2020-09-01
2020-10-01
2020-11-01
2020-12-01
2021-01-01
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TMAX/date=20210101 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TMAX/date=20210102 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TMAX/date=20210103 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TMAX/date=20210104 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TMAX/date=20210105 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TMAX/date=20210106 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weathe

TAVG
2020-01-21
2020-02-01
2020-03-01
2020-04-01
2020-05-01
2020-06-01
2020-07-01
2020-08-01
2020-09-01
2020-10-01
2020-11-01
2020-12-01
2021-01-01
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TAVG/date=20210101 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TAVG/date=20210102 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TAVG/date=20210103 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TAVG/date=20210104 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TAVG/date=20210105 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=TAVG/date=20210106 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weathe

PRCP
2020-01-21
2020-02-01
2020-03-01
2020-04-01
2020-05-01
2020-06-01
2020-07-01
2020-08-01
2020-09-01
2020-10-01
2020-11-01
2020-12-01
2021-01-01
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=PRCP/date=20210101 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=PRCP/date=20210102 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=PRCP/date=20210103 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=PRCP/date=20210104 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=PRCP/date=20210105 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=PRCP/date=20210106 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weathe

SNOW
2020-01-21
2020-02-01
2020-03-01
2020-04-01
2020-05-01
2020-06-01
2020-07-01
2020-08-01
2020-09-01
2020-10-01
2020-11-01
2020-12-01
2021-01-01
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=SNOW/date=20210101 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=SNOW/date=20210102 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=SNOW/date=20210103 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=SNOW/date=20210104 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=SNOW/date=20210105 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=SNOW/date=20210106 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weathe

SNWD
2020-01-21
2020-02-01
2020-03-01
2020-04-01
2020-05-01
2020-06-01
2020-07-01
2020-08-01
2020-09-01
2020-10-01
2020-11-01
2020-12-01
2021-01-01
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=SNWD/date=20210101 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=SNWD/date=20210102 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=SNWD/date=20210103 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=SNWD/date=20210104 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=SNWD/date=20210105 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weather/element=SNWD/date=20210106 does not exist
path /home/severine/MOOCS/UDACITY/BIG_DATA/CAPSTONE_PROJECT/OUT_DATA/filtered_weathe

In [29]:
d_nyt_elem["PRCP"]

Unnamed: 0,date,county,state,fips,cases,deaths,state_gazeeter,station_id,v1
0,2020-01-21,Snohomish,Washington,53061.0,1,0.0,Washington,US1WASN0091,64
0,2020-01-22,Snohomish,Washington,53061.0,1,0.0,Washington,US1WASN0091,64
0,2020-01-23,Snohomish,Washington,53061.0,1,0.0,Washington,US1WASN0091,147
0,2020-01-24,Cook,Illinois,17031.0,1,0.0,Illinois,US1ILCK0074,74
1,2020-01-24,Snohomish,Washington,53061.0,1,0.0,Washington,US1WASN0091,64
...,...,...,...,...,...,...,...,...,...
3240,2020-12-31,Montgomery,Kentucky,21173.0,1378,12.0,Kentucky,USC00155640,157
3241,2020-12-31,Meade,Kentucky,21163.0,1133,4.0,Kentucky,US1KYMD0002,353
3242,2020-12-31,McLean,Kentucky,21149.0,549,21.0,,,
3243,2020-12-31,Lewis,Kentucky,21135.0,775,23.0,Kentucky,US1KYLW0003,145
