In [1]:
import configparser
import os
import pyspark
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date, timedelta, datetime
import statsmodels
from statsmodels.formula.api import ols

In [2]:
# Load the project settings. ConfigParser.read() silently skips files it cannot
# open and returns the list of files it did parse, so check that list and fail
# early instead of hitting an unexplained KeyError on the lookups below.
CONFIG_FILE = "capstone.cfg"

config = configparser.ConfigParser()
if not config.read(CONFIG_FILE):
    raise FileNotFoundError(f"Configuration file not found or unreadable: {CONFIG_FILE}")

input_data_dir = config["PATH"]["HDFS_DOWNLOAD_DIR"]
s3_path = config["PATH"]["S3_OUTPUT_PATH"]

In [3]:
# Get (or create) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("analysis")
    .getOrCreate()
)

#    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0")\
#    .config("fs.s3a.access.key", config['AWS']['ACCESS_KEY_ID'])\
#    .config("fs.s3a.secret.key", config['AWS']['SECRET_ACCESS_KEY'])\


In [4]:
# FOR DEBUG ONLY: read from a local copy of the output data instead of the
# location given in the configuration file.
# Set the CAPSTONE_LOCAL_DATA_DIR environment variable to use another directory
# without editing this cell, e.g.
#   /home/user/CODE/BIG_DATA/CAPSTONE_PROJECT/test_covid_analysis/OUT_DATA
s3_path = os.environ.get(
    "CAPSTONE_LOCAL_DATA_DIR",
    "/home/user/CODE/BIG_DATA/CAPSTONE_PROJECT/FROM_S3",
)

In [5]:
# Covid records per county, read from the "covid_per_county" parquet data set.
covid_per_county = spark.read.parquet(
    os.path.join(s3_path, "covid_per_county")
)

In [6]:
# Weather measurements, read from the "weather_records" parquet data set.
weather = spark.read.parquet(
    os.path.join(s3_path, "weather_records")
)

In [7]:
# Location reference table, searched by county name in find_county_by_name.
nyt_locations_geography = spark.read.parquet(
    os.path.join(s3_path, "nyt_locations_geography")
)

In [8]:
# Mapping between locations and weather stations, joined on location_id in
# join_covid_weather.
map_locations_stations = spark.read.parquet(
    os.path.join(s3_path, "map_locations_stations")
)

In [9]:
# Replace the "value" column by its integer cast.
weather = weather.withColumn(
    "value",
    weather["value"].cast(T.IntegerType()),
)

In [10]:
# Print the schema of the weather frame (after the cast of "value" above).
weather.printSchema()

root
 |-- station_id: string (nullable = true)
 |-- date: date (nullable = true)
 |-- value: integer (nullable = true)
 |-- measured: string (nullable = true)



In [7]:
# Read the "covid_per_popgroup" parquet data set.
covid_per_popgroup = spark.read.parquet(
    os.path.join(s3_path, "covid_per_popgroup")
)

## Does climate impact COVID-19 propagation?

In [11]:
def find_county_by_name(county_pattern, state=None):
    '''
    Find the location(s) containing county_pattern in their county name.

    Parameters
    ----------
    county_pattern : str
        Substring looked up in the ``county`` column of
        ``nyt_locations_geography``.
    state : str, optional
        When given, only locations whose ``state`` column equals this value are
        kept. Useful when the same county name exists in several states.

    Returns
    -------
    pyspark.sql.DataFrame
        Cached data frame of the matching rows; it may hold zero, one or
        several rows. The result is cached here and never released by this
        function, so the caller should ``unpersist()`` it when done.
    '''
    matches = nyt_locations_geography.filter(
        nyt_locations_geography.county.contains(county_pattern))
    if state is not None:
        matches = matches.filter(matches.state == state)
    return matches.cache()

In [17]:
def join_covid_weather(county_location):
    '''
    Select covid and weather data for the given location, and return a pivoted
    pandas data frame (one row per date, sorted by date).

    Parameters
    ----------
    county_location : pyspark.sql.DataFrame
        Data frame with a ``location_id`` column that must contain exactly one
        row.

    Returns
    -------
    pandas.DataFrame
        The columns of ``covid_per_county`` for the location, plus one column
        per distinct ``measured`` value found in its weather records. Only
        dates present on both sides are kept (inner join on ``date``).

    Raises
    ------
    ValueError
        If ``county_location`` does not hold exactly one row, if no weather
        record is found for the location, or if a date has more than one value
        for the same measurement.
    '''

    # the location must be unambiguous
    nbcounty = county_location.count()
    if nbcounty != 1 :
        county_location.show()
        raise ValueError(f"nb of counties should be 1 ({nbcounty})")

    # every frame cached below is registered here, so that the finally clause
    # releases it whatever happens (including a failure in toPandas)
    cached = []
    try:
        # filter covid data to keep only the desired county
        filtered_covid = covid_per_county.alias("c").join(county_location, "location_id")\
            .select("c.*").orderBy("date").cache()
        cached.append(filtered_covid)
        filtered_covid.show()

        # find stations for the county
        county_station = map_locations_stations.join(county_location, "location_id").cache()
        cached.append(county_station)
        county_station.show()

        # filter weather to keep only measurements for the desired location
        filtered_weather = weather.join(county_station.alias("s"),
            (weather.measured == county_station.measured) & (weather.station_id == county_station.station_id))\
            .select("date", "value", "s.measured").cache()
        cached.append(filtered_weather)

        # with no weather record there is nothing to pivot: report it clearly
        # rather than failing later inside the pivot / check logic
        if not filtered_weather.head(1):
            raise ValueError("No weather record found for the location")

        # check that there is at most 1 value per date / measurement
        duplicates = filtered_weather.groupBy("date", "measured").count()\
            .filter(F.col("count") > 1)
        if duplicates.head(1):
            duplicates.orderBy("date", "measured").show()
            raise ValueError("There are more than 1 value per date / measurement")

        # pivot weather : one column per measurement
        pivot_weather = filtered_weather.groupBy("date").pivot("measured").sum("value").orderBy("date")
        pivot_weather.show()

        # join weather with covid data; sort after the join because a join does
        # not guarantee the order of its output rows
        covid_full = filtered_covid.join(pivot_weather, "date").orderBy("date").toPandas()
        return covid_full
    finally:
        for frame in cached:
            frame.unpersist()
        

In [13]:
# Regression formulas: daily cases explained by the weather measurements.
# The "no snow" variant drops the SNOW term.
formula_no_snow = "daily_cases ~ " + " + ".join(["PRCP", "TMIN", "AWND"])
formula = formula_no_snow + " + SNOW"

### New York City

In [155]:
# Joined covid / weather frame for the location whose county name contains "New York".
NY_full = join_covid_weather(
    county_location=find_county_by_name(county_pattern="New York")
)

In [156]:
# Display the joined covid / weather frame built in the previous cell.
NY_full

Unnamed: 0,date,location_id,daily_cases,daily_deaths,AWND,PRCP,SNOW,TMIN
0,2020-03-01,8650064134144,1,0,60,0,0,-38
1,2020-03-02,8650064134144,0,0,46,0,0,44
2,2020-03-03,8650064134144,1,0,43,66,0,94
3,2020-03-04,8650064134144,0,0,82,0,0,72
4,2020-03-05,8650064134144,2,0,58,0,0,39
...,...,...,...,...,...,...,...,...
324,2021-01-19,8650064134144,5559,57,66,0,0,17
325,2021-01-20,8650064134144,4349,88,59,5,0,-27
326,2021-01-21,8650064134144,5220,82,36,0,0,-38
327,2021-01-22,8650064134144,6847,67,69,0,0,22


In [158]:
# Ordinary least squares fit of `formula` on the New York frame.
ny_model = ols(formula=formula, data=NY_full)
res_ny = ny_model.fit()
res_ny.summary()

0,1,2,3
Dep. Variable:,daily_cases,R-squared:,0.463
Model:,OLS,Adj. R-squared:,0.457
Method:,Least Squares,F-statistic:,69.96
Date:,"Tue, 29 Jun 2021",Prob (F-statistic):,1.1999999999999999e-42
Time:,10:51:15,Log-Likelihood:,-2848.6
No. Observations:,329,AIC:,5707.0
Df Residuals:,324,BIC:,5726.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3474.6183,313.578,11.081,0.000,2857.712,4091.524
PRCP,1.0232,0.983,1.040,0.299,-0.911,2.958
TMIN,-16.0330,1.047,-15.311,0.000,-18.093,-13.973
AWND,0.2892,5.218,0.055,0.956,-9.975,10.554
SNOW,-9.7050,7.488,-1.296,0.196,-24.436,5.026

0,1,2,3
Omnibus:,21.047,Durbin-Watson:,0.299
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28.32
Skew:,0.49,Prob(JB):,7.09e-07
Kurtosis:,4.051,Cond. No.,598.0


In [159]:
# Same fit with TMIN as the only explanatory variable.
ny_model = ols(formula="daily_cases ~ TMIN", data=NY_full)
res_ny = ny_model.fit()
res_ny.summary()

0,1,2,3
Dep. Variable:,daily_cases,R-squared:,0.459
Model:,OLS,Adj. R-squared:,0.458
Method:,Least Squares,F-statistic:,277.9
Date:,"Tue, 29 Jun 2021",Prob (F-statistic):,1.38e-45
Time:,10:51:23,Log-Likelihood:,-2849.9
No. Observations:,329,AIC:,5704.0
Df Residuals:,327,BIC:,5711.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3490.9717,131.659,26.515,0.000,3231.967,3749.977
TMIN,-15.8257,0.949,-16.669,0.000,-17.693,-13.958

0,1,2,3
Omnibus:,22.164,Durbin-Watson:,0.296
Prob(Omnibus):,0.0,Jarque-Bera (JB):,29.282
Skew:,0.519,Prob(JB):,4.38e-07
Kurtosis:,4.029,Cond. No.,236.0


### San Diego

In [160]:
# Joined covid / weather frame for the location whose county name contains "San Diego".
san_diego_full = join_covid_weather(
    county_location=find_county_by_name(county_pattern="San Diego")
)

In [161]:
# Ordinary least squares fit of `formula` on the San Diego frame.
san_diego_model = ols(formula=formula, data=san_diego_full)
res_san_diego = san_diego_model.fit()
res_san_diego.summary()

0,1,2,3
Dep. Variable:,daily_cases,R-squared:,0.051
Model:,OLS,Adj. R-squared:,0.04
Method:,Least Squares,F-statistic:,4.603
Date:,"Tue, 29 Jun 2021",Prob (F-statistic):,0.00124
Time:,10:52:17,Log-Likelihood:,-2914.3
No. Observations:,348,AIC:,5839.0
Df Residuals:,343,BIC:,5858.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1222.4126,215.852,5.663,0.000,797.852,1646.973
PRCP,-3.6299,1.686,-2.153,0.032,-6.946,-0.314
TMIN,-4.4903,1.136,-3.954,0.000,-6.724,-2.256
AWND,3.7740,5.921,0.637,0.524,-7.871,15.419
SNOW,0.6375,3.674,0.174,0.862,-6.589,7.863

0,1,2,3
Omnibus:,241.071,Durbin-Watson:,0.775
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2609.476
Skew:,2.841,Prob(JB):,0.0
Kurtosis:,15.153,Cond. No.,568.0


### Chicago

In [151]:
# Several county names contain "Cook": list them, then keep the Illinois one.
loc = find_county_by_name(county_pattern="Cook")
loc.show()
loc = loc.where("state == 'Illinois'")
loc.show()

+-----+------+------------------+------------------+-------------+------------+
| fips|county|          latitude|         longitude|  location_id|       state|
+-----+------+------------------+------------------+-------------+------------+
|48097| Cooke| 33.63919448852539|-97.21034240722656|7026566496257|       Texas|
|13075|  Cook|31.152515411376953|  -83.429443359375|5446018531328|     Georgia|
|17031|  Cook| 41.89429473876953|-87.64545440673828|3444563771392|    Illinois|
|27031|  Cook|  47.7585563659668|-90.34432220458984|7352984010753|   Minnesota|
|46087|McCook|43.680416107177734|-97.35804748535156|5634997092352|South Dakota|
+-----+------+------------------+------------------+-------------+------------+

+-----+------+-----------------+------------------+-------------+--------+
| fips|county|         latitude|         longitude|  location_id|   state|
+-----+------+-----------------+------------------+-------------+--------+
|17031|  Cook|41.89429473876953|-87.64545440673828|344

In [162]:
# Joined covid / weather frame for the location selected in `loc`.
chicago_full = join_covid_weather(county_location=loc)

In [163]:
# Ordinary least squares fit of `formula` on the Chicago frame.
covid_model = ols(formula=formula, data=chicago_full)
res_covid = covid_model.fit()
res_covid.summary()

0,1,2,3
Dep. Variable:,daily_cases,R-squared:,0.094
Model:,OLS,Adj. R-squared:,0.083
Method:,Least Squares,F-statistic:,9.31
Date:,"Tue, 29 Jun 2021",Prob (F-statistic):,3.6e-07
Time:,10:54:25,Log-Likelihood:,-3114.3
No. Observations:,366,AIC:,6239.0
Df Residuals:,361,BIC:,6258.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1354.4755,215.249,6.293,0.000,931.176,1777.775
PRCP,0.4681,0.692,0.676,0.499,-0.893,1.829
TMIN,-3.7953,0.687,-5.524,0.000,-5.147,-2.444
AWND,3.8250,4.319,0.886,0.376,-4.669,12.319
SNOW,-19.2875,6.693,-2.882,0.004,-32.451,-6.124

0,1,2,3
Omnibus:,86.692,Durbin-Watson:,0.397
Prob(Omnibus):,0.0,Jarque-Bera (JB):,172.939
Skew:,1.255,Prob(JB):,2.8e-38
Kurtosis:,5.246,Cond. No.,441.0


### Phoenix

In [13]:
# Look up the location(s) whose county name contains "Maricopa" and display them.
maricopa = find_county_by_name(county_pattern="Maricopa")
maricopa.show()

+-----+--------+------------------+-------------------+-------------+-------+
| fips|  county|          latitude|          longitude|  location_id|  state|
+-----+--------+------------------+-------------------+-------------+-------+
|04013|Maricopa|33.345176696777344|-112.49893188476562|7919919693824|Arizona|
+-----+--------+------------------+-------------------+-------------+-------+



In [16]:
# Joined covid / weather frame for the location selected in `maricopa`.
maricopa_full = join_covid_weather(county_location=maricopa)

In [21]:
# Ordinary least squares fit of `formula_no_snow` (no SNOW term) on the Maricopa frame.
covid_model = ols(formula=formula_no_snow, data=maricopa_full)
res_covid = covid_model.fit()
res_covid.summary()

0,1,2,3
Dep. Variable:,daily_cases,R-squared:,0.075
Model:,OLS,Adj. R-squared:,0.067
Method:,Least Squares,F-statistic:,9.565
Date:,"Tue, 29 Jun 2021",Prob (F-statistic):,4.35e-06
Time:,11:11:24,Log-Likelihood:,-3166.7
No. Observations:,356,AIC:,6341.0
Df Residuals:,352,BIC:,6357.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2425.7649,309.845,7.829,0.000,1816.385,3035.145
PRCP,-5.5949,3.874,-1.444,0.150,-13.213,2.024
TMIN,-4.9927,1.182,-4.225,0.000,-7.317,-2.669
AWND,-11.5493,11.378,-1.015,0.311,-33.926,10.827

0,1,2,3
Omnibus:,147.327,Durbin-Watson:,1.842
Prob(Omnibus):,0.0,Jarque-Bera (JB):,527.212
Skew:,1.86,Prob(JB):,3.2899999999999997e-115
Kurtosis:,7.659,Cond. No.,626.0


### Dallas

In [17]:
# Locations whose county name contains "Dallas", restricted to Texas.
dallas = (
    find_county_by_name(county_pattern="Dallas")
    .filter("state = 'Texas'")
    .cache()
)
dallas.show()

+-----+------+-----------------+------------------+-------------+-----+
| fips|county|         latitude|         longitude|  location_id|state|
+-----+------+-----------------+------------------+-------------+-----+
|48113|Dallas|32.76698684692383|-96.77842712402344|8194797600768|Texas|
+-----+------+-----------------+------------------+-------------+-----+



In [21]:
# Joined covid / weather frame for the location selected in `dallas`.
dallas_full = join_covid_weather(county_location=dallas)

+-------------+----------+-----------+------------+
|  location_id|      date|daily_cases|daily_deaths|
+-------------+----------+-----------+------------+
|8194797600768|2020-03-10|          2|           0|
|8194797600768|2020-03-11|          1|           0|
|8194797600768|2020-03-12|          5|           0|
|8194797600768|2020-03-13|          1|           0|
|8194797600768|2020-03-14|          2|           0|
|8194797600768|2020-03-15|          3|           0|
|8194797600768|2020-03-16|          0|           0|
|8194797600768|2020-03-17|          9|           0|
|8194797600768|2020-03-18|         11|           0|
|8194797600768|2020-03-19|         22|           1|
|8194797600768|2020-03-20|         18|           0|
|8194797600768|2020-03-21|         21|           0|
|8194797600768|2020-03-22|         37|           1|
|8194797600768|2020-03-23|         27|           0|
|8194797600768|2020-03-24|         10|           3|
|8194797600768|2020-03-25|          0|           1|
|81947976007

In [22]:
# Ordinary least squares fit of `formula_no_snow` (no SNOW term) on the Dallas frame.
covid_model = ols(formula=formula_no_snow, data=dallas_full)
res_covid = covid_model.fit()
res_covid.summary()

0,1,2,3
Dep. Variable:,daily_cases,R-squared:,0.153
Model:,OLS,Adj. R-squared:,0.145
Method:,Least Squares,F-statistic:,18.95
Date:,"Tue, 29 Jun 2021",Prob (F-statistic):,2.55e-11
Time:,12:07:09,Log-Likelihood:,-2585.1
No. Observations:,318,AIC:,5178.0
Df Residuals:,314,BIC:,5193.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1382.6472,152.427,9.071,0.000,1082.740,1682.554
PRCP,-0.7341,0.379,-1.935,0.054,-1.481,0.012
TMIN,-4.1254,0.567,-7.279,0.000,-5.241,-3.010
AWND,1.9942,3.503,0.569,0.570,-4.897,8.886

0,1,2,3
Omnibus:,144.229,Durbin-Watson:,1.865
Prob(Omnibus):,0.0,Jarque-Bera (JB):,850.234
Skew:,1.798,Prob(JB):,2.3700000000000002e-185
Kurtosis:,10.158,Cond. No.,615.0


### Seattle

In [14]:
# Locations whose county name contains "King", restricted to Washington state.
seattle = find_county_by_name(county_pattern="King").filter("state = 'Washington'")
seattle.show()

+-----+------+-----------------+-------------------+-------------+----------+
| fips|county|         latitude|          longitude|  location_id|     state|
+-----+------+-----------------+-------------------+-------------+----------+
|53033|  King|47.49055099487305|-121.83397674560547|7808250544128|Washington|
+-----+------+-----------------+-------------------+-------------+----------+



In [18]:
# Joined covid / weather frame for the location selected in `seattle`.
seattle_full = join_covid_weather(county_location=seattle)

+-------------+----------+-----------+------------+
|  location_id|      date|daily_cases|daily_deaths|
+-------------+----------+-----------+------------+
|7808250544128|2020-02-28|          1|           0|
|7808250544128|2020-02-29|          3|           1|
|7808250544128|2020-03-01|          7|           2|
|7808250544128|2020-03-02|          4|           3|
|7808250544128|2020-03-03|          7|           4|
|7808250544128|2020-03-04|         11|           1|
|7808250544128|2020-03-05|         19|           0|
|7808250544128|2020-03-06|          7|           1|
|7808250544128|2020-03-07|         13|           4|
|7808250544128|2020-03-08|         19|           2|
|7808250544128|2020-03-09|         26|           3|
|7808250544128|2020-03-10|         74|           2|
|7808250544128|2020-03-11|         44|           4|
|7808250544128|2020-03-12|         36|           1|
|7808250544128|2020-03-13|         58|           5|
|7808250544128|2020-03-14|         60|           3|
|78082505441

In [20]:
# Ordinary least squares fit of `formula` on the Seattle frame.
covid_model = ols(formula=formula, data=seattle_full)
res_covid = covid_model.fit()
res_covid.summary()

0,1,2,3
Dep. Variable:,daily_cases,R-squared:,0.167
Model:,OLS,Adj. R-squared:,0.157
Method:,Least Squares,F-statistic:,16.09
Date:,"Tue, 29 Jun 2021",Prob (F-statistic):,5.08e-12
Time:,13:48:59,Log-Likelihood:,-2276.3
No. Observations:,326,AIC:,4563.0
Df Residuals:,321,BIC:,4581.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,340.3900,44.510,7.647,0.000,252.822,427.958
PRCP,0.9774,0.209,4.677,0.000,0.566,1.388
TMIN,-1.5750,0.312,-5.056,0.000,-2.188,-0.962
AWND,-3.1027,1.702,-1.823,0.069,-6.452,0.247
SNOW,0.0072,1.583,0.005,0.996,-3.107,3.122

0,1,2,3
Omnibus:,254.623,Durbin-Watson:,1.424
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4824.03
Skew:,3.066,Prob(JB):,0.0
Kurtosis:,20.82,Cond. No.,286.0
