In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In [None]:
import json
from pathlib import Path
from google.cloud import bigquery

import matplotlib.pyplot as plt
from matplotlib import colors

import seaborn as sns

import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

> **What is the weather like in the regions that were most affected by Covid? 
Can we draw any conclusion about which weather conditions favored the spread of Covid? 
The Covid week 1 dataset provides geographical information (latitude and longitude) and the number of Fatalities and Confirmed Cases in 284 locations from 163 countries across the world from 22-Jan-2020 to 24-March-2020 (once a day for each lat and lon position). In the future I could address this using a time series approach; in this notebook I used a RandomForest predictive model.**
> **Because I am treating the Covid information as non-serial (a serial analysis would need other approaches), the ConfirmedCases and Fatalities per day are treated as non-cumulative and assumed independent.**

In [None]:
# Load the week-1 training set and rename the coordinate columns to the
# lowercase `lat` / `lon` names that the rest of the notebook works with.
TRAIN_CSV = '/kaggle/input/covid19-global-forecasting-week-1/train.csv'
train = pd.read_csv(TRAIN_CSV).rename(columns={"Lat": "lat", "Long": "lon"})
train

In [None]:
# Count the non-null `Id` values recorded for each country/region.
train.groupby("Country/Region")['Id'].agg(['count'])

In [None]:
# Day of the year (1 = 1 January) for every Covid record. This column is the
# time key used later to join the Covid rows with the daily weather data.
# `dt.dayofyear` is calendar-aware (2020 is a leap year), so unlike a
# hand-written table of month offsets it stays correct for dates after April.
# The explicit int64 cast keeps the dtype independent of the pandas version.
train['day_from_jan_first'] = (
    pd.to_datetime(train['Date']).dt.dayofyear.astype('int64')
)

train = train.rename(columns={'Country/Region': 'Country'})

# Countries/regions that have a record on day 80 (20 March 2020).
list(train.loc[train['day_from_jan_first'] == 80, 'Country'])

In [None]:
# One point per Covid record (x = longitude, y = latitude).
geom = [Point(xy) for xy in zip(train['lon'], train['lat'])]

# Coordinates are plain latitude/longitude (EPSG:4326). The older
# {'init': 'epsg:4326'} dict spelling is deprecated, so the authority
# string is used instead.
crs = 'EPSG:4326'
geo_df = gpd.GeoDataFrame(train, crs=crs, geometry=geom)

fig, ax = plt.subplots(figsize=(15, 15))
geo_df.plot(ax=ax, markersize=20, marker="o")
ax.set(title="Covid reporting locations", xlabel="longitude", ylabel="latitude");

> **BigQuery Public Data offers the NOAA_GSOD database with weather information (gsod2020 table) recorded at 29745 stations worldwide (stations table). Among the GSOD measures I selected 6: mean temperature of the day, mean dew point, mean sea-level pressure, mean wind speed, total precipitation and snow depth.**

In [None]:
from google.cloud import bigquery

# Set your own project id here
PROJECT_ID = 'your-google-cloud-project'

# Client bound to that project; the cells below use it to read the NOAA tables.
client = bigquery.Client(project=PROJECT_ID)

In [None]:
# Columns to read from the NOAA GSOD stations table, as (name, BigQuery type).
STATION_FIELDS = [
    ("usaf", "STRING"),     # station number (World Meteorological Organization)
    ("wban", "STRING"),     # WBAN number (Weather Bureau Army Navy)
    ("country", "STRING"),
    ("lat", "FLOAT"),
    ("lon", "FLOAT"),
]

table1_stations = bigquery.TableReference.from_string(
    "bigquery-public-data.noaa_gsod.stations"
)

# Read only the selected columns and materialise them as a pandas DataFrame.
stations_rows = client.list_rows(
    table1_stations,
    selected_fields=[
        bigquery.SchemaField(field_name, field_type)
        for field_name, field_type in STATION_FIELDS
    ],
)
dataframe_stations = stations_rows.to_dataframe()

dataframe_stations

In [None]:
# Columns to read from the 2020 daily-summary table, as (name, BigQuery type).
GSOD_FIELDS = [
    ("stn", "STRING"),     # station number
    ("wban", "STRING"),    # station number
    ("year", "INTEGER"),
    ("mo", "INTEGER"),
    ("da", "INTEGER"),
    ("temp", "FLOAT"),     # mean temperature of the day
    ("dewp", "FLOAT"),     # mean dew point
    ("slp", "FLOAT"),      # mean sea-level pressure
    ("wdsp", "FLOAT"),     # mean wind speed
    ("prcp", "FLOAT"),     # total precipitation
    ("sndp", "FLOAT"),     # snow depth
]

table1_gsod2020 = bigquery.TableReference.from_string(
    "bigquery-public-data.noaa_gsod.gsod2020"
)

# Read only the selected columns and materialise them as a pandas DataFrame.
gsod_rows = client.list_rows(
    table1_gsod2020,
    selected_fields=[
        bigquery.SchemaField(field_name, field_type)
        for field_name, field_type in GSOD_FIELDS
    ],
)
dataframe_gsod2020 = gsod_rows.to_dataframe()

dataframe_gsod2020

In [None]:
# Shorter working names. These are aliases, not copies: each name refers to
# the very same DataFrame object as the one downloaded above.
stations_df, twenty_twenty_df = dataframe_stations, dataframe_gsod2020

> **I merged the two NOAA_GSOD database tables on their common station identifiers (`usaf`/`stn` and `wban`).**

> **Then I created two new weather measures: the relative humidity (`rh`) and the absolute humidity (`ah`), the latter derived from the actual vapour pressure (in pascals). For the source of these measures refer to the notebook https://www.kaggle.com/davidbnn92/weather-data/, which is the inspiration for this analysis!**

> **I will use the new column `day_from_jan_first` to merge the new weather table with the Covid dataset.**

In [None]:
# --- Join the daily observations with the station metadata -------------------
# A station is identified by the pair (usaf/stn, wban); build one string key
# per table so the two can be joined on a single column.
stations_df['STN'] = stations_df['usaf'] + '-' + stations_df['wban']
twenty_twenty_df['STN'] = twenty_twenty_df['stn'] + '-' + twenty_twenty_df['wban']

# Left join: every daily observation is kept and receives the country and
# coordinates of its station. Both tables carry a `wban` column, which comes
# out of the join as `wban_left` / `wban_right`.
weather_df = twenty_twenty_df.join(
    stations_df.set_index('STN'),
    on='STN',
    how='left',
    lsuffix='_left',
    rsuffix='_right',
)

# --- Replace the "missing" placeholder values with NaN ------------------------
# GSOD encodes missing measurements as numeric sentinels. For precipitation the
# documented sentinel is 99.99; 999.9 is still mapped to NaN as well so that
# nothing the previous version of this cell treated as missing slips through.
MISSING_CODES = {
    'temp': [9999.9],
    'slp': [9999.9],
    'dewp': [9999.9],
    'wdsp': [999.9],
    'prcp': [99.99, 999.9],
    'sndp': [999.9],
}
for measure, codes in MISSING_CODES.items():
    weather_df[measure] = weather_df[measure].replace(codes, np.nan)

# --- Derived humidity measures --------------------------------------------------
# convert everything into celsius (the table stores temperatures in Fahrenheit)
temp = (weather_df['temp'] - 32) / 1.8
dewp = (weather_df['dewp'] - 32) / 1.8

# Relative humidity = actual vapour pressure (computed from the dew point)
# divided by saturation vapour pressure (computed from the air temperature).
# The leading constant 6.1121 is common to both and cancels out.
actual_vp_factor = np.exp((18.678 * dewp) / (257.14 + dewp))
saturation_vp_factor = np.exp((18.678 * temp) / (257.14 + temp))
weather_df['rh'] = actual_vp_factor / saturation_vp_factor

# Absolute humidity from the gas law of vapour:
#   ah = mass / volume = pressure / (constant * temperature)
# with the actual vapour pressure in pascals (6.1121 hPa * 100) and 461.5
# presumably the specific gas constant of water vapour in J/(kg*K).
# The gas law needs the ABSOLUTE temperature, so Celsius is shifted to kelvin;
# dividing by the Celsius value would give inf at 0 degrees C and negative
# humidities below freezing.
weather_df['ah'] = (actual_vp_factor * 6.1121 * 100) / (461.5 * (temp + 273.15))

# --- Calendar columns -----------------------------------------------------------
# pd.to_datetime assembles a date from columns named year / month / day.
weather_df['month'] = weather_df['mo']
weather_df['day'] = weather_df['da']
weather_df['Date'] = pd.to_datetime(weather_df[['year', 'month', 'day']])
weather_df['Date2'] = weather_df['Date'].astype('str')

# Day of the year (1 = 1 January): the key shared with the Covid table.
# Calendar-aware, so it remains valid for observations after April.
weather_df['day_from_jan_first'] = weather_df['Date'].dt.dayofyear.astype('int64')


In [None]:
# One point per weather observation (x = longitude, y = latitude).
geom = [Point(xy) for xy in zip(weather_df['lon'], weather_df['lat'])]

# Plain latitude/longitude coordinates (EPSG:4326); the older
# {'init': 'epsg:4326'} dict spelling is deprecated.
crs = 'EPSG:4326'
geo_df = gpd.GeoDataFrame(weather_df, crs=crs, geometry=geom)

fig, ax = plt.subplots(figsize=(15, 15))
geo_df.plot(ax=ax, markersize=20, marker="o")
ax.set(title="NOAA GSOD weather stations", xlabel="longitude", ylabel="latitude");

> **The level of geospatial granularity of the Covid table is not as fine as that of the weather table. 
Therefore, I am selecting from the weather table only the locations present in the Covid dataset: THIS APPROACH MIGHT LEAD TO INACCURATE PREDICTIONS WHEN THE COVID AREAS ARE VERY BROAD (COVERING DIFFERENT WEATHER REGIONS).**

> **Also, I am extracting from the weather table only the days covered by the Covid data.**

In [None]:
# Rows without coordinates cannot be matched to a location.
weather_df = weather_df.dropna(subset=['lat', 'lon']).reset_index(drop=True)
train = train.dropna(subset=['lat', 'lon']).reset_index(drop=True)

# Locations are matched on whole degrees: both tables get their coordinates
# truncated to integers (astype(int) drops the fractional part).
for frame in (weather_df, train):
    frame['lat'] = frame['lat'].astype(int)
    frame['lon'] = frame['lon'].astype(int)

# Several stations can fall into the same whole-degree cell on the same day.
# Merging them directly would repeat every Covid row once per station, which
# inflates the dataset and puts copies of the same Covid observation on both
# sides of a later train/test split. Average the weather measures per
# (cell, day) first so each Covid row is matched with at most one weather row.
merge_keys = ['lat', 'lon', 'day_from_jan_first']
weather_measures = ['temp', 'dewp', 'slp', 'wdsp', 'prcp', 'sndp', 'rh', 'ah']
weather_per_cell_day = (
    weather_df
    .groupby(merge_keys, as_index=False)[weather_measures]
    .mean()
)

# Left merge keeps every Covid row; `validate` raises if the weather side
# unexpectedly contains more than one row per key.
CovidWeather = train.merge(
    weather_per_cell_day,
    on=merge_keys,
    how='left',
    validate='many_to_one',
)
CovidWeather

> **Now I start creating the features and outcome split **

In [None]:
columns_X = ["lat", "lon", "temp", "dewp", "slp", "wdsp", "prcp", "sndp", "rh", "ah"]
columns_y = ["ConfirmedCases", "Fatalities"]

# Features and outcomes in one frame, with infinite values turned into NaN.
# `replace` without `inplace` returns a new frame, so this neither modifies
# CovidWeather nor operates on a slice (no SettingWithCopyWarning).
weather_PerDay2 = (
    CovidWeather[columns_X + columns_y]
    .replace([np.inf, -np.inf], np.nan)
)

# Feature / target views used by the modelling cells. They are taken from the
# cleaned frame above (not from CovidWeather) so the inf -> NaN cleanup is
# actually reflected in the model inputs.
# NOTE: these are plain Python attributes attached to the DataFrame object;
# pandas may emit a UserWarning here and the attributes do not survive copies.
weather_PerDay2.data = weather_PerDay2[columns_X]
weather_PerDay2.target = weather_PerDay2[columns_y]

> **I also do the train/test split on the resulting dataset. I made a lot of changes to the train dataset to integrate it with the weather tables, and I do not feel like repeating those changes on the Test data provided by Kaggle.**

> **I impute and scale the data and proceed with RandomForestRegressor**

> **Finally I list the most important features**

In [None]:
# Hold out part of the merged dataset for evaluation (fixed seed for
# reproducibility).
X_train_full, X_test, y_train_full, y_test = train_test_split(
    weather_PerDay2.data, weather_PerDay2.target, random_state=42)

# Infinite values cannot be imputed or scaled; treat them as missing.
X_train_full = X_train_full.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Mean imputation. The imputer learns the column means from the TRAINING data
# only and applies those same means to the test data; fitting a second imputer
# on the test set would leak test information into the preprocessing and make
# the two sets inconsistent.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train_full = imputer.fit_transform(X_train_full)
X_test = imputer.transform(X_test)

# Standardise with statistics learned from the training data only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_full)
X_test_scaled = scaler.transform(X_test)

# Seeded so the fitted forest and its importances are the same on every run.
rnd_reg = RandomForestRegressor(random_state=42)
rnd_reg.fit(X_train_scaled, y_train_full)

# One importance value per feature, in the column order of the training data.
importances = list(rnd_reg.feature_importances_)
importances


> **As you can see longitude is the most important feature followed by dewpoint and latitude **

> **the only weather-related predictor is dewpoint which represents the temperature to which air must be cooled to become saturated with water vapor. When further cooled, the airborne water vapour will condense to form liquid water.**

>**Given the information that we have about the transmission of Coronavirus-19 it seems reasonable that the dew point level might be related to Covid spread**

> **Of course, we understand that trying to connect the daily dew point to the same day's Covid cases is inappropriate: most likely the Confirmed Cases are the result of exposure to the virus several days earlier, and the Fatalities of exposure several weeks earlier.**

> **Let's indulge with this inappropriate analysis decision for the sake of learning and immediate rewarding!**

In [None]:
# Pair each feature name with its importance rounded to two decimals, keeping
# the column order of the feature frame.
feature_list = list(weather_PerDay2.data.columns)
rounded_importances = (round(value, 2) for value in importances)
feature_importances = list(zip(feature_list, rounded_importances))
feature_importances

> **The mean squared error is below**
> **Please help me out in understanding if this is acceptable or not**

In [None]:
from sklearn.metrics import mean_squared_error

# The forest was fitted on the standardised training features, so it has to be
# evaluated on the standardised test features too. `X_test` holds the imputed
# but UNSCALED values, which the learned split thresholds do not apply to.
y_pred = rnd_reg.predict(X_test_scaled)
mean_squared_error(y_test, y_pred)

> **The following figures are trying to visually show the dew point in areas that were more exposed to Covid. **

In [None]:
# One scatter plot per outcome, each against the dew point, shown one after
# the other.
for outcome in ("ConfirmedCases", "Fatalities"):
    plt.scatter(weather_PerDay2["dewp"], weather_PerDay2[outcome])
    plt.show()

> **Which locations worldwide have dew point values between 30 and 60 °F? Answer: The places with a critical dew point are spread all over the globe but some places are safe :**

> **This visualization includes all the locations worldwide that have a dew point between 30 and 60 °F.**

In [None]:
# Observations whose dew point lies in the 30-60 range (both ends included;
# missing dew points are excluded). `between` is the vectorised equivalent of
# testing every value with a Python lambda.
Selectdf = weather_df[weather_df['dewp'].between(30, 60)]

# Plain latitude/longitude coordinates (EPSG:4326); the older
# {'init': 'epsg:4326'} dict spelling is deprecated.
crs = 'EPSG:4326'
geom_Selectdf = [Point(xy) for xy in zip(Selectdf['lon'], Selectdf['lat'])]
geo_Selectdf = gpd.GeoDataFrame(Selectdf, crs=crs, geometry=geom_Selectdf)  # the ones with critical dew point

geom = [Point(xy) for xy in zip(weather_df['lon'], weather_df['lat'])]
geo_df = gpd.GeoDataFrame(weather_df, crs=crs, geometry=geom)  # all observations

# Blue: every observation; red (drawn on top): dew point within 30-60.
fig, ax = plt.subplots(figsize=(15, 15))
geo_df.plot(ax=ax, markersize=20, color='b', marker="o")
geo_Selectdf.plot(ax=ax, markersize=20, color='r', marker="o")
ax.set(title="Weather observations with dew point between 30 and 60 (red)",
       xlabel="longitude", ylabel="latitude");



> **This visualization includes all the Covid locations worldwide that have a dew point between 30 and 60 °F.**

> **If you are planning a vacation (once the travel ban is lifted), take this visualization into account.**

In [None]:
# Covid rows whose dew point lies in the 30-60 range (both ends included;
# missing dew points are excluded).
Selectdewp = weather_PerDay2[weather_PerDay2['dewp'].between(30, 60)]

# Defined here so the cell does not depend on a `crs` variable left over from
# an earlier cell. Plain latitude/longitude coordinates (EPSG:4326); the older
# {'init': 'epsg:4326'} dict spelling is deprecated.
crs = 'EPSG:4326'

geom_Selectdewp = [Point(xy) for xy in zip(Selectdewp['lon'], Selectdewp['lat'])]
geo_Selectdewp = gpd.GeoDataFrame(Selectdewp, crs=crs, geometry=geom_Selectdewp)  # the ones with critical dew point

geom = [Point(xy) for xy in zip(weather_PerDay2['lon'], weather_PerDay2['lat'])]
geo_df = gpd.GeoDataFrame(weather_PerDay2, crs=crs, geometry=geom)  # all dataset

# Blue: every Covid location; red (drawn on top): dew point within 30-60.
fig, ax = plt.subplots(figsize=(15, 15))
geo_df.plot(ax=ax, markersize=20, color='b', marker="o")
geo_Selectdewp.plot(ax=ax, markersize=20, color='r', marker="o")
ax.set(title="Covid locations with dew point between 30 and 60 (red)",
       xlabel="longitude", ylabel="latitude");

> ** Let's tune some hyperparameters using GridSearch**

In [None]:
# Hyperparameter combinations to try (2 x 2 = 4 candidates).
param_grid = [{"n_estimators": [9, 10], "max_features": [7, 10]}]

# scikit-learn maximises scores, hence the negated mean squared error.
score = 'neg_mean_squared_error'

# Seeded so the cross-validation scores are the same on every run; without a
# seed, differences between candidates can be pure random noise.
classifier = RandomForestRegressor(random_state=42)
gridsearch = GridSearchCV(classifier, param_grid, scoring=score, cv=5)
my_model = gridsearch.fit(X_train_scaled, y_train_full)

# Report the cross-validated RMSE of every candidate
# (sqrt of the sign-flipped negated MSE).
cv_results = gridsearch.cv_results_
for mean_score, params in zip(cv_results["mean_test_score"], cv_results["params"]):
    print(np.sqrt(-mean_score), params)


In [None]:
from sklearn.metrics import mean_squared_error

# The grid search was fitted on the standardised training features, so the
# tuned model must be evaluated on the standardised test features as well;
# `X_test` holds the unscaled values.
y_pred = my_model.predict(X_test_scaled)
mean_squared_error(y_test, y_pred)
    

> **Even after tuning these hyperparameters, the mean squared error is similar to the one above.**