In [1]:
import pandas as pd
import numpy as np
from xgboost import plot_importance, plot_tree
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os
from google.cloud import bigquery
import time

In [2]:
train = pd.read_csv("../input/covid19-global-forecasting-week-2/train.csv")
test = pd.read_csv("../input/covid19-global-forecasting-week-2/test.csv")
sub = pd.read_csv("../input/covid19-global-forecasting-week-2/submission.csv")

In [3]:
df_smoking = pd.read_csv('../input/smokingstats/share-of-adults-who-smoke.csv')
df_smoking.head()

Unnamed: 0,Entity,Code,Year,"Smoking prevalence, total (ages 15+) (% of adults)"
0,Albania,ALB,2000,34.8
1,Albania,ALB,2005,32.7
2,Albania,ALB,2010,31.2
3,Albania,ALB,2011,30.7
4,Albania,ALB,2012,30.2


In [4]:
# extract newest data
df_smoking_recent = df_smoking.sort_values('Year', ascending=False).reset_index(drop=True)
df_smoking_recent = df_smoking_recent[df_smoking_recent['Entity'].duplicated()==False]
df_smoking_recent['Country/Region'] = df_smoking_recent['Entity']
df_smoking_recent['SmokingRate'] = df_smoking_recent['Smoking prevalence, total (ages 15+) (% of adults)']
df_smoking_recent['Country_Region'] = df_smoking_recent['Country/Region']
df_smoking_recent.head()

Unnamed: 0,Entity,Code,Year,"Smoking prevalence, total (ages 15+) (% of adults)",Country/Region,SmokingRate,Country_Region
0,Zimbabwe,ZWE,2016,15.8,Zimbabwe,15.8,Zimbabwe
1,Namibia,NAM,2016,21.4,Namibia,21.4,Namibia
2,Suriname,SUR,2016,25.0,Suriname,25.0,Suriname
3,Morocco,MAR,2016,23.4,Morocco,23.4,Morocco
4,India,IND,2016,11.5,India,11.5,India


In [5]:
# merge
train = pd.merge(train, df_smoking_recent[['Country_Region', 'SmokingRate']], on='Country_Region', how='left')
test = pd.merge(test, df_smoking_recent[['Country_Region', 'SmokingRate']], on='Country_Region', how='left')

In [6]:
%%time
client = bigquery.Client()
dataset_ref = client.dataset("noaa_gsod", project="bigquery-public-data")
dataset = client.get_dataset(dataset_ref)

tables = list(client.list_tables(dataset))

table_ref = dataset_ref.table("stations")
table = client.get_table(table_ref)
stations_df = client.list_rows(table).to_dataframe()

table_ref = dataset_ref.table("gsod2020")
table = client.get_table(table_ref)
twenty_twenty_df = client.list_rows(table).to_dataframe()

stations_df['STN'] = stations_df['usaf'] + '-' + stations_df['wban']
twenty_twenty_df['STN'] = twenty_twenty_df['stn'] + '-' + twenty_twenty_df['wban']

cols_1 = ['STN', 'mo', 'da', 'temp', 'min', 'max', 'stp', 'wdsp', 'prcp', 'fog']
cols_2 = ['STN', 'country', 'state', 'call', 'lat', 'lon', 'elev']
weather_df = twenty_twenty_df[cols_1].join(stations_df[cols_2].set_index('STN'), on='STN')

weather_df.tail(10)

Using Kaggle's public dataset BigQuery integration.
CPU times: user 2min 25s, sys: 4.81 s, total: 2min 30s
Wall time: 5min 48s


Unnamed: 0,STN,mo,da,temp,min,max,stp,wdsp,prcp,fog,country,state,call,lat,lon,elev
1009248,726605-00386,1,28,34.4,30.2,41.0,880.2,8.2,99.99,1,US,SD,KSPF,44.483,-103.783,1198.2
1009249,726605-00386,1,30,33.9,28.4,37.4,876.8,7.7,99.99,1,US,SD,KSPF,44.483,-103.783,1198.2
1009250,726605-00386,2,7,28.0,26.6,30.2,871.4,12.4,99.99,1,US,SD,KSPF,44.483,-103.783,1198.2
1009251,726605-00386,2,12,26.6,10.4,35.6,875.9,11.8,99.99,1,US,SD,KSPF,44.483,-103.783,1198.2
1009252,726605-00386,2,27,34.0,30.2,39.2,882.4,12.0,99.99,1,US,SD,KSPF,44.483,-103.783,1198.2
1009253,726605-00386,3,3,35.5,30.2,41.0,870.1,13.4,99.99,1,US,SD,KSPF,44.483,-103.783,1198.2
1009254,726605-00386,3,5,34.4,30.2,42.8,885.2,13.4,99.99,1,US,SD,KSPF,44.483,-103.783,1198.2
1009255,726605-00386,3,12,35.5,28.4,55.4,875.8,11.8,99.99,1,US,SD,KSPF,44.483,-103.783,1198.2
1009256,724084-54760,1,18,23.9,19.0,34.0,27.1,4.4,0.0,1,US,NJ,KBLM,40.183,-74.133,48.5
1009257,724084-54760,2,29,31.3,27.0,42.1,6.0,10.7,0.0,1,US,NJ,KBLM,40.183,-74.133,48.5


In [7]:
from scipy.spatial.distance import cdist

weather_df['day_from_jan_first'] = (weather_df['da'].apply(int)
                                   + 31*(weather_df['mo']=='02') 
                                   + 60*(weather_df['mo']=='03')
                                   + 91*(weather_df['mo']=='04')  
                                   )

mo = train['Date'].apply(lambda x: x[5:7])
da = train['Date'].apply(lambda x: x[8:10])
train['day_from_jan_first'] = (da.apply(int)
                               + 31*(mo=='02') 
                               + 60*(mo=='03')
                               + 91*(mo=='04')  
                              )

C = []
for j in train.index:
    df = train.iloc[j:(j+1)]
    mat = cdist(df[['Lat','Long', 'day_from_jan_first']],
                weather_df[['lat','lon', 'day_from_jan_first']], 
                metric='euclidean')
    new_df = pd.DataFrame(mat, index=df.Id, columns=weather_df.index)
    arr = new_df.values
    new_close = np.where(arr == np.nanmin(arr, axis=1)[:,None],new_df.columns,False)
    L = [i[i.astype(bool)].tolist()[0] for i in new_close]
    C.append(L[0])
    
train['closest_station'] = C

train = train.set_index('closest_station').join(weather_df[['temp', 'min', 'max', 'stp', 'wdsp', 'prcp', 'fog']], ).reset_index().drop(['index'], axis=1)
train.sort_values(by=['Id'], inplace=True)
train.head()

KeyError: "['Long', 'Lat'] not in index"

In [8]:
weather_df['day_from_jan_first'] = (weather_df['da'].apply(int)
                                   + 31*(weather_df['mo']=='02') 
                                   + 60*(weather_df['mo']=='03')
                                   + 91*(weather_df['mo']=='04')  
                                   )

mo = test['Date'].apply(lambda x: x[5:7])
da = test['Date'].apply(lambda x: x[8:10])
test['day_from_jan_first'] = (da.apply(int)
                               + 31*(mo=='02') 
                               + 60*(mo=='03')
                               + 91*(mo=='04')  
                              )

C = []
for j in test.index:
    df = test.iloc[j:(j+1)]
    mat = cdist(df[['Lat','Long', 'day_from_jan_first']],
                weather_df[['lat','lon', 'day_from_jan_first']], 
                metric='euclidean')
    new_df = pd.DataFrame(mat, index=df.ForecastId, columns=weather_df.index)
    arr = new_df.values
    new_close = np.where(arr == np.nanmin(arr, axis=1)[:,None],new_df.columns,False)
    L = [i[i.astype(bool)].tolist()[0] for i in new_close]
    C.append(L[0])
    
test['closest_station'] = C

test = test.set_index('closest_station').join(weather_df[['temp', 'min', 'max', 'stp', 'wdsp', 'prcp', 'fog']], ).reset_index().drop(['index'], axis=1)
test.sort_values(by=['ForecastId'], inplace=True)
test.head()

KeyError: "['Long', 'Lat'] not in index"

In [9]:
train["wdsp"] = pd.to_numeric(train["wdsp"])
test["wdsp"] = pd.to_numeric(test["wdsp"])
train["fog"] = pd.to_numeric(train["fog"])
test["fog"] = pd.to_numeric(test["fog"])

KeyError: 'wdsp'

In [10]:
#drop cols
X_train = train.drop(['ConfirmedCases','Fatalities','Id'],1)

In [11]:
# make date a datetime object
X_train['Date'] = pd.to_datetime(X_train['Date'])
test['Date'] = pd.to_datetime(test['Date'])

In [12]:
def create_time_features(df):
    """
    Creates time series features from datetime index
    """
    df['hour'] = df['Date'].dt.hour
    df['dayofweek'] = df['Date'].dt.dayofweek
    df['quarter'] = df['Date'].dt.quarter
    df['month'] = df['Date'].dt.month
    df['year'] = df['Date'].dt.year
    df['dayofyear'] = df['Date'].dt.dayofyear
    df['dayofmonth'] = df['Date'].dt.day
    df['weekofyear'] = df['Date'].dt.weekofyear
    
    X = df[['hour','dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear']]
    return X

In [13]:
create_time_features(X_train)
create_time_features(test)

Unnamed: 0,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
0,0,3,1,3,2020,79,19,12
1,0,4,1,3,2020,80,20,12
2,0,5,1,3,2020,81,21,12
3,0,6,1,3,2020,82,22,12
4,0,0,1,3,2020,83,23,13
...,...,...,...,...,...,...,...,...
12637,0,6,2,4,2020,117,26,17
12638,0,0,2,4,2020,118,27,18
12639,0,1,2,4,2020,119,28,18
12640,0,2,2,4,2020,120,29,18


In [14]:
X_train.drop("Date", axis=1, inplace=True)
test.drop("Date", axis=1, inplace=True)

In [15]:
X_train = pd.concat([X_train,pd.get_dummies(X_train['Province_State'], prefix='ps')],axis=1)
X_train.drop(['Province_State'],axis=1, inplace=True)
test = pd.concat([test,pd.get_dummies(test['Province_State'], prefix='ps')],axis=1)
test.drop(['Province_State'],axis=1, inplace=True)

X_train = pd.concat([X_train,pd.get_dummies(X_train['Country_Region'], prefix='cr')],axis=1)
X_train.drop(['Country_Region'],axis=1, inplace=True)
test = pd.concat([test,pd.get_dummies(test['Country_Region'], prefix='cr')],axis=1)
test.drop(['Country_Region'],axis=1, inplace=True)

In [16]:
y_train_fat = train['Fatalities']
y_train_cc = train['ConfirmedCases']

In [17]:
xgb_fat = xgb.XGBRegressor(n_estimators=1000)

In [18]:
xgb_fat.fit(X_train,y_train_fat,verbose=True)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)

In [19]:
xgb_cc = xgb.XGBRegressor(n_estimators=1000)

In [20]:
xgb_cc.fit(X_train,y_train_cc,verbose=True)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)

In [21]:
fat_pred = xgb_fat.predict(test.drop('ForecastId',1))
cc_pred = xgb_cc.predict(test.drop('ForecastId',1))

In [22]:
sub['ConfirmedCases'] = cc_pred
sub['Fatalities'] = fat_pred
sub.head()

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,19.626612,-0.076131
1,2,20.653229,-0.001374
2,3,19.360353,-0.008848
3,4,37.710598,0.963651
4,5,39.360573,0.875279


In [23]:
sub.loc[sub['Fatalities'] < 0, 'Fatalities'] = 0
sub['Fatalities'] = np.round(sub['Fatalities'],0)
sub.head()

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,19.626612,0.0
1,2,20.653229,0.0
2,3,19.360353,0.0
3,4,37.710598,1.0
4,5,39.360573,1.0


In [24]:
sub.to_csv('submission.csv',index=False)

In [25]:
test_sub = pd.read_csv('submission.csv')
test_sub.head()

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,19.626612,0.0
1,2,20.653229,0.0
2,3,19.360353,0.0
3,4,37.710598,1.0
4,5,39.360573,1.0
