In [1]:
import pandas as pd
import numpy as np
from xgboost import plot_importance, plot_tree
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os
from google.cloud import bigquery

In [2]:
train = pd.read_csv("../input/covid19-global-forecasting-week-1/train.csv")
test = pd.read_csv("../input/covid19-global-forecasting-week-1/test.csv")
sub = pd.read_csv("../input/covid19-global-forecasting-week-1/submission.csv")

# Feature Engineering
---

In [3]:
%%time
client = bigquery.Client()
dataset_ref = client.dataset("noaa_gsod", project="bigquery-public-data")
dataset = client.get_dataset(dataset_ref)

tables = list(client.list_tables(dataset))

table_ref = dataset_ref.table("stations")
table = client.get_table(table_ref)
stations_df = client.list_rows(table).to_dataframe()

table_ref = dataset_ref.table("gsod2020")
table = client.get_table(table_ref)
twenty_twenty_df = client.list_rows(table).to_dataframe()

stations_df['STN'] = stations_df['usaf'] + '-' + stations_df['wban']
twenty_twenty_df['STN'] = twenty_twenty_df['stn'] + '-' + twenty_twenty_df['wban']

cols_1 = ['STN', 'mo', 'da', 'temp', 'min', 'max', 'stp', 'wdsp', 'prcp', 'fog']
cols_2 = ['STN', 'country', 'state', 'call', 'lat', 'lon', 'elev']
weather_df = twenty_twenty_df[cols_1].join(stations_df[cols_2].set_index('STN'), on='STN')

weather_df.tail(10)

Using Kaggle's public dataset BigQuery integration.
CPU times: user 2min 14s, sys: 4.35 s, total: 2min 18s
Wall time: 5min 6s


Unnamed: 0,STN,mo,da,temp,min,max,stp,wdsp,prcp,fog,country,state,call,lat,lon,elev
929644,477040-99999,1,12,44.1,42.8,48.2,999.9,7.7,99.99,1,JA,,RJNK,36.395,136.407,11.0
929645,477040-99999,1,19,45.0,37.4,51.8,999.9,7.0,99.99,1,JA,,RJNK,36.395,136.407,11.0
929646,477040-99999,2,8,37.3,32.0,44.6,999.9,13.8,99.99,1,JA,,RJNK,36.395,136.407,11.0
929647,477040-99999,2,17,38.9,32.0,44.6,999.9,17.0,99.99,1,JA,,RJNK,36.395,136.407,11.0
929648,477040-99999,3,15,41.6,33.8,53.6,999.9,10.9,99.99,1,JA,,RJNK,36.395,136.407,11.0
929649,476040-99999,3,5,39.8,37.4,46.8,13.3,12.6,0.12,1,JA,,,37.9,139.017,5.7
929650,710790-99999,1,5,5.1,2.3,6.8,980.8,5.5,0.18,1,CA,,CYTH,55.801,-97.864,222.2
929651,711210-99999,2,20,22.0,12.2,37.4,935.2,7.5,0.02,1,CA,,CYED,53.667,-113.467,688.0
929652,722700-23044,2,11,48.4,36.0,60.1,877.6,6.9,0.19,1,US,TX,KELP,31.811,-106.376,1194.2
929653,722593-03985,1,10,68.1,66.2,69.8,984.7,12.6,99.99,1,US,TX,KFWS,32.565,-97.308,213.4


In [4]:
from scipy.spatial.distance import cdist

weather_df['day_from_jan_first'] = (weather_df['da'].apply(int)
                                   + 31*(weather_df['mo']=='02') 
                                   + 60*(weather_df['mo']=='03')
                                   + 91*(weather_df['mo']=='04')  
                                   )

mo = train['Date'].apply(lambda x: x[5:7])
da = train['Date'].apply(lambda x: x[8:10])
train['day_from_jan_first'] = (da.apply(int)
                               + 31*(mo=='02') 
                               + 60*(mo=='03')
                               + 91*(mo=='04')  
                              )

C = []
for j in train.index:
    df = train.iloc[j:(j+1)]
    mat = cdist(df[['Lat','Long', 'day_from_jan_first']],
                weather_df[['lat','lon', 'day_from_jan_first']], 
                metric='euclidean')
    new_df = pd.DataFrame(mat, index=df.Id, columns=weather_df.index)
    arr = new_df.values
    new_close = np.where(arr == np.nanmin(arr, axis=1)[:,None],new_df.columns,False)
    L = [i[i.astype(bool)].tolist()[0] for i in new_close]
    C.append(L[0])
    
train['closest_station'] = C

train = train.set_index('closest_station').join(weather_df[['temp', 'min', 'max', 'stp', 'wdsp', 'prcp', 'fog']], ).reset_index().drop(['index'], axis=1)
train.sort_values(by=['Id'], inplace=True)
train.head()

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities,day_from_jan_first,temp,min,max,stp,wdsp,prcp,fog
7895,1,,Afghanistan,33.0,65.0,2020-01-22,0.0,0.0,22,42.6,33.6,54.9,999.9,9.4,0.0,0
15727,2,,Afghanistan,33.0,65.0,2020-01-23,0.0,0.0,23,42.0,32.7,55.9,999.9,14.9,99.99,1
17706,3,,Afghanistan,33.0,65.0,2020-01-24,0.0,0.0,24,40.1,36.9,43.2,999.9,10.4,0.17,1
15728,4,,Afghanistan,33.0,65.0,2020-01-25,0.0,0.0,25,46.0,37.9,56.3,999.9,6.1,0.57,1
15729,5,,Afghanistan,33.0,65.0,2020-01-26,0.0,0.0,26,42.8,36.1,53.1,999.9,10.8,0.0,1


In [5]:
weather_df['day_from_jan_first'] = (weather_df['da'].apply(int)
                                   + 31*(weather_df['mo']=='02') 
                                   + 60*(weather_df['mo']=='03')
                                   + 91*(weather_df['mo']=='04')  
                                   )

mo = test['Date'].apply(lambda x: x[5:7])
da = test['Date'].apply(lambda x: x[8:10])
test['day_from_jan_first'] = (da.apply(int)
                               + 31*(mo=='02') 
                               + 60*(mo=='03')
                               + 91*(mo=='04')  
                              )

C = []
for j in test.index:
    df = test.iloc[j:(j+1)]
    mat = cdist(df[['Lat','Long', 'day_from_jan_first']],
                weather_df[['lat','lon', 'day_from_jan_first']], 
                metric='euclidean')
    new_df = pd.DataFrame(mat, index=df.ForecastId, columns=weather_df.index)
    arr = new_df.values
    new_close = np.where(arr == np.nanmin(arr, axis=1)[:,None],new_df.columns,False)
    L = [i[i.astype(bool)].tolist()[0] for i in new_close]
    C.append(L[0])
    
test['closest_station'] = C

test = test.set_index('closest_station').join(weather_df[['temp', 'min', 'max', 'stp', 'wdsp', 'prcp', 'fog']], ).reset_index().drop(['index'], axis=1)
test.sort_values(by=['ForecastId'], inplace=True)
test.head()

Unnamed: 0,ForecastId,Province/State,Country/Region,Lat,Long,Date,day_from_jan_first,temp,min,max,stp,wdsp,prcp,fog
12174,1,,Afghanistan,33.0,65.0,2020-03-12,72,30.7,16.9,47.7,775.4,2.9,0.0,1
10462,2,,Afghanistan,33.0,65.0,2020-03-13,73,35.7,24.3,53.1,778.8,3.0,0.0,1
12008,3,,Afghanistan,33.0,65.0,2020-03-14,74,36.4,26.8,53.4,778.2,5.3,0.0,1
2781,4,,Afghanistan,33.0,65.0,2020-03-15,75,40.1,29.3,52.7,777.7,9.0,0.0,0
2782,5,,Afghanistan,33.0,65.0,2020-03-16,76,40.1,29.3,52.7,777.7,9.0,0.0,0


In [6]:
train["wdsp"] = pd.to_numeric(train["wdsp"])
test["wdsp"] = pd.to_numeric(test["wdsp"])
train["fog"] = pd.to_numeric(train["fog"])
test["fog"] = pd.to_numeric(test["fog"])

In [7]:
#calculate lags & trends
def calculate_trend(df, lag_list, column):
    for lag in lag_list:
        trend_column_lag = "Trend_" + column + "_" + str(lag)
        df[trend_column_lag] = (df[column]-df[column].shift(lag, fill_value=-999))/df[column].shift(lag, fill_value=0)
    return df

def calculate_lag(df, lag_list, column):
    for lag in lag_list:
        column_lag = column + "_" + str(lag)
        df[column_lag] = df[column].shift(lag, fill_value=0)
    return df

ts = time.time()
train = calculate_lag(train, range(1,7), 'ConfirmedCases')
test = calculate_lag(test, range(1,7), 'ConfirmedCases')
train = calculate_lag(train, range(1,7), 'Fatalities')
test = calculate_lag(test, range(1,7), 'Fatalities')
train = calculate_trend(train, range(1,7), 'ConfirmedCases')
test = calculate_trend(test, range(1,7), 'ConfirmedCases')
train = calculate_trend(train, range(1,7), 'Fatalities')
test = calculate_trend(test, range(1,7), 'Fatalities')
train.replace([np.inf, -np.inf], 0, inplace=True)
test.replace([np.inf, -np.inf], 0, inplace=True)
print("Time spent: ", time.time()-ts)

NameError: name 'time' is not defined

In [8]:
#drop cols
X_train = train.drop(['ConfirmedCases','Fatalities','Id'],1)

In [9]:
# make date a datetime object
X_train['Date'] = pd.to_datetime(X_train['Date'])
test['Date'] = pd.to_datetime(test['Date'])

In [10]:
def create_time_features(df):
    """
    Creates time series features from datetime index
    """
    df['hour'] = df['Date'].dt.hour
    df['dayofweek'] = df['Date'].dt.dayofweek
    df['quarter'] = df['Date'].dt.quarter
    df['month'] = df['Date'].dt.month
    df['year'] = df['Date'].dt.year
    df['dayofyear'] = df['Date'].dt.dayofyear
    df['dayofmonth'] = df['Date'].dt.day
    df['weekofyear'] = df['Date'].dt.weekofyear
    
    X = df[['hour','dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear']]
    return X

In [11]:
create_time_features(X_train)
create_time_features(test)

Unnamed: 0,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
12174,0,3,1,3,2020,72,12,11
10462,0,4,1,3,2020,73,13,11
12008,0,5,1,3,2020,74,14,11
2781,0,6,1,3,2020,75,15,11
2782,0,0,1,3,2020,76,16,12
...,...,...,...,...,...,...,...,...
10440,0,6,2,4,2020,110,19,16
10441,0,0,2,4,2020,111,20,17
10442,0,1,2,4,2020,112,21,17
10443,0,2,2,4,2020,113,22,17


In [12]:
X_train.drop("Date", axis=1, inplace=True)
test.drop("Date", axis=1, inplace=True)

In [13]:
X_train = pd.concat([X_train,pd.get_dummies(X_train['Province/State'], prefix='ps')],axis=1)
X_train.drop(['Province/State'],axis=1, inplace=True)
test = pd.concat([test,pd.get_dummies(test['Province/State'], prefix='ps')],axis=1)
test.drop(['Province/State'],axis=1, inplace=True)

In [14]:
X_train = pd.concat([X_train,pd.get_dummies(X_train['Country/Region'], prefix='cr')],axis=1)
X_train.drop(['Country/Region'],axis=1, inplace=True)
test = pd.concat([test,pd.get_dummies(test['Country/Region'], prefix='cr')],axis=1)
test.drop(['Country/Region'],axis=1, inplace=True)

In [15]:
y_train_fat = train['Fatalities']
y_train_cc = train['ConfirmedCases']

In [16]:
xgb_fat = xgb.XGBRegressor(n_estimators=1000)

In [17]:
xgb_fat.fit(X_train,y_train_fat,verbose=True)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)

In [18]:
xgb_cc = xgb.XGBRegressor(n_estimators=1000)

In [19]:
xgb_cc.fit(X_train,y_train_cc,verbose=True)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)

In [20]:
fat_pred = xgb_fat.predict(test.drop('ForecastId',1))
cc_pred = xgb_cc.predict(test.drop('ForecastId',1))

In [21]:
sub['ConfirmedCases'] = cc_pred
sub['Fatalities'] = fat_pred
sub.head()

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,6.776459,0.045079
1,2,7.166162,0.004301
2,3,9.802483,0.001214
3,4,19.057188,-0.017568
4,5,21.083597,-0.010502


In [22]:
sub.loc[sub['Fatalities'] < 0, 'Fatalities'] = 0
sub['Fatalities'] = np.round(sub['Fatalities'],0)
sub.head()

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,6.776459,0.0
1,2,7.166162,0.0
2,3,9.802483,0.0
3,4,19.057188,0.0
4,5,21.083597,0.0


In [23]:
sub.to_csv('submission.csv',index=False)

In [24]:
test_sub = pd.read_csv('submission.csv')
test_sub.head()

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,6.776459,0.0
1,2,7.166163,0.0
2,3,9.802483,0.0
3,4,19.057188,0.0
4,5,21.083597,0.0
