## Linear regression analysis for the COVID-19 Global Forecasting Challenge Week 5

This is the Week 4 model, updated for the revised training data and evaluation metric. There was not enough time to make further changes to obtain a better fit to the training data.

### Data import

The external data for the submission has been derived from some of the World Development Indicators from the World Bank Open Data (Population, GDP and health spending) - with some adjustments and estimates for missing data items.

You can find the full dataset and licence here: https://www.kaggle.com/theworldbank/world-development-indicators

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from datetime import datetime

# Show every row when a DataFrame is displayed.
pd.set_option('display.max_rows', None)

# Load the competition train/test files and replace their headers with short names.
# The rename is positional — assumes the CSV column order matches these lists; TODO confirm.
train = pd.read_csv("/kaggle/input/covid19-global-forecasting-week-5/train.csv")
train.columns = ["Id","Cty","Prov","Ctry","Pop","Weight","Date","Type","Value"]
test = pd.read_csv("../input/covid19-global-forecasting-week-5/test.csv")
test.columns = ["Id","Cty","Prov","Ctry","Pop","Weight","Date","Type"]
# The test file has no target column; add a zero placeholder so both frames share a schema.
test["Value"]=0
train["Date"]= pd.to_datetime(train.Date,infer_datetime_format=True)
test["Date"]= pd.to_datetime(test.Date,infer_datetime_format=True)
# External World Bank indicators; joined to the main frame on "Ctry" further down.
world = pd.read_csv("/kaggle/input/wb0904/WorldBankData.csv")
# For every column except the first, overwrite zero entries with the column mean.
# NOTE(review): the mean is computed before the replacement, so the zeros are
# included in it — confirm this is the intended imputation.
for col in world.columns[1:]:
    avgcol= world[col].mean()
    world.loc[world[col]==0,col]=avgcol

# Sample submission, used later as the template for the output file.
sample_sub = pd.read_csv("../input/covid19-global-forecasting-week-5/submission.csv")
# Original header names are kept here; not referenced again in the visible code.
old_cols = sample_sub.columns
sample_sub.columns=["Id","Value"]

mysub=sample_sub.set_index('Id')

# Flag the origin of each row: 0 = train, 1 = test.
train["Test"]=0
test["Test"]=1

# Combined frame: train rows before 2020-04-27 followed by test rows from 2020-04-27 onward.
X_full = pd.concat((train[train.Date < "2020-04-27"], test[test.Date >= "2020-04-27"]),sort=True).reset_index(drop=True)

# Region key built by concatenating the string forms of country, province and county
# (presumably missing values become the text "nan" via astype(str) — verify).
X_full["Reg"]=X_full["Ctry"].astype(str)+X_full["Prov"].astype(str)+X_full["Cty"].astype(str)
#pop["Reg"]=pop["Ctry"]+pop["Prov"].fillna("None")

# Left join keeps every row of X_full even when the country has no World Bank record.
X_full= X_full.merge(world, on=["Ctry"],how="left")
#X_full= X_full.merge(pop[["Pop","Reg"]], on=["Reg"],how="left")

X_full.loc[:,"GDPPerc"]= X_full.GDPPerc.astype("float")
X_full.loc[:,"GDPperCapita"]= X_full.GDPperCapita.astype("float")
# A GDP per capita of exactly zero is replaced by a fixed value of 10000.
X_full.loc[X_full.GDPperCapita == 0,"GDPperCapita"]=10000
    
# Any remaining missing values (including unmatched merge rows) become 0.
X_full.fillna(0,inplace=True)

In [None]:
#X_full.drop(columns=["DollarPPP","Physicians","Nurses and midwives","Specialists"],inplace=True)
# Downcast the flag, population and target columns to the smallest integer
# dtype that can hold them, then print a summary of the frame.
for name in ("Test", "Pop", "Value"):
    X_full[name] = pd.to_numeric(X_full[name], downcast="integer")

X_full.info()


In [None]:
# Preview the first rows of the sample submission (columns renamed to "Id" and "Value").
sample_sub.head()

In [None]:
# Show every row when a DataFrame is displayed (same option as set in the data-import cell).
pd.set_option('display.max_rows', None)
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor


def model_fit(model, X, target_col, folds=3, ref_day=107, case_threshold=25000):
    """Fit ``model`` on the training rows of ``X`` and predict every row.

    The design matrix consists of one-hot indicators for each region/type
    combination and each weekday, degree-2 polynomial terms of the
    health/time features, and region-specific interactions with
    ``Week1``..``Week3`` for regions with a large case count.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        X: DataFrame with at least ``Type``, ``Reg``, ``DayYear``, ``Test``,
            ``TotalCases``, ``Week1``..``Week3`` and ``target_col``.  It is
            modified in place: ``RegType``, ``WDay``, ``Pred`` and ``Res``
            columns are added.
        target_col: Name of the column to regress on.
        folds: Number of shuffled K-fold splits used for the reported score.
        ref_day: Day of year on which ``TotalCases`` is compared with
            ``case_threshold``.
        case_threshold: Mean ``TotalCases`` on ``ref_day`` above which a
            region receives its own interactions with the week features.

    Returns:
        Tuple ``(X, score)``: ``X`` is the input frame with the added columns
        and ``score`` is the array of per-fold cross-validated RMSE values on
        the training rows.
    """
    kf = KFold(folds, shuffle=True, random_state=4991)

    # Identifier, bookkeeping and target columns that must not be predictors.
    drop_cols = list(set(X.columns).intersection({
        "Test", "Id", "Value", "LogD", "Week", "Day", "Date",
        "Prov", "FirstDate", "DayYear", "Sub", "Ctry", "Cty", "Reg", "Type",
        "Pred", "Res", "Cluster", "Cluster1", "Pop", "TotalCases", "index",
        "Weight", target_col,
    }))

    # One indicator column per region/type combination and per weekday.
    X["RegType"] = X.Type + X.Reg
    X["WDay"] = (X.DayYear % 7).astype(str)
    X_r = pd.get_dummies(X.copy(), columns=["RegType", "WDay"])

    # Regions with a significant number of cases on the reference day.
    ref_totals = X.loc[X.DayYear == ref_day].groupby("RegType")["TotalCases"].mean()
    large_regions = set(ref_totals[ref_totals > case_threshold].index)

    # Give each large region its own interaction with the week features.
    # The region value is the dummy column name with its prefix removed; the
    # previous code kept the prefix itself, so no region ever matched and
    # these features were never created.
    prefix = "RegType_"
    for col in list(X_r.columns):
        if col.startswith(prefix) and col[len(prefix):] in large_regions:
            for suffix, week_col in (("1", "Week1"), ("2", "Week2"), ("3", "Week3")):
                X_r[col + suffix] = X_r[week_col] * X_r[col]

    # Add interactions with health spending indicators.
    # The relationship is very weak but these features are kept in the model for now.
    # A sorted list gives a valid DataFrame indexer and a stable column order.
    inter_features = sorted(
        {"GDPPerc", "Week1", "Week2", "Week3", "Age65Perc", "GDPperCapita", "LogPop"}
        .difference(drop_cols)
        .intersection(X.columns)
    )
    poly = PolynomialFeatures(degree=2, include_bias=False)
    inter_cols = poly.fit_transform(X[inter_features])
    # Use the current scikit-learn accessor when it exists, else the legacy one.
    feature_names = getattr(poly, "get_feature_names_out", None) or poly.get_feature_names
    X1 = pd.DataFrame(inter_cols, columns=feature_names(inter_features), index=X.index)
    X_r = pd.concat([X1, X_r.drop(columns=inter_features)], axis=1)

    is_train = X_r.Test == 0
    X_train = X_r[is_train].drop(columns=drop_cols).copy()
    y_train = X.loc[is_train, target_col]

    model.fit(X_train, y_train)
    # Predictions are floored at zero.
    X["Pred"] = np.maximum(model.predict(X_r.drop(columns=drop_cols)), 0)
    X["Res"] = X.Pred - X[target_col]
    # Per-fold RMSE: cross_val_score returns negated MSE values.
    score = (-cross_val_score(model, X_train, y_train,
                              scoring="neg_mean_squared_error", cv=kf)) ** 0.5

    return X, score


# Ridge regression shared by the confirmed-case and fatality fits
# (alpha=0.01 was the alternative setting previously kept here as a comment).
model = Ridge(
    alpha=0.0065,
    random_state=35591,
    max_iter=10000,
    fit_intercept=True,
    normalize=True,
)

graph = []

# Copy the observed training values for the window after 2020-04-26 up to and
# including last_train into X_full, so the private pass can train on them.
last_train = "2020-05-10"
full_window = (X_full.Date > "2020-04-26") & (X_full.Date <= last_train)
train_window = (train.Date > "2020-04-26") & (train.Date <= last_train)
X_full.loc[full_window, "Value"] = train.loc[train_window, "Value"].values

X_full["DayYear"] = X_full.Date.dt.dayofyear

# Data adjustments for Ecuador: overwrite days 115-117 with fixed values.
ecuador_days = X_full.DayYear.isin([115, 116, 117])
for case_type, override in (("ConfirmedCases", 2000), ("Fatalities", 100)):
    rows = (X_full.Type == case_type) & (X_full.Ctry == "Ecuador") & ecuador_days
    X_full.loc[rows, "Value"] = [override] * 3

# Reset the submission template and the accumulator for forecast rows.
mysub = sample_sub.set_index('Id')
addsub = None

# Produce forecasts in two passes. The "Public" pass marks dates after 2020-04-26
# as test rows; the "Private" pass marks dates after 2020-05-10 as test rows.
# Each pass appends three sets of rows (quantiles 0.5, 0.05 and 0.95) to addsub.
for loop in ["Public","Private"]:
    
    # All of start / val_end / sub_start / sub_end / startint are day-of-year values.
    # sub_end is assigned in both branches but not referenced again in the visible code.
    if loop == "Public":
        start=116 # last day of train data 
        val_end=117+14 # end date for validation data (public submission only )
        sub_start=118
        sub_end=118+13
        startint= 107 # fit more recent data only - this is a key parameter
    
        X_full.loc[:,"Test"] = (X_full.Date > "2020-04-26") *1
        
    else:
        start=117+14
        sub_start=118+14
        sub_end=999
        X_full.loc[:,"Test"] = (X_full.Date > "2020-05-10") *1 
        val_end=117+14
        startint= 107 # fit more recent data only - this is a key parameter
    
    
    
    
    
    # Single-iteration loop: the set only contains "New". cl is used only in the
    # print guard and the score message below.
    for cl in set({"New"}): #set(X_full.Cluster):
        # Keep every row whose country is not "US1"; reset_index() adds an "index" column.
        X_all= X_full[X_full.Ctry!="US1"].reset_index()
        # Running total of Value, keyed by (Type, Reg, Date).
        # NOTE(review): groupby(level=1) groups on Reg only, so the running total
        # appears to continue across both Type values of a region — confirm
        # whether level=[0, 1] was intended.
        cum_cases_map= X_all[["Type","Value","Reg","Date"]].groupby(["Type","Reg","Date"]).sum().groupby(level=1).cumsum().to_dict()["Value"]
        X_all["TotalCases"]= X_all.apply(lambda x: cum_cases_map[x["Type"],x["Reg"],x["Date"]],axis=1)
        # Drop training rows (Test == 0) whose running total is still small:
        # below max(Pop * 0.00001, 3) for ConfirmedCases, below 10 for Fatalities.
        # Test rows are always kept.
        X_reg= X_all[~((X_all.Test ==0) & (((X_all.TotalCases < np.maximum(X_all.Pop*0.00001,3)) & (X_all.Type=="ConfirmedCases")) | 
                       ((X_all.TotalCases <10) & (X_all.Type=="Fatalities"))))].copy()

        X_reg["Date"]= pd.to_datetime(X_reg.Date,infer_datetime_format=True)
        # Earliest remaining ConfirmedCases date per region; used as day zero for both types.
        first_p_map= X_reg.loc[X_reg.Type=="ConfirmedCases",["Reg","Date"]].groupby("Reg").min().to_dict()["Date"]
        X_reg["FirstDate"]=X_reg["Reg"].map(first_p_map)

        X_reg["Day"]=(X_reg.Date-X_reg.FirstDate).dt.days
        X_reg["Week"]=X_reg.Day/7
        
        # Negative daily values are clipped to zero before taking logs.
        X_reg.Value= np.maximum(X_reg.Value,0)

        X_reg["LogVal"]= np.log(X_reg.Value+1)
        X_reg["LogPop"]= np.log(X_reg.Pop+1)

        # tanh transforms of elapsed weeks at different scales (Week1-3) and
        # offsets (Week4-9). No "Week8" column is created.
        X_reg["Week1"]=np.tanh((X_reg.Week)/10)
        X_reg["Week2"]=np.tanh((X_reg.Week)/10*3)
        X_reg["Week3"]=np.tanh((X_reg.Week)/10*5)
        X_reg["Week4"]=np.tanh((X_reg.Week-1)/10)
        X_reg["Week5"]=np.tanh((X_reg.Week-3)/10)
        X_reg["Week6"]=np.tanh((X_reg.Week-5)/10)
        X_reg["Week7"]=np.tanh((X_reg.Week-7)/10)
        X_reg["Week9"]=np.tanh((X_reg.Week-9)/10)
        X_reg["Sub"]=loop

        # Only rows on or after day startint are used for fitting and prediction.
        X= X_reg[X_reg.DayYear >= startint].copy()
        if cl[:2] != "US":
            print("\nLast day of year for train",start,": ",X.loc[X.DayYear== start,"Date"].min())
            print("First day of year for test",start+1,": ",X.loc[X.Test == 1,"Date"].min())
        
        # fit regressmodel to log of cases to align with evaluation metric
        # Both fits run only when ConfirmedCases training rows exist; predictions are
        # mapped back with exp(.) - 1, the inverse of LogVal = log(Value + 1).
        # The returned score is not used afterwards (the print below is commented out).
        if len(X[(X.Test==0) & (X.Type == "ConfirmedCases")]) > 0:
            X_res, score = model_fit(model,X.loc[(X.Type == "ConfirmedCases")].copy(),"LogVal")
            X.loc[(X.Type == "ConfirmedCases") ,"Pred"]=np.exp(X_res.Pred)-1
            print("ConfirmedCases complete")
            X_res, score = model_fit(model,X.loc[(X.Type == "Fatalities")].copy(),"LogVal")
            X.loc[(X.Type == "Fatalities") ,"Pred"]=np.exp(X_res.Pred)-1
        else:
            X["Pred"]=0

        # average last two observation to scale data
        # NOTE(review): the filter is "training rows with DayYear >= start-2", which
        # may cover more than two days depending on where the training data ends — confirm.
        scalemap =X.loc[(X.Test==0) & (X.DayYear >= start-2),["Type","Value","Reg"]].groupby(["Type","Reg"]).mean().to_dict()["Value"]
        scalepmap =X.loc[(X.Test==0) & (X.DayYear >= start-2),["Type","Pred","Reg"]].groupby(["Type","Reg"]).mean().to_dict()["Pred"]
        # NOTE(review): this inserts the key "Reg" with value None; the lookups below
        # use (Type, Reg) tuples, so it appears to have no effect — confirm and remove.
        scalemap.setdefault("Reg",)
        # For rows whose country is not "US": rescale Pred by
        # max(recent mean Value, 1) / max(recent mean Pred, 1) of the same (Type, Reg).
        X.loc[X.Ctry!="US","Pred"]= X.loc[X.Ctry!="US","Pred"]* np.maximum(X.apply(lambda x: scalemap[x["Type"],x["Reg"]]
                    if (x["Type"],x["Reg"]) in scalemap.keys() else 0,axis=1).fillna(0),1)/np.maximum(X.apply
                    (lambda x: scalepmap[x["Type"],x["Reg"]] if (x["Type"],x["Reg"]) in scalepmap.keys() else 0,axis=1).fillna(0),1)
        
        print("Model complete")
        # Residual on the original scale; set to zero for test rows.
        X["Res"]=X.Value-X.Pred
        X.loc[X.Test==1,"Res"]=0
        # include first test day to have full data
        # Per (Type, Reg) statistics over rows up to sub_start: std of residuals,
        # mean of Value and std of Pred, each floored at 1 when attached to X.
        stdresmap =X.loc[X.DayYear <=sub_start,["Type","Res","Reg"]].groupby(["Type","Reg"]).std().to_dict()["Res"]
        meanmap =X.loc[X.DayYear <=sub_start,["Type","Value","Reg"]].groupby(["Type","Reg"]).mean().to_dict()["Value"]
        X["Mean"]= np.maximum(X.apply(lambda x: meanmap[x["Type"],x["Reg"]],axis=1),1)
        stdmap =X.loc[X.DayYear <=sub_start,["Type","Pred","Reg"]].groupby(["Type","Reg"]).std().to_dict()["Pred"]
        X["Std"]= np.maximum(X.apply(lambda x: stdmap[x["Type"],x["Reg"]],axis=1),1) 
        X["StdRes"]= np.maximum(X.apply(lambda x: stdresmap[x["Type"],x["Reg"]],axis=1),1)
          
        # Lower/upper bounds: Pred -/+ 2 * (Std + StdRes) * sqrt(Pred) / sqrt(Mean), floored at 0.
        X["Pred05"]=np.maximum(X.Pred - (X.Std+X.StdRes)*2 /np.sqrt(X.Mean)*np.sqrt(X.Pred),0)
        X["Pred95"]=np.maximum(X.Pred + (X.Std+X.StdRes)*2 /np.sqrt(X.Mean)*np.sqrt(X.Pred),0)
        
        #print("\nCV score: ",score,"\nMean: {:.4f} Std: {:.4f}\n".format(score.mean(), score.std()))

        X.fillna(0,inplace=True)
        
        # Weighted residual of the point forecast; zeroed for days after val_end.
        X["ResLV"] = (X.Pred-X.Value)*X.Weight
        X.loc[X.DayYear > val_end,"ResLV"]=0
        
        if loop == "Public": 
            #print(X.loc[(X.DayYear > start) & (X.DayYear <=val_end),["ResLV","Pred","Value"]].describe())
            # Square root of the mean absolute weighted residual over sub_start..val_end.
            print(cl,": ",loop," submission - Score: ",np.sqrt(np.abs(X.loc[(X.DayYear >= sub_start) & (X.DayYear <=val_end),["ResLV"]]).mean()))
        # Append the test rows three times, once per quantile. IdSub is "<Id>_<quantile>".
        # Quantile 0.5 uses Pred.
        X["Target"]=X.Pred
        X["Quantile"]=0.5
        X["IdSub"]=X.Id.astype(str)+"_0.5"
        addsub = pd.concat([addsub,X.loc[X.Test==1,["IdSub","Target","ResLV","Quantile","Type","DayYear","Ctry","Prov","Cty","Sub"]]])
        # Quantile 0.05 uses Pred05; ResLV is recomputed against that bound.
        X["IdSub"]=X.Id.astype(str)+"_0.05"
        X.Quantile=0.05
        X.Target=X.Pred05
        X.ResLV=(X.Pred05-X.Value)*X.Weight
        addsub = pd.concat([addsub,X.loc[X.Test==1,["IdSub","Target","ResLV","Quantile","Type","DayYear","Ctry","Prov","Cty","Sub"]]])
        # Quantile 0.95 uses Pred95; ResLV is recomputed against that bound.
        X["IdSub"]=X.Id.astype(str)+"_0.95"
        X.Quantile=0.95
        X.Target=X.Pred95
        X.ResLV=(X.Pred95-X.Value)*X.Weight
        addsub = pd.concat([addsub,X.loc[X.Test==1,["IdSub","Target","ResLV","Quantile","Type","DayYear","Ctry","Prov","Cty","Sub"]]])
        addsub.fillna(0,inplace=True)
                      
    # Weighted quantile (pinball) loss per row, with ResLV = (prediction - Value) * Weight:
    # over-predictions cost (1 - Quantile) * ResLV, under-predictions cost Quantile * (-ResLV).
    # Computed on all rows accumulated in addsub so far, then averaged up to val_end.
    addsub["Err"]= (addsub.ResLV > 0)*(1-addsub.Quantile)*(addsub.ResLV)+(addsub.ResLV <= 0)*(addsub.Quantile)*(-addsub.ResLV) 
    print ("Final Score",addsub.loc[addsub.DayYear <= val_end,"Err"].mean())
    

In [None]:
# Scatter plots of the forecast target by day of year, coloured by pass
# (Public/Private), for three countries and both case types.
for country in ("Austria", "United Kingdom", "Russia"):
    for case_type in ("ConfirmedCases", "Fatalities"):
        subset = addsub[(addsub.Ctry == country) & (addsub.Type == case_type)]
        fig = px.scatter(subset, x='DayYear', y='Target', color="Sub")
        fig.show()


In [None]:
# Sum of forecast targets for US rows after day 131, broken down by
# province, case type, quantile and pass.
us_late = (addsub.DayYear > 117+14) & (addsub.Ctry == "US")
us_targets = addsub.loc[us_late, ["Target","Type","Sub","Quantile","Prov"]]
us_targets.groupby(["Prov","Type","Quantile","Sub"]).sum()

In [None]:
# Scatter plots of the forecast target by day of year for rows whose county
# is "New York", one plot per case type.
for case_type in ("ConfirmedCases", "Fatalities"):
    subset = addsub[(addsub.Cty == "New York") & (addsub.Type == case_type)]
    fig = px.scatter(subset, x='DayYear', y='Target', color="Sub")
    fig.show()

In [None]:
# Summary statistics of the forecast rows produced by the "Private" pass.
addsub[addsub.Sub=="Private"].describe()

In [None]:
# Summary statistics of the forecast rows produced by the "Public" pass.
addsub[addsub.Sub=="Public"].describe()

### Create submission file

In [None]:
# Combine the two passes: Public forecasts for days before 132, Private
# forecasts from day 132 onward.
use_public = (addsub.Sub == "Public") & (addsub.DayYear < 132)
use_private = (addsub.Sub == "Private") & (addsub.DayYear >= 132)
finalsub = addsub[use_public | use_private]

# Attach the forecasts to the sample-submission ids; the left join keeps every
# id from the template, and ids without a forecast end up with 0.
template = sample_sub.set_index('Id')
mysub = template.merge(
    finalsub[["IdSub", "Target"]],
    left_index=True,
    right_on="IdSub",
    how="left",
).drop(columns=["Value"])
mysub.columns = ["ForecastId_Quantile", "TargetValue"]
mysub = mysub.fillna(0)
mysub.to_csv('submission.csv', index=False)

In [None]:
# Summary statistics of the submission frame that was written to submission.csv.
mysub.describe()

In [None]:
# Preview the first 100 rows of the submission frame.
mysub.head(100)

In [None]:
# Preview the first 300 rows of the training data.
train.head(300)