In [None]:


# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)
        
        
from collections import OrderedDict
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

from xgboost import XGBRegressor

In [None]:
# Number of preceding rows (dates) whose cumulative confirmed-case and fatality
# counts are used as lag features (CC_1..CC_n and F_1..F_n) for each sample.
N_FEATURES = 5

In [None]:
# Country-level statistics, indexed by a normalised (stripped, lower-cased)
# country name. The file uses "," as the decimal separator.
# NOTE(review): the names in this CSV appear to carry trailing whitespace, which
# would make every lower-cased lookup miss; stripping is harmless if they do not.
countries = (
    pd.read_csv(
        '/kaggle/input/countries-of-the-world/countries of the world.csv', decimal=','
    )
    .assign(Country=lambda frame: frame.Country.str.strip().str.lower())
    .set_index('Country')
)



In [None]:
# Preview the first rows of the country statistics table.
countries.head()

In [None]:
# Load the competition training data and key each row by its ``Id`` column.
train_path = '/kaggle/input/covid19-global-forecasting-week-4/train.csv'
train_data = pd.read_csv(train_path).set_index('Id')


In [None]:
def transform_area(data):
    """Return a copy of ``data`` with an ``Area`` key replacing ``Province_State``.

    ``Area`` is the lower-cased ``Country_Region``; when ``Province_State`` is
    present it becomes ``"<country>/<province>"`` (both lower-cased).

    Args:
        data: DataFrame with ``Country_Region`` and ``Province_State`` columns.

    Returns:
        A new DataFrame without ``Province_State`` and with ``Area`` appended as
        the last column. The input frame is not modified.
    """
    country = data.Country_Region.str.lower()
    has_province = data.Province_State.notna()
    # Blank out missing provinces before lower-casing so that an all-NaN
    # (float) column does not break the ``.str`` accessor.
    province = (
        data.Province_State.astype(object)
        .where(has_province, "")
        .astype(str)
        .str.lower()
    )
    area = country.where(~has_province, country + "/" + province)
    return data.drop('Province_State', axis='columns').assign(Area=area)

def transform_countries(data, country_stats=None):
    """Return a copy of ``data`` enriched with per-country statistics.

    Adds two columns, ``Density`` and ``InfantMortality``, looked up by the
    lower-cased ``Country_Region``. Rows whose country is unknown, or whose
    statistic is missing, receive the mean of that statistic over all countries.

    Args:
        data: DataFrame with a ``Country_Region`` column.
        country_stats: Optional DataFrame indexed by lower-cased country name
            holding the ``'Pop. Density (per sq. mi.)'`` and
            ``'Infant mortality (per 1000 births)'`` columns. Defaults to the
            notebook-level ``countries`` table.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    if country_stats is None:
        country_stats = countries
    # ``Series.map`` needs unique lookup keys; keep the first row per country.
    unique_stats = country_stats[~country_stats.index.duplicated(keep='first')]
    keys = data["Country_Region"].str.lower()
    feature_sources = (
        ('Density', 'Pop. Density (per sq. mi.)'),
        ('InfantMortality', 'Infant mortality (per 1000 births)'),
    )
    enriched = data.copy()
    for target, source in feature_sources:
        fallback = country_stats[source].mean()
        # Both features are looked up per country (previously only Density was),
        # and a missing value falls back to the mean instead of becoming NaN.
        enriched[target] = keys.map(unique_stats[source]).fillna(fallback)
    return enriched
    

In [None]:
# Derive the Area key, then attach the country-level features, and preview the result.
train_data = train_data.pipe(transform_area).pipe(transform_countries)
train_data.head()


In [None]:
# Column dtypes and non-null counts of the transformed training frame.
train_data.info()


In [None]:


# Build one lag-feature table per area and split each one chronologically:
# the first 80% of its rows go to training, the remaining 20% to evaluation.
X_train = {}
X_test = {}
Y_train = {}
Y_test = {}
# Sorted iteration keeps the row order of the stacked matrices reproducible
# (a bare set of strings is ordered differently from run to run) and matches
# the ordering used by the forecasting loop. The per-area print was dropped
# because it only flooded the cell output.
for area in sorted(set(train_data.Area)):
    area_data = train_data.loc[train_data.Area == area].set_index('Date').sort_index()
    # Targets are the row-over-row increments of the cumulative counts.
    Y = area_data[['ConfirmedCases', 'Fatalities']].diff()
    dic = OrderedDict()
    # Column order (CC_1, F_1, ..., CC_n, F_n, Density, InfantMortality) must
    # match the feature vector assembled at prediction time.
    for i in range(1, 1 + N_FEATURES):
        lagged = area_data[['ConfirmedCases', 'Fatalities']].shift(i)
        dic['CC_{}'.format(i)] = lagged['ConfirmedCases']
        dic['F_{}'.format(i)] = lagged['Fatalities']
    dic['Density'] = area_data["Density"]
    dic['InfantMortality'] = area_data["InfantMortality"]
    # Rows without a complete lag history (or with missing features) are dropped.
    X = pd.DataFrame(dic, index=area_data.index).dropna(axis='index')
    Y = Y.loc[X.index]
    split = int(len(X) * 0.8)
    X_train[area], X_test[area] = X.iloc[:split], X.iloc[split:]
    Y_train[area], Y_test[area] = Y.iloc[:split], Y.iloc[split:]



In [None]:
# Collapse the per-area tables into plain 2-D arrays, one per split.
# The right-hand tuple is read before any of the names are rebound.
X_train, X_test, Y_train, Y_test = (
    np.vstack(list(per_area.values()))
    for per_area in (X_train, X_test, Y_train, Y_test)
)


In [None]:
# Dimensions of the stacked training targets (one column per target).
Y_train.shape

In [None]:
# Column 0 of the stacked targets holds the increments in confirmed cases.
y_train, y_test = Y_train[:, 0], Y_test[:, 0]

# Standardise the features, then fit a ridge regression with built-in CV.
# (An XGBRegressor(random_state=0, n_estimators=200) was tried as an alternative.)
cc_steps = [('ss', StandardScaler()), ('ridge', RidgeCV())]
ridge_cc = Pipeline(cc_steps)
ridge_cc.fit(X_train, y_train)

# R^2 on the training split first, then on the held-out split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(ridge_cc.score(features, target))



In [None]:
# Column 1 of the stacked targets holds the increments in fatalities.
y_train, y_test = Y_train[:, 1], Y_test[:, 1]

# Same model family as for confirmed cases: scaling followed by RidgeCV.
f_steps = [('ss', StandardScaler()), ('ridge', RidgeCV())]
ridge_f = Pipeline(f_steps)
ridge_f.fit(X_train, y_train)

# R^2 on the training split first, then on the held-out split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(ridge_f.score(features, target))



In [None]:
# Load the rows to forecast and key each one by its ``ForecastId`` column.
test_path = '/kaggle/input/covid19-global-forecasting-week-4/test.csv'
test_data = pd.read_csv(test_path).set_index('ForecastId')


In [None]:
# Apply the same Area / country-feature preparation as for the training data.
test_data = test_data.pipe(transform_area).pipe(transform_countries)
test_data.head()


In [None]:
# Placeholder target columns: NaN marks rows that still need a value.
# ``np.nan`` is used because the ``np.NaN`` alias was removed in NumPy 2.0.
test_data['ConfirmedCases'] = np.nan
test_data['Fatalities'] = np.nan


In [None]:
# Latest date covered by the training data; test rows up to and including it
# already have observed values.
last_date = train_data['Date'].max()
last_date


In [None]:
# Copy the already-observed counts from the training data into every test row
# whose date falls inside the training period, matching on (Area, Date).
# These two sets are kept as notebook-level names; this cell no longer needs them.
train_dates = set(train_data.Date)
test_dates = set(test_data.Date)

target_columns = ["ConfirmedCases", "Fatalities"]
# One row per (Area, Date); keeping the first mirrors the former ``.iloc[0]``.
known = (
    train_data[["Area", "Date"] + target_columns]
    .drop_duplicates(subset=["Area", "Date"], keep="first")
    .set_index(["Area", "Date"])
)
overlap = (test_data.Date <= last_date).to_numpy()
lookup = known.reindex(
    pd.MultiIndex.from_frame(test_data.loc[overlap, ["Area", "Date"]])
)
# Rows without a training match are left as they are (NaN), so they get
# forecast later instead of raising IndexError as the row-wise version did.
matched = lookup.notna().all(axis="columns").to_numpy()
if matched.any():
    matched_ids = test_data.index[overlap][matched]
    test_data.loc[matched_ids, target_columns] = lookup.to_numpy()[matched]
        



In [None]:
# Non-null counts show how many test rows were filled from the training data.
test_data.info()


In [None]:
# Roll the forecast forward one date at a time for every area. Each missing
# row is predicted from the N_FEATURES preceding rows, which may themselves be
# earlier predictions. The models predict increments, so the previous
# cumulative value is added back.
for area in sorted(set(test_data.Area)):
    # Keep the ForecastId index so the results can be written back in one go.
    area_data = test_data.loc[test_data.Area == area].sort_values('Date', kind='mergesort')
    cases = area_data["ConfirmedCases"].to_numpy(dtype=float, copy=True)
    deaths = area_data["Fatalities"].to_numpy(dtype=float, copy=True)
    density = area_data["Density"].to_numpy(dtype=float)
    infant_mortality = area_data["InfantMortality"].to_numpy(dtype=float)
    for pos in range(len(area_data)):
        if not np.isnan(cases[pos]) and not np.isnan(deaths[pos]):
            continue
        if pos < max(N_FEATURES, 1):
            # Previously this surfaced as an opaque NaN-input error from predict().
            raise ValueError(
                "area {!r} has fewer than {} known rows before its first "
                "missing value".format(area, N_FEATURES)
            )
        x = np.zeros(2 * N_FEATURES + 2)
        # Same layout as the training matrix:
        # CC_1, F_1, ..., CC_n, F_n, Density, InfantMortality (lag 1 = previous row).
        x[0:2 * N_FEATURES:2] = cases[pos - N_FEATURES:pos][::-1]
        x[1:2 * N_FEATURES:2] = deaths[pos - N_FEATURES:pos][::-1]
        x[2 * N_FEATURES] = density[pos]
        x[2 * N_FEATURES + 1] = infant_mortality[pos]
        x = x.reshape(1, -1)
        # Each model is evaluated once per row; the result feeds later rows.
        cases[pos] = ridge_cc.predict(x)[0] + cases[pos - 1]
        deaths[pos] = ridge_f.predict(x)[0] + deaths[pos - 1]
    test_data.loc[area_data.index, "ConfirmedCases"] = cases
    test_data.loc[area_data.index, "Fatalities"] = deaths



In [None]:
# Re-check non-null counts after the forecasting loop has filled the targets.
test_data.info()


In [None]:
# Preview the first rows of the completed test frame.
test_data.head()


In [None]:


# Load the sample submission to reuse its ForecastId index and column layout.
submission_path = "/kaggle/input/covid19-global-forecasting-week-4/submission.csv"
submission = pd.read_csv(submission_path).set_index('ForecastId')



In [None]:


# Preview the sample submission as loaded.
submission.head()



In [None]:


# Pull the confirmed-case values for exactly the ForecastIds in the submission.
submission['ConfirmedCases'] = test_data["ConfirmedCases"].loc[submission.index]



In [None]:


# Pull the fatality values for exactly the ForecastIds in the submission.
submission['Fatalities'] = test_data["Fatalities"].loc[submission.index]



In [None]:


# Negative values are replaced by zero in both target columns.
for column in ("ConfirmedCases", "Fatalities"):
    submission[column] = submission[column].mask(submission[column] < 0, 0)



In [None]:
# Cast both targets to integers (fractional parts are truncated).
submission = submission.astype({'ConfirmedCases': int, 'Fatalities': int})

In [None]:
# Inspect the first 50 rows of the final submission.
submission.head(50)

In [None]:


# Write the submission; the ForecastId index is written as the first column.
submission.to_csv('submission.csv', index=True)



In [None]:
# Spot-check the completed rows for a single area.
test_data[test_data["Area"] == 'russia']