# XGBoost Models

In [1]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Importing imputed data

USA_train_imputed = pd.read_csv("USA_train_imputed copy.csv")
USA_test_imputed = pd.read_csv("USA_test_imputed copy.csv")
USA_train_imputed.set_index('date', inplace=True)
USA_test_imputed.set_index('date', inplace=True)

Brazil_train_imputed = pd.read_csv("Brazil_train_imputed copy.csv")
Brazil_test_imputed = pd.read_csv("Brazil_test_imputed copy.csv")
Brazil_train_imputed.set_index('date', inplace=True)
Brazil_test_imputed.set_index('date', inplace=True)

Germany_train_imputed = pd.read_csv("Germany_train_imputed copy.csv")
Germany_test_imputed = pd.read_csv("Germany_test_imputed copy.csv")
Germany_train_imputed.set_index('date', inplace=True)
Germany_test_imputed.set_index('date', inplace=True)

Nigeria_train_imputed = pd.read_csv("Nigeria_train_imputed copy.csv")
Nigeria_test_imputed = pd.read_csv("Nigeria_test_imputed copy.csv")
Nigeria_train_imputed.set_index('date', inplace=True)
Nigeria_test_imputed.set_index('date', inplace=True)

China_train_imputed = pd.read_csv("China_train_imputed copy.csv")
China_test_imputed = pd.read_csv("China_test_imputed copy.csv")
China_train_imputed.set_index('date', inplace=True)
China_test_imputed.set_index('date', inplace=True)

Australia_train_imputed = pd.read_csv("Australia_train_imputed copy.csv")
Australia_test_imputed = pd.read_csv("Australia_test_imputed copy.csv")
Australia_train_imputed.set_index('date', inplace=True)
Australia_test_imputed.set_index('date', inplace=True)

## Dropping columns
After building the Prophet models, we determined that there were a number of variables in each country's data that had unique/constant values or were collinear with the target variable of 'new_cases'. We will drop these samme variables as well for developing the XGBoost model. You will see this later in the code, this is just an early note.

# USA model

In [3]:
# Adding time lags as features
lag_values = [1, 7, 14]

for lag in lag_values:
    # Create lag features for training set
    for column in USA_train_imputed.columns:
        if column != 'new_cases':  # Skip the target variable
            USA_train_imputed[f'{column}_lag_{lag}'] = USA_train_imputed[column].shift(lag)

    # Create lag features for test set
    for column in USA_test_imputed.columns:
        if column != 'new_cases':  # Skip the target variable
            USA_test_imputed[f'{column}_lag_{lag}'] = USA_test_imputed[column].shift(lag)

In [5]:
# Checking the dataset for unique values in each column
unique_counts = USA_train_imputed.nunique()

# Identifying columns with no unique values
columns_with_no_unique_values = unique_counts[unique_counts == 1].index

In [6]:
# Dropping columns based on work above
USA_train_imputed = USA_train_imputed.drop(columns = columns_with_no_unique_values)
USA_train_imputed = USA_train_imputed.drop(columns = ['total_cases_per_million', 'total_cases', 'new_cases_per_million',
                                            'new_cases_smoothed_per_million', 'new_cases_smoothed', 'Rolling_Mean_new_cases'])

# Fix Lag Columns so values at beginning are 0 instead of missing
USA_train_imputed['new_cases_Lag_7'] = USA_train_imputed['new_cases_Lag_7'].fillna(0)
USA_train_imputed['new_cases_Lag_14'] = USA_train_imputed['new_cases_Lag_14'].fillna(0)
USA_train_imputed["Weekend"] = USA_train_imputed["Weekend"].astype(int)

# Dropping the same columns for test dataset based on conclusions of train dataset
USA_test_imputed = USA_test_imputed.drop(columns = columns_with_no_unique_values)
USA_test_imputed = USA_test_imputed.drop(columns = ['total_cases_per_million', 'total_cases', 'new_cases_per_million',
                                            'new_cases_smoothed_per_million', 'new_cases_smoothed', 'Rolling_Mean_new_cases'])

# Ensuring that the lag column values in test dataset are those of last values in train
USA_test_imputed['new_cases_Lag_7'][0:7] = USA_test_imputed['new_cases'][-7:]
USA_test_imputed['new_cases_Lag_14'][0:14] = USA_test_imputed['new_cases'][-14:]
USA_test_imputed["Weekend"] = USA_test_imputed["Weekend"].astype(int)

In [7]:
# # Filling missing values of the lag features
# def create_lag_features(df, lag_values=[1]):
#     ret = df[['new_cases']]  # Assuming 'new_cases' is your target variable

#     for lag in lag_values:
#         for column in df.columns:
#             if column != 'new_cases':  # Skip the target variable
#                 lagdf = df[column].shift(lag)
#                 lagdf.columns = [f'{column}_lag_{lag}']
#                 ret = pd.concat([ret, lagdf], axis=1)
#     return ret.fillna(0)

# # Example for USA dataset with lag values 1, 7, 14, 30
# lag_values = [1, 7, 14, 30]
# USA_train_imputed_lagged = create_lag_features(USA_train_imputed, lag_values)
# USA_test_imputed_lagged = create_lag_features(USA_test_imputed, lag_values)

In [8]:
# Filling missing values of the lag features
def process(df, lag_values=[1]):
    ret = df[['new_cases']]
    for lag in lag_values:
        lagdf = df.shift(lag)
        lagdf.columns=[f'lag{lag}_' + str(col) for col in lagdf.columns]
        ret=pd.concat([ret, lagdf], axis=1)
    return ret.fillna(0) 
USA_train_imputed_lagged = process(USA_train_imputed, lag_values=[1,7,14,30])
USA_test_imputed_lagged = process(USA_test_imputed, lag_values=[1,7,14,30])

In [9]:
USA_train_imputed_lagged

Unnamed: 0_level_0,new_cases,lag1_stringency_index,lag1_new_cases,lag1_total_deaths_per_million,lag1_new_deaths,lag1_total_deaths,lag1_new_deaths_per_million,lag1_total_tests_per_thousand,lag1_new_tests,lag1_total_tests,...,lag30_total_vaccinations_lag_1_lag_7_lag_14,lag30_new_vaccinations_lag_1_lag_7_lag_14,lag30_new_vaccinations_smoothed_lag_1_lag_7_lag_14,lag30_new_people_vaccinated_smoothed_per_hundred_lag_1_lag_7_lag_14,lag30_new_people_vaccinated_smoothed_lag_1_lag_7_lag_14,lag30_new_vaccinations_smoothed_per_million_lag_1_lag_7_lag_14,lag30_new_cases_Lag_7_lag_1_lag_7_lag_14,lag30_new_cases_Lag_14_lag_1_lag_7_lag_14,lag30_Rolling_Mean_new_cases_lag_1_lag_7_lag_14,lag30_Weekend_lag_1_lag_7_lag_14
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-22,0.0,0.00,0.0,0.000,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.000000,0
2020-01-23,0.0,0.00,0.0,0.000,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.000000,0
2020-01-24,1.0,0.00,0.0,0.000,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.000000,0
2020-01-25,0.0,0.00,1.0,0.000,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.000000,0
2020-01-26,3.0,0.00,0.0,0.000,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-27,509081.0,47.69,176889.0,2460.174,197.0,819029.0,0.592,2144.998,961271.0,714102068.0,...,437241247.0,1893414.0,1319843.0,0.085,281129.0,3975.0,95118.0,96372.0,72568.642857,False
2021-12-28,356797.0,50.46,509081.0,2465.857,1892.0,820921.0,5.683,2150.086,1694071.0,715796139.0,...,438267689.0,1026442.0,1350210.0,0.094,312730.0,4067.0,33855.0,31825.0,72767.071429,True
2021-12-29,499452.0,50.46,356797.0,2473.078,2404.0,823325.0,7.221,2156.937,2280780.0,718076919.0,...,438818418.0,550729.0,1366855.0,0.100,330791.0,4117.0,32018.0,18560.0,74034.357143,True
2021-12-30,589431.0,50.46,499452.0,2480.041,2318.0,825643.0,6.963,2163.853,2302224.0,720379143.0,...,440266939.0,1448521.0,1367682.0,0.108,357315.0,4119.0,111463.0,111113.0,73885.357143,False


In [10]:
# For training data
X_train = USA_train_imputed_lagged.drop(columns=['new_cases'])
y_train = USA_train_imputed_lagged['new_cases']

# For testing data
X_test = USA_test_imputed_lagged.drop(columns=['new_cases'])
y_test = USA_test_imputed_lagged['new_cases']

In [11]:
!pip install --upgrade scikit-learn

Requirement already up-to-date: scikit-learn in c:\users\ryanc\anaconda3\lib\site-packages (1.3.2)


In [12]:
!pip install numpy==1.17.3



In [13]:
from sklearn.model_selection import RandomizedSearchCV

# Define the hyperparameter distributions
param_dist = {
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.15, 0.2],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'n_estimators': [50, 100, 200, 300, 500]
}

# Create XGBoost model
xgb_model = xgb.XGBRegressor()

# Instantiate RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, n_iter=10, scoring='neg_mean_squared_error', cv=5, random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_

# Train the final model with the best hyperparameters
final_model = XGBRegressor(**best_params)
final_model.fit(X_train, y_train)

ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ryanc\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\sklearn.py", line 761, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\sklearn.py", line 286, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\sklearn.py", line 775, in <lambda>
    create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\core.py", line 616, in __init__
    handle, feature_names, feature_types = dispatch_data_backend(
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\data.py", line 707, in dispatch_data_backend
    return _from_pandas_df(data, enable_categorical, missing, threads,
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\data.py", line 297, in _from_pandas_df
    data, feature_names, feature_types = _transform_pandas_df(
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\data.py", line 242, in _transform_pandas_df
    raise ValueError(msg + ', '.join(bad_fields))
ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
                categorical type is supplied, DMatrix parameter `enable_categorical` must
                be set to `True`.lag1_Weekend_lag_1, lag1_Weekend_lag_7, lag1_Weekend_lag_1_lag_7, lag1_Weekend_lag_14, lag1_Weekend_lag_1_lag_14, lag1_Weekend_lag_7_lag_14, lag1_Weekend_lag_1_lag_7_lag_14, lag7_Weekend_lag_1, lag7_Weekend_lag_7, lag7_Weekend_lag_1_lag_7, lag7_Weekend_lag_14, lag7_Weekend_lag_1_lag_14, lag7_Weekend_lag_7_lag_14, lag7_Weekend_lag_1_lag_7_lag_14, lag14_Weekend_lag_1, lag14_Weekend_lag_7, lag14_Weekend_lag_1_lag_7, lag14_Weekend_lag_14, lag14_Weekend_lag_1_lag_14, lag14_Weekend_lag_7_lag_14, lag14_Weekend_lag_1_lag_7_lag_14, lag30_Weekend_lag_1, lag30_Weekend_lag_7, lag30_Weekend_lag_1_lag_7, lag30_Weekend_lag_14, lag30_Weekend_lag_1_lag_14, lag30_Weekend_lag_7_lag_14, lag30_Weekend_lag_1_lag_7_lag_14


In [5]:
# Checking the dataset for unique values in each column
unique_counts = USA_train_imputed.nunique()

# Identifying columns with no unique values
columns_with_no_unique_values = unique_counts[unique_counts == 1].index
print(columns_with_no_unique_values)

Index(['iso_code', 'population_density', 'median_age', 'aged_65_older',
       'aged_70_older', 'gdp_per_capita', 'extreme_poverty',
       'cardiovasc_death_rate', 'hospital_beds_per_thousand',
       'life_expectancy',
       ...
       'aged_70_older_lag_1_lag_7_lag_14', 'gdp_per_capita_lag_1_lag_7_lag_14',
       'extreme_poverty_lag_1_lag_7_lag_14',
       'cardiovasc_death_rate_lag_1_lag_7_lag_14',
       'hospital_beds_per_thousand_lag_1_lag_7_lag_14',
       'life_expectancy_lag_1_lag_7_lag_14',
       'human_development_index_lag_1_lag_7_lag_14',
       'location_lag_1_lag_7_lag_14', 'continent_lag_1_lag_7_lag_14',
       'population_lag_1_lag_7_lag_14'],
      dtype='object', length=112)


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Define the hyperparameter distributions
param_dist = {
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.15, 0.2],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'n_estimators': [50, 100, 200, 300, 500]
}

# Create XGBoost model
xgb_model = xgb.XGBRegressor()

# Instantiate RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, n_iter=10, scoring='root_mean_squared_error', cv=5, random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_

# Train the final model with the best hyperparameters
final_model = XGBRegressor(**best_params)
final_model.fit(X_train, y_train)

In [None]:





USA_multivariate_train_imputed['new_cases_Lag_7'] = USA_multivariate_train_imputed ['new_cases_Lag_7'].fillna(0)
USA_multivariate_train_imputed['new_cases_Lag_14'] = USA_multivariate_train_imputed ['new_cases_Lag_14'].fillna(0)
USA_multivariate_train_imputed['new_cases_Lag_1'] = USA_multivariate_train_imputed ['new_cases_Lag_1'].fillna(0)
USA_multivariate_train_imputed['new_cases_Lag_2'] = USA_multivariate_train_imputed ['new_cases_Lag_2'].fillna(0)
USA_multivariate_train_imputed["Weekend"] = USA_multivariate_train_imputed["Weekend"].astype(int)


USA_multivariate_test_imputed = USA_test_imputed
USA_multivariate_test_imputed = USA_multivariate_test_imputed.drop(columns=['iso_code', 'population_density'
                                                                             ,'median_age', 'aged_65_older', 'aged_70_older',
                                                                             'gdp_per_capita','extreme_poverty',
                                                                              'cardiovasc_death_rate','hospital_beds_per_thousand',
                                                                             'life_expectancy','human_development_index','total_cases',
                                                                             'location', 'continent', 'population', 'reproduction_rate', 'new_cases_smoothed',
                                                                           'total_cases_per_million', 'new_cases_per_million', 'new_cases_smoothed_per_million'])
USA_multivariate_test_imputed['new_cases_Lag_7'][0:7] = USA_multivariate_train_imputed['new_cases'][-7:]
USA_multivariate_test_imputed['new_cases_Lag_14'][0:14] = USA_multivariate_train_imputed['new_cases'][-14:]
USA_multivariate_test_imputed['new_cases_Lag_1'][0:1] = USA_multivariate_train_imputed['new_cases'][-1:]
USA_multivariate_test_imputed['new_cases_Lag_2'][0:2] = USA_multivariate_train_imputed['new_cases'][-2:]
USA_multivariate_test_imputed["Weekend"] = USA_multivariate_test_imputed["Weekend"].astype(int)

In [None]:
# Example for one country (USA)
X_train = USA_train_imputed.drop('new_cases', axis=1)  # Features for training
y_train = USA_train_imputed['new_cases']  # Target variable for training

X_test = USA_test_imputed.drop('new_cases', axis=1)  # Features for testing
y_test = USA_test_imputed['new_cases']  # Target variable for testing

# Create and train the XGBoost model
model = XGBRegressor()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

In [None]:
# Dropping columns that have collinearity with 'new_cases'
Brazil_train_imputed = Brazil_train_imputed.drop(columns = ['total_cases_per_million', 'total_cases', 'new_cases_per_million',
                                                      'new_cases_smoothed_per_million', 'new_cases_smoothed', 'Rolling_Mean_new_cases'])

# Fix Lag Columns so values at beginning are 0 instead of missing
Brazil_train_imputed['new_cases_Lag_7'] = Brazil_train_imputed['new_cases_Lag_7'].fillna(0)
Brazil_train_imputed['new_cases_Lag_14'] = Brazil_train_imputed['new_cases_Lag_14'].fillna(0)
Brazil_train_imputed["Weekend"] = Brazil_train_imputed["Weekend"].astype(int)

# Dropping the same columns for test dataset based on conclusions of train dataset
Brazil_test_imputed = Brazil_test_imputed.drop(columns = ['total_cases_per_million', 'total_cases', 'new_cases_per_million',
                                            'new_cases_smoothed_per_million', 'new_cases_smoothed', 'Rolling_Mean_new_cases'])

# Ensuring that the lag column values in test dataset are those of last values in train
Brazil_test_imputed['new_cases_Lag_7'][0:7] = Brazil_test_imputed['new_cases'][-7:]
Brazil_test_imputed['new_cases_Lag_14'][0:14] = Brazil_test_imputed['new_cases'][-14:]
Brazil_test_imputed["Weekend"] = Brazil_test_imputed["Weekend"].astype(int)

In [None]:
# Dropping columns that have collinearity with 'new_cases'
Germany_train_imputed = Germany_train_imputed.drop(columns = ['total_cases_per_million', 'total_cases', 'new_cases_per_million',
                                                      'new_cases_smoothed_per_million', 'new_cases_smoothed', 'Rolling_Mean_new_cases'])

# Fix Lag Columns so values at beginning are 0 instead of missing
Germany_train_imputed['new_cases_Lag_7'] = Germany_train_imputed['new_cases_Lag_7'].fillna(0)
Germany_train_imputed['new_cases_Lag_14'] = Germany_train_imputed['new_cases_Lag_14'].fillna(0)
Germany_train_imputed["Weekend"] = Germany_train_imputed["Weekend"].astype(int)

# Dropping the same columns for test dataset based on conclusions of train dataset
Germany_test_imputed = Germany_test_imputed.drop(columns = ['total_cases_per_million', 'total_cases', 'new_cases_per_million',
                                            'new_cases_smoothed_per_million', 'new_cases_smoothed', 'Rolling_Mean_new_cases'])

# Ensuring that the lag column values in test dataset are those of last values in train
Germany_test_imputed['new_cases_Lag_7'][0:7] = Germany_test_imputed['new_cases'][-7:]
Germany_test_imputed['new_cases_Lag_14'][0:14] = Germany_test_imputed['new_cases'][-14:]
Germany_test_imputed["Weekend"] = Germany_test_imputed["Weekend"].astype(int)

In [None]:
# Dropping columns that have collinearity with 'new_cases'
China_train_imputed = China_train_imputed.drop(columns = ['total_cases_per_million', 'total_cases', 'new_cases_per_million',
                                                      'new_cases_smoothed_per_million', 'new_cases_smoothed', 'Rolling_Mean_new_cases'])

# Fix Lag Columns so values at beginning are 0 instead of missing
China_train_imputed['new_cases_Lag_7'] = China_train_imputed['new_cases_Lag_7'].fillna(0)
China_train_imputed['new_cases_Lag_14'] = China_train_imputed['new_cases_Lag_14'].fillna(0)
China_train_imputed["Weekend"] = China_train_imputed["Weekend"].astype(int)

# Dropping the same columns for test dataset based on conclusions of train dataset
China_test_imputed = China_test_imputed.drop(columns = ['total_cases_per_million', 'total_cases', 'new_cases_per_million',
                                            'new_cases_smoothed_per_million', 'new_cases_smoothed', 'Rolling_Mean_new_cases'])

# Ensuring that the lag column values in test dataset are those of last values in train
China_test_imputed['new_cases_Lag_7'][0:7] = China_test_imputed['new_cases'][-7:]
China_test_imputed['new_cases_Lag_14'][0:14] = China_test_imputed['new_cases'][-14:]
China_test_imputed["Weekend"] = China_test_imputed["Weekend"].astype(int)

In [None]:
# Dropping columns that have collinearity with 'new_cases'
Nigeria_train_imputed = Nigeria_train_imputed.drop(columns = ['total_cases_per_million', 'total_cases', 'new_cases_per_million',
                                                      'new_cases_smoothed_per_million', 'new_cases_smoothed', 'Rolling_Mean_new_cases'])

# Fix Lag Columns so values at beginning are 0 instead of missing
Nigeria_train_imputed['new_cases_Lag_7'] = Nigeria_train_imputed['new_cases_Lag_7'].fillna(0)
Nigeria_train_imputed['new_cases_Lag_14'] = Nigeria_train_imputed['new_cases_Lag_14'].fillna(0)
Nigeria_train_imputed["Weekend"] = Nigeria_train_imputed["Weekend"].astype(int)

# Dropping the same columns for test dataset based on conclusions of train dataset
Nigeria_test_imputed = Nigeria_test_imputed.drop(columns = ['total_cases_per_million', 'total_cases', 'new_cases_per_million',
                                            'new_cases_smoothed_per_million', 'new_cases_smoothed', 'Rolling_Mean_new_cases'])

# Ensuring that the lag column values in test dataset are those of last values in train
Nigeria_test_imputed['new_cases_Lag_7'][0:7] = Nigeria_test_imputed['new_cases'][-7:]
Nigeria_test_imputed['new_cases_Lag_14'][0:14] = Nigeria_test_imputed['new_cases'][-14:]
Nigeria_test_imputed["Weekend"] = Nigeria_test_imputed["Weekend"].astype(int)

In [None]:
# Dropping columns that have collinearity with 'new_cases'
Australia_train_imputed = Australia_train_imputed.drop(columns = ['total_cases_per_million', 'total_cases', 'new_cases_per_million',
                                                      'new_cases_smoothed_per_million', 'new_cases_smoothed', 'Rolling_Mean_new_cases'])

# Fix Lag Columns so values at beginning are 0 instead of missing
Australia_train_imputed['new_cases_Lag_7'] = Australia_train_imputed['new_cases_Lag_7'].fillna(0)
Australia_train_imputed['new_cases_Lag_14'] = Australia_train_imputed['new_cases_Lag_14'].fillna(0)
Australia_train_imputed["Weekend"] = Australia_train_imputed["Weekend"].astype(int)

# Dropping the same columns for test dataset based on conclusions of train dataset
Australia_test_imputed = Australia_test_imputed.drop(columns = ['total_cases_per_million', 'total_cases', 'new_cases_per_million',
                                            'new_cases_smoothed_per_million', 'new_cases_smoothed', 'Rolling_Mean_new_cases'])

# Ensuring that the lag column values in test dataset are those of last values in train
Australia_test_imputed['new_cases_Lag_7'][0:7] = Australia_test_imputed['new_cases'][-7:]
Australia_test_imputed['new_cases_Lag_14'][0:14] = Australia_test_imputed['new_cases'][-14:]
Australia_test_imputed["Weekend"] = Australia_test_imputed["Weekend"].astype(int)

In [None]:
# Combine train datasets
train_datasets = [USA_train_imputed, Brazil_train_imputed, Germany_train_imputed, Nigeria_train_imputed, China_train_imputed, Australia_train_imputed]
composite_train = pd.concat(train_datasets, axis=0)

# Resetting the index after concatenation
composite_train.reset_index(inplace=True)

# Combine test datasets
test_datasets = [USA_test_imputed, Brazil_test_imputed, Germany_test_imputed, Nigeria_test_imputed, China_test_imputed, Australia_test_imputed]
composite_test = pd.concat(test_datasets, axis=0)

# Resetting the index after concatenation
composite_test.reset_index(inplace=True)

# Setting 'date' as the index for both composite_train and composite_test
composite_train.set_index('date', inplace=True)
composite_test.set_index('date', inplace=True)

In [None]:
# Dropping more columns, this time the ones with constant values as determined in the Prophet model development process
composite_train = composite_train.drop(columns = ['population_density', 'population', 'location', 'continent',
                                                  'life_expectancy', 'human_development_index', 'median_age', 
                                                  'aged_65_older', 'aged_70_older', 'gdp_per_capita', 'cardiovasc_death_rate', 
                                                  'extreme_poverty', 'hospital_beds_per_thousand', 'handwashing_facilities'])


composite_test = composite_test.drop(columns = ['population_density', 'population', 'location', 'continent',
                                                  'life_expectancy', 'human_development_index', 'median_age', 
                                                  'aged_65_older', 'aged_70_older', 'gdp_per_capita', 'cardiovasc_death_rate', 
                                                  'extreme_poverty', 'hospital_beds_per_thousand', 'handwashing_facilities'])

In [None]:
# Transform the time series data into a supervised learning format, considering time lags as features.
def create_lag_features(df, columns, lag_values):
    ret = df[columns].copy()
    for column in columns:
        for lag in lag_values:
            lagged_column = df[column].shift(lag)
            lagged_column.name = f'{column}_lag_{lag}'
            ret = pd.concat([ret, lagged_column], axis=1)
    return ret.fillna(0)

# Specify the lag values explicitly. Using lags of 1,7, 14 & 30 lags for each feature.
lag_values = [1, 7, 14, 30]  # Adjust as needed

# List of dataframes for both train and test
train_datasets = [composite_train]
test_datasets = [composite_test]

# Apply lag features to each train dataset
for train_dataset in train_datasets:
    train_dataset = create_lag_features(train_dataset, ['new_cases'], lag_values)

# Apply lag features to each test dataset
for test_dataset in test_datasets:
    test_dataset = create_lag_features(test_dataset, ['new_cases'], lag_values)

In [None]:
# USA_train_imputed.dtypes

In [None]:
m1_USA = xgb.XGBRegressor(learning_rate=0.01, max_depth=4, n_estimators=500, n_jobs=-1, random_state=0)
m1_USA.fit(USA_train_imputed.drop(columns = ['new_cases', 'iso_code', 'location', 'continent'], axis=1), USA_train_imputed['new_cases'])

In [None]:
train_pred = pd.DataFrame(m1_USA.predict(USA_train_imputed.drop(columns = ['new_cases', 'iso_code', 'location', 'continent'], axis=1)), columns=['yhat'], index=USA_train_imputed.index)
train_pred.head()

In [None]:
pd.concat([USA_train_imputed['new_cases'],train_pred['yhat']], axis=1).plot(figsize=(15,5)) 
plt.legend(['USA_train_imputed', 'train_pred'])

In [None]:
### Above models appears to overfit for USA train imputed dataset... Will need to check in with Rakin and Sid over the weekend