# XGBoost Models

In [1]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Importing imputed data

USA_train_imputed = pd.read_csv("USA_train_imputed copy.csv")
USA_test_imputed = pd.read_csv("USA_test_imputed copy.csv")
USA_train_imputed.set_index('date', inplace=True)
USA_test_imputed.set_index('date', inplace=True)

Brazil_train_imputed = pd.read_csv("Brazil_train_imputed copy.csv")
Brazil_test_imputed = pd.read_csv("Brazil_test_imputed copy.csv")
Brazil_train_imputed.set_index('date', inplace=True)
Brazil_test_imputed.set_index('date', inplace=True)

Germany_train_imputed = pd.read_csv("Germany_train_imputed copy.csv")
Germany_test_imputed = pd.read_csv("Germany_test_imputed copy.csv")
Germany_train_imputed.set_index('date', inplace=True)
Germany_test_imputed.set_index('date', inplace=True)

Nigeria_train_imputed = pd.read_csv("Nigeria_train_imputed copy.csv")
Nigeria_test_imputed = pd.read_csv("Nigeria_test_imputed copy.csv")
Nigeria_train_imputed.set_index('date', inplace=True)
Nigeria_test_imputed.set_index('date', inplace=True)

China_train_imputed = pd.read_csv("China_train_imputed copy.csv")
China_test_imputed = pd.read_csv("China_test_imputed copy.csv")
China_train_imputed.set_index('date', inplace=True)
China_test_imputed.set_index('date', inplace=True)

Australia_train_imputed = pd.read_csv("Australia_train_imputed copy.csv")
Australia_test_imputed = pd.read_csv("Australia_test_imputed copy.csv")
Australia_train_imputed.set_index('date', inplace=True)
Australia_test_imputed.set_index('date', inplace=True)

## Dropping columns
After building the Prophet models, we determined that there were a number of variables in each country's data that had unique/constant values or were collinear with the target variable of 'new_cases'. We will drop these samme variables as well for developing the XGBoost model. You will see this later in the code, this is just an early note.

# USA model

In [3]:
# Adding time lags as features
lag_values = [1, 7, 14, 30]

for lag in lag_values:
    # Create lag features for training set
    for column in USA_train_imputed.columns:
        if column != 'new_cases':  # Skip the target variable
            USA_train_imputed[f'{column}_lag_{lag}'] = USA_train_imputed[column].shift(lag)

    # Create lag features for test set
    for column in USA_test_imputed.columns:
        if column != 'new_cases':  # Skip the target variable
            USA_test_imputed[f'{column}_lag_{lag}'] = USA_test_imputed[column].shift(lag)

In [4]:
# Drop columns based on work from developing Prophet Multivariate model. 
# Drop columns with constant values (low variance) and collinearity with 'new_cases'
USA_train_imputed = USA_train_imputed.drop(columns = ['iso_code', 'population_density', 'median_age', 'aged_65_older', 'aged_70_older', 
                                                      'gdp_per_capita', 'extreme_poverty', 'cardiovasc_death_rate', 'hospital_beds_per_thousand',
                                                      'life_expectancy', 'human_development_index', 'location', 'continent', 'population', 
                                                      'total_cases_per_million', 'total_cases', 'new_cases_per_million',
                                                      'new_cases_smoothed_per_million', 'new_cases_smoothed', 'Rolling_Mean_new_cases'])

# Fix Lag Columns so values at beginning are 0 instead of missing
USA_train_imputed['new_cases_Lag_7'] = USA_train_imputed['new_cases_Lag_7'].fillna(0)
USA_train_imputed['new_cases_Lag_14'] = USA_train_imputed['new_cases_Lag_14'].fillna(0)
USA_train_imputed["Weekend"] = USA_train_imputed["Weekend"].astype(int)

# Dropping the same columns for test dataset based on conclusions of train dataset
USA_test_imputed = USA_test_imputed.drop(columns = ['iso_code', 'population_density', 'median_age', 'aged_65_older', 'aged_70_older', 
                                                      'gdp_per_capita', 'extreme_poverty', 'cardiovasc_death_rate', 'hospital_beds_per_thousand',
                                                      'life_expectancy', 'human_development_index', 'location', 'continent', 'population', 
                                                      'total_cases_per_million', 'total_cases', 'new_cases_per_million',
                                                      'new_cases_smoothed_per_million', 'new_cases_smoothed', 'Rolling_Mean_new_cases'])

# Ensuring that the lag column values in test dataset are those of last values in train
USA_test_imputed['new_cases_Lag_7'][0:7] = USA_train_imputed['new_cases'][-7:]
USA_test_imputed['new_cases_Lag_14'][0:14] = USA_train_imputed['new_cases'][-14:]
USA_test_imputed["Weekend"] = USA_test_imputed["Weekend"].astype(int)

In [5]:
# Filling missing values of the lag features
def process(df, lag_values=[1]):
    ret = df[['new_cases']]
    for lag in lag_values:
        lagdf = df.shift(lag)
        lagdf.columns=[f'lag{lag}_' + str(col) for col in lagdf.columns]
        ret=pd.concat([ret, lagdf], axis=1)
    return ret.fillna(0) 
USA_train_imputed_lagged = process(USA_train_imputed, lag_values=[1,7,14,30])
USA_test_imputed_lagged = process(USA_test_imputed, lag_values=[1,7,14,30])

In [6]:
# # Filling missing values of the lag features
# def create_lag_features(df, lag_values=[1]):
#     ret = df[['new_cases']]  # Assuming 'new_cases' is your target variable

#     for lag in lag_values:
#         for column in df.columns:
#             if column != 'new_cases':  # Skip the target variable
#                 lagdf = df[column].shift(lag)
#                 lagdf.columns = [f'{column}_lag_{lag}']
#                 ret = pd.concat([ret, lagdf], axis=1)
#     return ret.fillna(0)

# # Example for USA dataset with lag values 1, 7, 14, 30
# lag_values = [1, 7, 14, 30]
# USA_train_imputed_lagged = create_lag_features(USA_train_imputed, lag_values)
# USA_test_imputed_lagged = create_lag_features(USA_test_imputed, lag_values)

In [7]:
USA_train_imputed_lagged

Unnamed: 0_level_0,new_cases,lag1_stringency_index,lag1_new_cases,lag1_total_deaths_per_million,lag1_new_deaths,lag1_total_deaths,lag1_new_deaths_per_million,lag1_total_tests_per_thousand,lag1_new_tests,lag1_total_tests,...,lag30_total_vaccinations_lag_1_lag_7_lag_14_lag_30,lag30_new_vaccinations_lag_1_lag_7_lag_14_lag_30,lag30_new_vaccinations_smoothed_lag_1_lag_7_lag_14_lag_30,lag30_new_people_vaccinated_smoothed_per_hundred_lag_1_lag_7_lag_14_lag_30,lag30_new_people_vaccinated_smoothed_lag_1_lag_7_lag_14_lag_30,lag30_new_vaccinations_smoothed_per_million_lag_1_lag_7_lag_14_lag_30,lag30_new_cases_Lag_7_lag_1_lag_7_lag_14_lag_30,lag30_new_cases_Lag_14_lag_1_lag_7_lag_14_lag_30,lag30_Rolling_Mean_new_cases_lag_1_lag_7_lag_14_lag_30,lag30_Weekend_lag_1_lag_7_lag_14_lag_30
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-22,0.0,0.00,0.0,0.000,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.000000,0
2020-01-23,0.0,0.00,0.0,0.000,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.000000,0
2020-01-24,1.0,0.00,0.0,0.000,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.000000,0
2020-01-25,0.0,0.00,1.0,0.000,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.000000,0
2020-01-26,3.0,0.00,0.0,0.000,0.0,0.0,0.000,0.000,0.0,0.0,...,0.0,0.0,0.0,0.000,0.0,0.0,0.0,0.0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-27,509081.0,47.69,176889.0,2460.174,197.0,819029.0,0.592,2144.998,961271.0,714102068.0,...,405666611.0,1020797.0,919374.0,0.070,233279.0,2769.0,120936.0,136583.0,108498.571429,False
2021-12-28,356797.0,50.46,509081.0,2465.857,1892.0,820921.0,5.683,2150.086,1694071.0,715796139.0,...,406664025.0,997414.0,897532.0,0.069,227461.0,2703.0,111325.0,127558.0,106633.500000,False
2021-12-29,499452.0,50.46,356797.0,2473.078,2404.0,823325.0,7.221,2156.937,2280780.0,718076919.0,...,407796808.0,1132783.0,874415.0,0.067,222844.0,2634.0,154598.0,176534.0,103431.928571,False
2021-12-30,589431.0,50.46,499452.0,2480.041,2318.0,825643.0,6.963,2163.853,2302224.0,720379143.0,...,408329511.0,532703.0,861123.0,0.066,219081.0,2594.0,46992.0,58974.0,101911.857143,True


In [8]:
# For training data
X_train = USA_train_imputed_lagged.drop(columns=['new_cases'])
y_train = USA_train_imputed_lagged['new_cases']

# For testing data
X_test = USA_test_imputed_lagged.drop(columns=['new_cases'])
y_test = USA_test_imputed_lagged['new_cases']

In [9]:
!pip install --upgrade scikit-learn

Requirement already up-to-date: scikit-learn in c:\users\ryanc\anaconda3\lib\site-packages (1.3.2)


In [10]:
!pip install numpy==1.17.3



In [11]:
from sklearn.model_selection import RandomizedSearchCV

# Define the hyperparameter distributions
param_dist = {
    'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.15, 0.2],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'n_estimators': [50, 100, 200, 300, 500]
}

# Create XGBoost model
xgb_model = xgb.XGBRegressor()

# Instantiate RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, n_iter=10, scoring='neg_mean_squared_error', cv=5, random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_

# Train the final model with the best hyperparameters
final_model = XGBRegressor(**best_params)
final_model.fit(X_train, y_train)

ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ryanc\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\sklearn.py", line 761, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\sklearn.py", line 286, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\sklearn.py", line 775, in <lambda>
    create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\core.py", line 506, in inner_f
    return f(**kwargs)
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\core.py", line 616, in __init__
    handle, feature_names, feature_types = dispatch_data_backend(
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\data.py", line 707, in dispatch_data_backend
    return _from_pandas_df(data, enable_categorical, missing, threads,
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\data.py", line 297, in _from_pandas_df
    data, feature_names, feature_types = _transform_pandas_df(
  File "C:\Users\ryanc\anaconda3\lib\site-packages\xgboost\data.py", line 242, in _transform_pandas_df
    raise ValueError(msg + ', '.join(bad_fields))
ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
                categorical type is supplied, DMatrix parameter `enable_categorical` must
                be set to `True`.lag1_iso_code_lag_1, lag1_location_lag_1, lag1_continent_lag_1, lag1_Weekend_lag_1, lag1_iso_code_lag_7, lag1_location_lag_7, lag1_continent_lag_7, lag1_Weekend_lag_7, lag1_iso_code_lag_1_lag_7, lag1_location_lag_1_lag_7, lag1_continent_lag_1_lag_7, lag1_Weekend_lag_1_lag_7, lag1_iso_code_lag_14, lag1_location_lag_14, lag1_continent_lag_14, lag1_Weekend_lag_14, lag1_iso_code_lag_1_lag_14, lag1_location_lag_1_lag_14, lag1_continent_lag_1_lag_14, lag1_Weekend_lag_1_lag_14, lag1_iso_code_lag_7_lag_14, lag1_location_lag_7_lag_14, lag1_continent_lag_7_lag_14, lag1_Weekend_lag_7_lag_14, lag1_iso_code_lag_1_lag_7_lag_14, lag1_location_lag_1_lag_7_lag_14, lag1_continent_lag_1_lag_7_lag_14, lag1_Weekend_lag_1_lag_7_lag_14, lag1_iso_code_lag_30, lag1_location_lag_30, lag1_continent_lag_30, lag1_Weekend_lag_30, lag1_iso_code_lag_1_lag_30, lag1_location_lag_1_lag_30, lag1_continent_lag_1_lag_30, lag1_Weekend_lag_1_lag_30, lag1_iso_code_lag_7_lag_30, lag1_location_lag_7_lag_30, lag1_continent_lag_7_lag_30, lag1_Weekend_lag_7_lag_30, lag1_iso_code_lag_1_lag_7_lag_30, lag1_location_lag_1_lag_7_lag_30, lag1_continent_lag_1_lag_7_lag_30, lag1_Weekend_lag_1_lag_7_lag_30, lag1_iso_code_lag_14_lag_30, lag1_location_lag_14_lag_30, lag1_continent_lag_14_lag_30, lag1_Weekend_lag_14_lag_30, lag1_iso_code_lag_1_lag_14_lag_30, lag1_location_lag_1_lag_14_lag_30, lag1_continent_lag_1_lag_14_lag_30, lag1_Weekend_lag_1_lag_14_lag_30, lag1_iso_code_lag_7_lag_14_lag_30, lag1_location_lag_7_lag_14_lag_30, lag1_continent_lag_7_lag_14_lag_30, lag1_Weekend_lag_7_lag_14_lag_30, lag1_iso_code_lag_1_lag_7_lag_14_lag_30, lag1_location_lag_1_lag_7_lag_14_lag_30, lag1_continent_lag_1_lag_7_lag_14_lag_30, lag1_Weekend_lag_1_lag_7_lag_14_lag_30, lag7_iso_code_lag_1, lag7_location_lag_1, lag7_continent_lag_1, lag7_Weekend_lag_1, lag7_iso_code_lag_7, lag7_location_lag_7, lag7_continent_lag_7, lag7_Weekend_lag_7, lag7_iso_code_lag_1_lag_7, lag7_location_lag_1_lag_7, lag7_continent_lag_1_lag_7, lag7_Weekend_lag_1_lag_7, lag7_iso_code_lag_14, lag7_location_lag_14, lag7_continent_lag_14, lag7_Weekend_lag_14, lag7_iso_code_lag_1_lag_14, lag7_location_lag_1_lag_14, lag7_continent_lag_1_lag_14, lag7_Weekend_lag_1_lag_14, lag7_iso_code_lag_7_lag_14, lag7_location_lag_7_lag_14, lag7_continent_lag_7_lag_14, lag7_Weekend_lag_7_lag_14, lag7_iso_code_lag_1_lag_7_lag_14, lag7_location_lag_1_lag_7_lag_14, lag7_continent_lag_1_lag_7_lag_14, lag7_Weekend_lag_1_lag_7_lag_14, lag7_iso_code_lag_30, lag7_location_lag_30, lag7_continent_lag_30, lag7_Weekend_lag_30, lag7_iso_code_lag_1_lag_30, lag7_location_lag_1_lag_30, lag7_continent_lag_1_lag_30, lag7_Weekend_lag_1_lag_30, lag7_iso_code_lag_7_lag_30, lag7_location_lag_7_lag_30, lag7_continent_lag_7_lag_30, lag7_Weekend_lag_7_lag_30, lag7_iso_code_lag_1_lag_7_lag_30, lag7_location_lag_1_lag_7_lag_30, lag7_continent_lag_1_lag_7_lag_30, lag7_Weekend_lag_1_lag_7_lag_30, lag7_iso_code_lag_14_lag_30, lag7_location_lag_14_lag_30, lag7_continent_lag_14_lag_30, lag7_Weekend_lag_14_lag_30, lag7_iso_code_lag_1_lag_14_lag_30, lag7_location_lag_1_lag_14_lag_30, lag7_continent_lag_1_lag_14_lag_30, lag7_Weekend_lag_1_lag_14_lag_30, lag7_iso_code_lag_7_lag_14_lag_30, lag7_location_lag_7_lag_14_lag_30, lag7_continent_lag_7_lag_14_lag_30, lag7_Weekend_lag_7_lag_14_lag_30, lag7_iso_code_lag_1_lag_7_lag_14_lag_30, lag7_location_lag_1_lag_7_lag_14_lag_30, lag7_continent_lag_1_lag_7_lag_14_lag_30, lag7_Weekend_lag_1_lag_7_lag_14_lag_30, lag14_iso_code_lag_1, lag14_location_lag_1, lag14_continent_lag_1, lag14_Weekend_lag_1, lag14_iso_code_lag_7, lag14_location_lag_7, lag14_continent_lag_7, lag14_Weekend_lag_7, lag14_iso_code_lag_1_lag_7, lag14_location_lag_1_lag_7, lag14_continent_lag_1_lag_7, lag14_Weekend_lag_1_lag_7, lag14_iso_code_lag_14, lag14_location_lag_14, lag14_continent_lag_14, lag14_Weekend_lag_14, lag14_iso_code_lag_1_lag_14, lag14_location_lag_1_lag_14, lag14_continent_lag_1_lag_14, lag14_Weekend_lag_1_lag_14, lag14_iso_code_lag_7_lag_14, lag14_location_lag_7_lag_14, lag14_continent_lag_7_lag_14, lag14_Weekend_lag_7_lag_14, lag14_iso_code_lag_1_lag_7_lag_14, lag14_location_lag_1_lag_7_lag_14, lag14_continent_lag_1_lag_7_lag_14, lag14_Weekend_lag_1_lag_7_lag_14, lag14_iso_code_lag_30, lag14_location_lag_30, lag14_continent_lag_30, lag14_Weekend_lag_30, lag14_iso_code_lag_1_lag_30, lag14_location_lag_1_lag_30, lag14_continent_lag_1_lag_30, lag14_Weekend_lag_1_lag_30, lag14_iso_code_lag_7_lag_30, lag14_location_lag_7_lag_30, lag14_continent_lag_7_lag_30, lag14_Weekend_lag_7_lag_30, lag14_iso_code_lag_1_lag_7_lag_30, lag14_location_lag_1_lag_7_lag_30, lag14_continent_lag_1_lag_7_lag_30, lag14_Weekend_lag_1_lag_7_lag_30, lag14_iso_code_lag_14_lag_30, lag14_location_lag_14_lag_30, lag14_continent_lag_14_lag_30, lag14_Weekend_lag_14_lag_30, lag14_iso_code_lag_1_lag_14_lag_30, lag14_location_lag_1_lag_14_lag_30, lag14_continent_lag_1_lag_14_lag_30, lag14_Weekend_lag_1_lag_14_lag_30, lag14_iso_code_lag_7_lag_14_lag_30, lag14_location_lag_7_lag_14_lag_30, lag14_continent_lag_7_lag_14_lag_30, lag14_Weekend_lag_7_lag_14_lag_30, lag14_iso_code_lag_1_lag_7_lag_14_lag_30, lag14_location_lag_1_lag_7_lag_14_lag_30, lag14_continent_lag_1_lag_7_lag_14_lag_30, lag14_Weekend_lag_1_lag_7_lag_14_lag_30, lag30_iso_code_lag_1, lag30_location_lag_1, lag30_continent_lag_1, lag30_Weekend_lag_1, lag30_iso_code_lag_7, lag30_location_lag_7, lag30_continent_lag_7, lag30_Weekend_lag_7, lag30_iso_code_lag_1_lag_7, lag30_location_lag_1_lag_7, lag30_continent_lag_1_lag_7, lag30_Weekend_lag_1_lag_7, lag30_iso_code_lag_14, lag30_location_lag_14, lag30_continent_lag_14, lag30_Weekend_lag_14, lag30_iso_code_lag_1_lag_14, lag30_location_lag_1_lag_14, lag30_continent_lag_1_lag_14, lag30_Weekend_lag_1_lag_14, lag30_iso_code_lag_7_lag_14, lag30_location_lag_7_lag_14, lag30_continent_lag_7_lag_14, lag30_Weekend_lag_7_lag_14, lag30_iso_code_lag_1_lag_7_lag_14, lag30_location_lag_1_lag_7_lag_14, lag30_continent_lag_1_lag_7_lag_14, lag30_Weekend_lag_1_lag_7_lag_14, lag30_iso_code_lag_30, lag30_location_lag_30, lag30_continent_lag_30, lag30_Weekend_lag_30, lag30_iso_code_lag_1_lag_30, lag30_location_lag_1_lag_30, lag30_continent_lag_1_lag_30, lag30_Weekend_lag_1_lag_30, lag30_iso_code_lag_7_lag_30, lag30_location_lag_7_lag_30, lag30_continent_lag_7_lag_30, lag30_Weekend_lag_7_lag_30, lag30_iso_code_lag_1_lag_7_lag_30, lag30_location_lag_1_lag_7_lag_30, lag30_continent_lag_1_lag_7_lag_30, lag30_Weekend_lag_1_lag_7_lag_30, lag30_iso_code_lag_14_lag_30, lag30_location_lag_14_lag_30, lag30_continent_lag_14_lag_30, lag30_Weekend_lag_14_lag_30, lag30_iso_code_lag_1_lag_14_lag_30, lag30_location_lag_1_lag_14_lag_30, lag30_continent_lag_1_lag_14_lag_30, lag30_Weekend_lag_1_lag_14_lag_30, lag30_iso_code_lag_7_lag_14_lag_30, lag30_location_lag_7_lag_14_lag_30, lag30_continent_lag_7_lag_14_lag_30, lag30_Weekend_lag_7_lag_14_lag_30, lag30_iso_code_lag_1_lag_7_lag_14_lag_30, lag30_location_lag_1_lag_7_lag_14_lag_30, lag30_continent_lag_1_lag_7_lag_14_lag_30, lag30_Weekend_lag_1_lag_7_lag_14_lag_30
