In [120]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit


# Read the county dataset and use the parsed 'date' column as the row index.
data = pd.read_csv('variable_selected_county_dataset.csv')
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')

# Add the target shifted down by 7, 14 and 21 rows as extra feature columns.
# NOTE(review): shift() moves values by row position over the whole frame. If
# the file stacks several entities (the 'entity_...' dummy columns used further
# down suggest it does), a lag can pick up another entity's value -- confirm
# whether a per-entity shift is intended.
lags = [7, 14, 21]
for lag in lags:
    lag_name = f'lag{lag}incremental_cases'
    data[lag_name] = data['incremental_cases'].shift(lag)

# One-hot encode every object-dtype column; drop_first=True omits the dummy
# for the first level of each encoded column.
categorical_columns = data.select_dtypes(include='object').columns
data = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
)


split_date = '2020-10-25'

# Label slicing on the index includes BOTH endpoints, so data[:split_date] and
# data[split_date:] would each contain the split-date rows and leak them from
# train into test. One boolean mask gives a clean partition instead: rows dated
# on or before the split date go to train, everything later goes to test.
in_train = data.index.normalize() <= pd.Timestamp(split_date)
train = data[in_train]
test = data[~in_train]

# Separate the target column from the feature columns for each partition.
target_col = 'incremental_cases'
X_train = train.drop(columns=[target_col])
y_train = train[target_col]
X_test = test.drop(columns=[target_col])
y_test = test[target_col]


In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from xgboost import XGBRegressor


# Candidate values sampled by the randomized hyperparameter search.
param_dist = dict(
    n_estimators=np.arange(100, 2001, 100),      # 100, 200, ..., 2000
    learning_rate=np.linspace(0.01, 0.6, 30),    # 30 evenly spaced values
    max_depth=np.arange(3, 20),                  # 3 .. 19
    subsample=np.linspace(0.5, 1.0, 20),         # 20 evenly spaced values
    colsample_bytree=np.linspace(0.4, 1.0, 20),  # 20 evenly spaced values
    min_child_weight=np.arange(1, 13),           # 1 .. 12
    gamma=np.linspace(0, 5, 20),                 # 20 evenly spaced values
    reg_lambda=np.linspace(0, 10, 20),           # 20 evenly spaced values
)
from sklearn.metrics import mean_squared_error


# Base regressor whose hyperparameters are tuned by the search below.
xgb_model = XGBRegressor(objective='reg:squarederror')

# Three-fold splitter that keeps each validation fold after its training rows
# (in row order).
tscv = TimeSeriesSplit(n_splits=3)

# Try 200 random combinations drawn from param_dist, scored by negated MSE.
search_options = {
    'estimator': xgb_model,
    'param_distributions': param_dist,
    'n_iter': 200,
    'scoring': 'neg_mean_squared_error',
    'cv': tscv,
    'random_state': 42,
    'n_jobs': -1,
}
random_search = RandomizedSearchCV(**search_options)

random_search.fit(X_train, y_train)


In [None]:

# Winning parameter combination reported by the randomized search.
best_params = random_search.best_params_
print("Best Hyperparameters:", best_params)

# The search scores with negated MSE, so flip the sign before taking the root.
best_mse = -random_search.best_score_
best_score = np.sqrt(best_mse)
print("Best RMSE:", best_score)


In [None]:

# Train a fresh regressor on the full training partition using the tuned
# hyperparameters.
final_model = XGBRegressor(**best_params).fit(X_train, y_train)

# Predict on both partitions with the fitted model.
predictions_train, predictions_test = (
    final_model.predict(features) for features in (X_train, X_test)
)

# Mean squared error on each partition.
train_mse = mean_squared_error(y_true=y_train, y_pred=predictions_train)
test_mse = mean_squared_error(y_true=y_test, y_pred=predictions_test)

print(f"Train MSE: {train_mse}")
print(f"Test MSE: {test_mse}")


In [None]:
# Wrap the training-set predictions in a one-column frame ('yhat') that shares
# the training rows' index, then preview the first rows.
train_pred = pd.DataFrame(
    {'yhat': final_model.predict(X_train)},
    index=train.index,
)
train_pred.head()

In [None]:
# Plot actual vs. predicted training values on one set of axes.
# `plt` is never imported in this notebook, so label the legend through the
# Axes object that DataFrame.plot() returns instead of through pyplot.
ax = pd.concat([y_train, train_pred['yhat']], axis=1).plot(figsize=(15, 5))
ax.legend(['train', 'train_pred'])

In [None]:
# Test-set predictions as a one-column frame aligned with the test rows.
test_pred = pd.DataFrame(final_model.predict(X_test), columns=['yhat'], index=test.index)

# Plot actual vs. predicted test values.
# `plt` is never imported in this notebook, so label the legend through the
# Axes object that DataFrame.plot() returns instead of through pyplot.
ax = pd.concat([y_test, test_pred['yhat']], axis=1).plot()
ax.legend(['test', 'test_pred'])

In [None]:
# Show every column whose name contains 'entity'.
list(filter(lambda name: 'entity' in name, data.columns))

In [None]:
# Reload the dataset and rebuild the same features as before, then keep only
# the rows flagged by the 'entity_New York-New York City' dummy column.
data = pd.read_csv('variable_selected_county_dataset.csv')
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')

# Target shifted down by 7, 14 and 21 rows, computed BEFORE the entity filter.
# NOTE(review): shift() works by row position over the whole frame, so a lag
# may come from a different entity's rows -- confirm this is intended.
lags = [7, 14, 21]
for lag in lags:
    lag_name = f'lag{lag}incremental_cases'
    data[lag_name] = data['incremental_cases'].shift(lag)

# One-hot encode every object-dtype column; drop_first=True omits the dummy
# for the first level of each encoded column.
categorical_columns = data.select_dtypes(include='object').columns
data = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
)
data = data.loc[data['entity_New York-New York City'].eq(1)]



In [None]:

split_date = '2020-10-25'

# Label slicing on the index includes BOTH endpoints, so data[:split_date] and
# data[split_date:] would each contain the split-date rows. One boolean mask
# gives a clean partition instead: rows dated on or before the split date go
# to train, everything later goes to test.
in_train = data.index.normalize() <= pd.Timestamp(split_date)
train = data[in_train]
test = data[~in_train]

# Separate the target column from the feature columns for each partition.
target_col = 'incremental_cases'
X_train = train.drop(columns=[target_col])
y_train = train[target_col]
X_test = test.drop(columns=[target_col])
y_test = test[target_col]


In [None]:
# Predict on the current training rows with the previously fitted final_model
# and plot actual vs. predicted values.
train_pred = pd.DataFrame(final_model.predict(X_train), columns=['yhat'], index=train.index)

# `plt` is never imported in this notebook, so label the legend through the
# Axes object that DataFrame.plot() returns instead of through pyplot.
# (The mid-cell train_pred.head() was dropped: only a cell's last expression
# is displayed, so it had no effect.)
ax = pd.concat([y_train, train_pred['yhat']], axis=1).plot(figsize=(15, 5))
ax.legend(['train', 'train_pred'])

In [None]:
# Predict on the current test rows with the previously fitted final_model and
# plot actual vs. predicted values.
test_pred = pd.DataFrame(final_model.predict(X_test), columns=['yhat'], index=test.index)

# `plt` is never imported in this notebook, so label the legend through the
# Axes object that DataFrame.plot() returns instead of through pyplot.
ax = pd.concat([y_test, test_pred['yhat']], axis=1).plot()
ax.legend(['test', 'test_pred'])

In [None]:
# Pass the 1-D 'yhat' column rather than the one-column DataFrame so actuals
# and predictions have the same shape and are compared element by element.
# NOTE: calculate_mape is defined in a later cell of this notebook; that cell
# must be run before this one.
calculate_mape(y_test, test_pred['yhat'])

In [None]:
# Reload the dataset and rebuild the same features as before, then keep only
# the rows flagged by the 'entity_Illinois-Cook' dummy column.
data = pd.read_csv('variable_selected_county_dataset.csv')
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')

# Target shifted down by 7, 14 and 21 rows, computed BEFORE the entity filter.
# NOTE(review): shift() works by row position over the whole frame, so a lag
# may come from a different entity's rows -- confirm this is intended.
lags = [7, 14, 21]
for lag in lags:
    lag_name = f'lag{lag}incremental_cases'
    data[lag_name] = data['incremental_cases'].shift(lag)

# One-hot encode every object-dtype column; drop_first=True omits the dummy
# for the first level of each encoded column.
categorical_columns = data.select_dtypes(include='object').columns
data = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
)
data = data.loc[data['entity_Illinois-Cook'].eq(1)]



In [None]:

split_date = '2020-10-25'

# Label slicing on the index includes BOTH endpoints, so data[:split_date] and
# data[split_date:] would each contain the split-date rows. One boolean mask
# gives a clean partition instead: rows dated on or before the split date go
# to train, everything later goes to test.
in_train = data.index.normalize() <= pd.Timestamp(split_date)
train = data[in_train]
test = data[~in_train]

# Separate the target column from the feature columns for each partition.
target_col = 'incremental_cases'
X_train = train.drop(columns=[target_col])
y_train = train[target_col]
X_test = test.drop(columns=[target_col])
y_test = test[target_col]

In [None]:
# Predict on the current training rows with the previously fitted final_model
# and plot actual vs. predicted values.
train_pred = pd.DataFrame(final_model.predict(X_train), columns=['yhat'], index=train.index)

# `plt` is never imported in this notebook, so label the legend through the
# Axes object that DataFrame.plot() returns instead of through pyplot.
# (The mid-cell train_pred.head() was dropped: only a cell's last expression
# is displayed, so it had no effect.)
ax = pd.concat([y_train, train_pred['yhat']], axis=1).plot(figsize=(15, 5))
ax.legend(['train', 'train_pred'])

In [None]:
# Predict on the current test rows with the previously fitted final_model and
# plot actual vs. predicted values.
test_pred = pd.DataFrame(final_model.predict(X_test), columns=['yhat'], index=test.index)

# `plt` is never imported in this notebook, so label the legend through the
# Axes object that DataFrame.plot() returns instead of through pyplot.
ax = pd.concat([y_test, test_pred['yhat']], axis=1).plot()
ax.legend(['test', 'test_pred'])

In [None]:
# Pass the 1-D 'yhat' column rather than the one-column DataFrame so actuals
# and predictions have the same shape and are compared element by element.
# NOTE: calculate_mape is defined in a later cell of this notebook; that cell
# must be run before this one.
calculate_mape(y_test, test_pred['yhat'])


In [None]:
def calculate_mape(y_true, y_pred):
    """
    Calculate the mean absolute percentage error (MAPE).

    Both inputs are flattened to 1-D before they are compared, so a Series of
    actuals and a single-column DataFrame of predictions are matched element
    by element instead of being broadcast into an n-by-n matrix.
    Observations whose actual value is zero are excluded, because their
    percentage error is undefined.

    Args:
        y_true (array-like): Actual values.
        y_pred (array-like): Predicted values, with the same number of
            elements as ``y_true``.

    Returns:
        float: MAPE in percent over the non-zero actuals, or NaN when every
        actual value is zero (or the input is empty).

    Raises:
        ValueError: If the inputs do not hold the same number of elements.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements, "
            f"got {y_true.size} and {y_pred.size}"
        )

    non_zero_indices = y_true != 0
    if not non_zero_indices.any():
        # Nothing to average: avoid NumPy's empty-mean warning.
        return float('nan')

    actual = y_true[non_zero_indices]
    predicted = y_pred[non_zero_indices]
    mape = np.mean(np.abs((actual - predicted) / actual)) * 100
    return mape


In [None]:
# Reload the dataset and rebuild the same features as before, then keep only
# the rows flagged by the 'entity_California-Los Angeles' dummy column.
data = pd.read_csv('variable_selected_county_dataset.csv')
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')

# Target shifted down by 7, 14 and 21 rows, computed BEFORE the entity filter.
# NOTE(review): shift() works by row position over the whole frame, so a lag
# may come from a different entity's rows -- confirm this is intended.
lags = [7, 14, 21]
for lag in lags:
    lag_name = f'lag{lag}incremental_cases'
    data[lag_name] = data['incremental_cases'].shift(lag)

# One-hot encode every object-dtype column. drop_first=True omits the dummy
# for the first level of each encoded column, so the filter below raises
# KeyError if this entity happens to be that dropped level.
categorical_columns = data.select_dtypes(include='object').columns
data = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
)
data = data.loc[data['entity_California-Los Angeles'].eq(1)]



In [None]:
split_date = '2020-10-25'

# Label slicing on the index includes BOTH endpoints, so data[:split_date] and
# data[split_date:] would each contain the split-date rows. One boolean mask
# gives a clean partition instead: rows dated on or before the split date go
# to train, everything later goes to test.
in_train = data.index.normalize() <= pd.Timestamp(split_date)
train = data[in_train]
test = data[~in_train]

# Separate the target column from the feature columns for each partition.
target_col = 'incremental_cases'
X_train = train.drop(columns=[target_col])
y_train = train[target_col]
X_test = test.drop(columns=[target_col])
y_test = test[target_col]


In [None]:
# Predict on the current training rows with the previously fitted final_model
# and plot actual vs. predicted values.
train_pred = pd.DataFrame(final_model.predict(X_train), columns=['yhat'], index=train.index)

# `plt` is never imported in this notebook, so label the legend through the
# Axes object that DataFrame.plot() returns instead of through pyplot.
# (The mid-cell train_pred.head() was dropped: only a cell's last expression
# is displayed, so it had no effect.)
ax = pd.concat([y_train, train_pred['yhat']], axis=1).plot(figsize=(15, 5))
ax.legend(['train', 'train_pred'])

In [None]:
# Predict on the current test rows with the previously fitted final_model and
# plot actual vs. predicted values.
test_pred = pd.DataFrame(final_model.predict(X_test), columns=['yhat'], index=test.index)

# `plt` is never imported in this notebook, so label the legend through the
# Axes object that DataFrame.plot() returns instead of through pyplot.
ax = pd.concat([y_test, test_pred['yhat']], axis=1).plot()
ax.legend(['test', 'test_pred'])

In [None]:
# Pass the 1-D 'yhat' column rather than the one-column DataFrame so actuals
# and predictions have the same shape and are compared element by element.
calculate_mape(y_test, test_pred['yhat'])

In [None]:
# NOTE(review): the original pd.DataFrame(...) call had no data argument, which
# is a syntax error. A default-parameter XGBRegressor fitted on the current
# training rows is assumed to be the intended baseline -- confirm.
baseline_model = XGBRegressor(n_jobs=-1, random_state=42).fit(X_train, y_train)
train_pred = pd.DataFrame(
    baseline_model.predict(X_train),
    columns=['yhat'],
    index=train.index,
)

# `plt` is never imported in this notebook, so label the legend through the
# Axes object that DataFrame.plot() returns instead of through pyplot.
ax = pd.concat([y_train, train_pred['yhat']], axis=1).plot(figsize=(15, 5))
ax.legend(['train', 'train_pred'])

In [None]:
# A freshly constructed XGBRegressor cannot predict until it has been fitted,
# so train the default-parameter baseline on the current training rows first.
baseline_model = XGBRegressor(n_jobs=-1, random_state=42).fit(X_train, y_train)
test_pred = pd.DataFrame(
    baseline_model.predict(X_test),
    columns=['yhat'],
    index=test.index,
)

# `plt` is never imported in this notebook, so label the legend through the
# Axes object that DataFrame.plot() returns instead of through pyplot.
ax = pd.concat([y_test, test_pred['yhat']], axis=1).plot()
ax.legend(['test', 'test_pred'])