In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import make_scorer
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import cross_validate


from sklearn import metrics
import glob, os
import tqdm

import warnings

warnings.filterwarnings('ignore')

In [2]:
def mape(y_true, y_predict):
  """Return the mean absolute percentage error as a fraction (not scaled by 100).

  Every residual is divided by its true value, the absolute value is taken,
  and the results are averaged over all elements.

  Caveat: a true value of 0 makes the ratio undefined, so the result
  degenerates to inf/nan. This is inherent to MAPE as a metric and is
  deliberately left unhandled here.
  """
  actual = np.array(y_true)
  predicted = np.array(y_predict)
  relative_error = (actual - predicted) / actual
  return np.mean(np.abs(relative_error))

def rmse(y_true, y_predict):
  """Return the root mean squared error between true and predicted values.

  Both inputs are converted to numpy arrays, the mean squared error is
  computed with sklearn's `metrics.mean_squared_error`, and its square root
  is returned. There is no division by the true values here, so zeros in
  `y_true` are not a special case for this metric.
  """
  actual = np.array(y_true)
  predicted = np.array(y_predict)
  mean_sq_error = metrics.mean_squared_error(actual, predicted)
  return np.sqrt(mean_sq_error)




In [3]:
def ComputeRegressions(odf, types, model):
    """Cross-validate `model` on several feature subsets and report mean errors.

    Parameters
    ----------
    odf : pandas.DataFrame
        Must contain a 'target' column and every column listed in `types`.
    types : dict
        Maps a label (e.g. 'both') to the list of feature column names to use.
    model : scikit-learn regressor
        Estimator handed to the cross-validation routine for each subset.

    Returns
    -------
    dict
        For every label, '<label>_mape' and '<label>_rmse': the mean over the
        10 shuffled folds, measured on the min-max scaled target shifted by 0.01.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    cv = KFold(n_splits=10, random_state=1, shuffle=True)

    # greater_is_better=True keeps the scorer output equal to the raw error
    # (no sign flip). That is fine for reporting, which is all this function
    # does; these scorers must not be reused for model selection.
    scoring = {
        'mape': make_scorer(mape, greater_is_better=True),
        'rmse': make_scorer(rmse, greater_is_better=True),
    }

    # The target does not depend on the feature subset, so scale it once.
    # The +0.01 shift keeps the scaled minimum away from 0, where MAPE divides by zero.
    y = minmax_scale(odf['target'].values) + 0.01

    result_dict = {}
    for key, values in types.items():
        # NOTE(review): features (and the target above) are scaled on the full
        # data before the folds are split, so each test fold influences the
        # scaling seen in training. Kept as-is to preserve reported numbers.
        X = scaler.fit_transform(odf[values].values)

        # One cross-validation pass scores both metrics on the same fitted
        # models, instead of refitting every fold once per metric.
        scores = cross_validate(model, X, y, scoring=scoring, cv=cv, n_jobs=-1)

        result_dict[key + '_mape'] = scores['test_mape'].mean()
        result_dict[key + '_rmse'] = scores['test_rmse'].mean()
    return result_dict
        


In [4]:
# Accumulates one result row per (state file, model, wave), keyed by a running integer.
ndf = {}
k = 0

# Regressors to compare; modelName[j] is the display label of modelArray[j].
modelArray = [LinearRegression(n_jobs=-1),
              RandomForestRegressor(max_depth=3, n_jobs=-1),
              GradientBoostingRegressor(max_depth=3, random_state=123, max_features='sqrt', subsample=0.8),
              SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1),
              # (3,) is a real one-element tuple (one hidden layer of 3 units);
              # a bare (3) is just the integer 3.
              MLPRegressor(hidden_layer_sizes=(3,), activation='tanh', solver='lbfgs', max_iter=1000)
             ]
modelName = ['Linear Regression', 'Random Forest', 'Gradient Boost', 'SVM', 'MLP']

# Wave boundaries: First = [2020-03-01, 2020-07-01], Second = [2020-07-01, 2020-10-01],
# Third = from 2020-10-01 onwards, All = from 2020-03-01 onwards.
# NOTE(review): both ends of First/Second are inclusive, so a boundary date
# is counted in two consecutive waves -- confirm this is intended.
timestamps = [pd.Timestamp(2020, 3, 1), pd.Timestamp(2020, 7, 1), pd.Timestamp(2020, 10, 1)]
wave = ['First', 'Second', 'Third', 'All']
path = 'StateFiles'

# Feature subsets evaluated for every model; identical for every file, so built once.
columns_ext_int = ['Int_risk_c', 'Ext_risk_c', 'cases_pc']
columns_ext = ['Ext_risk_c', 'cases_pc']
columns_int = ['Int_risk_c', 'cases_pc']
columns_cas = ['cases_pc']
types = {'both': columns_ext_int, 'ext_only': columns_ext, 'int_only': columns_int, 'none': columns_cas}


def _wave_mask(dates, i):
    """Return a boolean mask over `dates` (datetime64 Series) for wave index `i`."""
    if i == 2:
        return dates >= timestamps[i]
    if i < 2:
        return (dates >= timestamps[i]) & (dates <= timestamps[i + 1])
    return dates >= timestamps[0]


# Only regular .csv files are processed (scandir also yields directories and
# stray files); sorting by name makes the row order reproducible across runs.
with os.scandir(path) as entries:
    state_files = sorted(
        (entry for entry in entries if entry.is_file() and entry.name.endswith('.csv')),
        key=lambda entry: entry.name,
    )

for entry in state_files:
    fname = entry.name
    fips = fname.replace('CombinedDF_', '').replace('.csv', '')

    # Read and prepare each file once; this does not depend on model or wave.
    df = pd.read_csv(entry.path)
    # Target is cases_pc 14 rows ahead (14 days if there is one row per day -- TODO confirm).
    df['target'] = df['cases_pc'].shift(-14)

    # Compare on a datetime64 Series: comparing datetime.date objects against
    # pd.Timestamp is deprecated in pandas 1.x and raises TypeError in pandas >= 2.0.
    # normalize() drops the time of day, matching the previous date-only comparison.
    dates = pd.to_datetime(df['date']).dt.normalize()
    df['date'] = dates.dt.date

    wave_frames = []
    for i in range(len(wave)):
        odf = df[_wave_mask(dates, i)]
        # The first column is dropped (presumably a saved index column -- TODO confirm).
        # Missing values become 0, then rows whose target is 0 are removed; this
        # also removes the trailing rows whose target is NaN because of the shift.
        odf = odf[odf.columns[1:]].fillna(0)
        odf = odf[odf['target'] != 0]
        wave_frames.append(odf)

    for j, model in enumerate(modelArray):
        for i, odf in enumerate(wave_frames):
            row = {'fips': fips, 'Model': modelName[j], 'Wave': wave[i]}
            row.update(ComputeRegressions(odf, types, model))
            ndf[k] = row
            k = k + 1


In [5]:
# One table row per entry of ndf (its integer keys become the index),
# written to disk without that index.
cdf = pd.DataFrame.from_dict(ndf, orient='index')
cdf.to_csv(path_or_buf='completeAnalysis_Cases_INTEXT_10192021.csv', index=False)

In [6]:
# Average every error metric per (Model, Wave) combination.
# numeric_only=True explicitly leaves out the string 'fips' column: pandas >= 2.0
# raises TypeError when asked for the mean of a non-numeric column, whereas older
# versions dropped it silently (with a warning this notebook suppresses).
cdf_consolidated = cdf.groupby(['Model', 'Wave']).mean(numeric_only=True).reset_index()
cdf_consolidated.to_csv('CompleteAnalysis_Waves_Cases_INTEXT_10192021.csv', index=False)