In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

pd.set_option('display.max_rows', None)

## Import Summary Results

In [2]:
# import summary results

files = ["elastic_net_fs_search.csv",
         "elastic_net_fs_search_poly_2.csv",
         "elastic_net_fs_search_poly_3.csv",
         "lasso_fs_search.csv",
         "lasso_fs_search_2.csv",
         "lasso_fs_search_3.csv",
         "ridge_fs_search.csv",
         "ridge_fs_search_poly_1.csv",
         "ridge_fs_search_poly_2.csv",
         "ridge_fs_search_poly_3.csv",
         "SVR_summary_all.csv",
         "rf_summary.csv"]

df = pd.DataFrame()

for file in files:
    
    new_df = pd.read_csv(file)
    
    if file[0:11]=="elastic_net":
        new_df["model"] = "elastic net"
        if file[-5].isnumeric():
            new_df["degree"] = file[-5]
    elif file[0:5]=="lasso":
        new_df["model"] = "lasso"
        if file[-5].isnumeric():
            new_df["degree"] = file[-5]
    elif file[0:5]=="ridge":
        new_df["model"] = "ridge"
        if file[-5].isnumeric():
            new_df["degree"] = file[-5]
    elif file[0:2]=="rf":
        new_df["model"] = "random forest"
        
    df = pd.concat([df, new_df], axis=0)

# reorder columns into a slightly more helpful order
df = df.drop(columns = ["Unnamed: 0"])
cols = df.columns.tolist()
cols = [cols[5]] + [cols[2]] + [cols[4]] + [cols[3]] + cols[0:2] + cols[6:]
df = df[cols]
df.head()

Unnamed: 0,model,neg_mean_squared_error,neg_mean_absolute_error,r2,alpha,l1_ratio,degree,kernel,C,epsilon,rf__criterion,rf__max_features,rf__max_depth,rf__min_samples_split
0,elastic net,-1201.863136,-8.908714,0.914382,0.1,0.0,,,,,,,,
1,elastic net,-1196.972767,-8.861287,0.915117,0.1,0.1,,,,,,,,
2,elastic net,-1192.719479,-8.810556,0.915888,0.1,0.2,,,,,,,,
3,elastic net,-1189.654017,-8.758637,0.916733,0.1,0.3,,,,,,,,
4,elastic net,-1189.142098,-8.706644,0.917567,0.1,0.4,,,,,,,,


In [3]:
def read_data(data, date_type=False):
    print(date_type)
    if date_type:
        df = pd.read_csv(data)
    else:
        #parser = lambda date: pd.datetime.strptime(date, '%d%b%Y')
        df = pd.read_csv(data, parse_dates=["date"], dtype={'fips': str})
    return df

X_train = read_data("../Data/Train-Test Set/X_train.csv").drop(columns="date")
X_test = read_data("../Data/Train-Test Set/X_test.csv").drop(columns="date")
y_train = read_data("../Data/Train-Test Set/y_train.csv", True)
y_test = read_data("../Data/Train-Test Set/y_test.csv", True)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

False
False
True
True
(71992, 15) (17998, 15) (71992, 1) (17998, 1)


## Determine Best Models

In [4]:
best_models = []

### Model with the best mean squared error performance

In [5]:
df.sort_values(by="neg_mean_squared_error", ascending=False).head(1)

Unnamed: 0,model,neg_mean_squared_error,neg_mean_absolute_error,r2,alpha,l1_ratio,degree,kernel,C,epsilon,rf__criterion,rf__max_features,rf__max_depth,rf__min_samples_split
36,random forest,-902.19953,-7.58276,0.949379,,,,,,,mae,auto,,2.0


In [7]:
# add model to list
best_mse_model_overall = RandomForestRegressor(criterion = "mae", max_features="auto", 
                                               max_depth=None, min_samples_split=2, verbose=2, n_jobs=-1)

best_models.append(best_mse_model_overall.fit(X_train, np.array(y_train).ravel()))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 100building tree 2 of 100
building tree 3 of 100
building tree 4 of 100

building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 42.8min


building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73 of 100
building tree 74 of 100
building tree 75 of 100
building tree 76 of 100
building tree 77 of 100
building tree 78

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 115.8min finished


### Model with the best mean absolute error performance

In [8]:
# get best neg_mean_absolute_error model
df.sort_values(by="neg_mean_absolute_error", ascending=False).head(1)

Unnamed: 0,model,neg_mean_squared_error,neg_mean_absolute_error,r2,alpha,l1_ratio,degree,kernel,C,epsilon,rf__criterion,rf__max_features,rf__max_depth,rf__min_samples_split
6,random forest,-974.920992,-7.521976,0.944697,,,,,,,mse,log2,,2.0


In [9]:
# add model to list
best_mae_model_overall = RandomForestRegressor(criterion = "mse", max_features="log2", 
                                               max_depth=None, min_samples_split=2, n_jobs=-1)

best_models.append(best_mae_model_overall.fit(X_train, np.array(y_train).ravel()))

### Model with the best r2 performance

In [10]:
# get best r2 model
df.sort_values(by="r2", ascending=False).head(1)

Unnamed: 0,model,neg_mean_squared_error,neg_mean_absolute_error,r2,alpha,l1_ratio,degree,kernel,C,epsilon,rf__criterion,rf__max_features,rf__max_depth,rf__min_samples_split
36,random forest,-902.19953,-7.58276,0.949379,,,,,,,mae,auto,,2.0


In [11]:
best_r2_model_overall = RandomForestRegressor(criterion = "mae", max_features="auto", 
                                              max_depth=None, min_samples_split=2, n_jobs=-1)

best_models.append(best_r2_model_overall.fit(X_train, np.array(y_train).ravel()))

## Since best models were all random forests, let's just add in some best performers for other models

### Best Elastic Net/Ridge/Lasso Models by R2

In [12]:
# best elastic net, ridge, or lasso models
regs = df[(df["model"]=="elastic net") | (df["model"]=="ridge") | (df["model"]=="lasso")]
regs.sort_values(by="r2", ascending=False).head(1)

Unnamed: 0,model,neg_mean_squared_error,neg_mean_absolute_error,r2,alpha,l1_ratio,degree,kernel,C,epsilon,rf__criterion,rf__max_features,rf__max_depth,rf__min_samples_split
10,ridge,-1438.062773,-8.335918,0.930791,1.0,,,,,,,,,


In [13]:
best_models.append(Ridge(alpha=1).fit(X_train, np.array(y_train).ravel()))

### Best Elastic Net/Ridge/Lasso Models by mean squared error

In [14]:
regs.sort_values(by="neg_mean_squared_error", ascending=False).head(1)

Unnamed: 0,model,neg_mean_squared_error,neg_mean_absolute_error,r2,alpha,l1_ratio,degree,kernel,C,epsilon,rf__criterion,rf__max_features,rf__max_depth,rf__min_samples_split
5,elastic net,-1188.039293,-8.649833,0.918618,0.1,0.5,,,,,,,,


In [15]:
best_models.append(ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X_train, np.array(y_train).ravel()))

### Best Elastic Net/Ridge/Lasso Models by mean absolute error

In [16]:
regs.sort_values(by="neg_mean_absolute_error", ascending=False).head(1)

Unnamed: 0,model,neg_mean_squared_error,neg_mean_absolute_error,r2,alpha,l1_ratio,degree,kernel,C,epsilon,rf__criterion,rf__max_features,rf__max_depth,rf__min_samples_split
0,lasso,-3456.759656,-8.040768,0.895513,0.1,,2,,,,,,,


In [17]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures

poly_features = PolynomialFeatures(degree=2)
best_reg_model_mae = Lasso(alpha = 1)
X_poly = poly_features.fit_transform(X_train)
best_models.append(best_reg_model_mae.fit(X_poly, y_train))

  positive)


### Best SVR model by r2

In [18]:
# best svr or svrlinear models
from sklearn import svm

svrs = df[(df["model"]=="SVR") | (df["model"]=="LinearSVR")]
svrs.sort_values(by="r2", ascending=False).head(1)

Unnamed: 0,model,neg_mean_squared_error,neg_mean_absolute_error,r2,alpha,l1_ratio,degree,kernel,C,epsilon,rf__criterion,rf__max_features,rf__max_depth,rf__min_samples_split
60,LinearSVR,-2735.567145,-8.502936,0.910665,,,,,0.1,0.0,,,,


In [19]:
best_models.append(svm.LinearSVR(C=0.1, epsilon = 0).fit(X_train, np.array(y_train).ravel()))

### Best SVR model by mse

In [20]:
svrs.sort_values(by="neg_mean_squared_error", ascending=False).head(1)

Unnamed: 0,model,neg_mean_squared_error,neg_mean_absolute_error,r2,alpha,l1_ratio,degree,kernel,C,epsilon,rf__criterion,rf__max_features,rf__max_depth,rf__min_samples_split
60,LinearSVR,-2735.567145,-8.502936,0.910665,,,,,0.1,0.0,,,,


In [21]:
best_models.append(svm.LinearSVR(C=0.1, epsilon=0).fit(X_train, np.array(y_train).ravel()))

### Best SVR model by mae

In [22]:
svrs.sort_values(by="neg_mean_absolute_error", ascending=False).head(1)

Unnamed: 0,model,neg_mean_squared_error,neg_mean_absolute_error,r2,alpha,l1_ratio,degree,kernel,C,epsilon,rf__criterion,rf__max_features,rf__max_depth,rf__min_samples_split
72,LinearSVR,-4136.596882,-7.637496,0.8812,,,,,10.0,0.0,,,,


In [23]:
best_models.append(svm.LinearSVR(C=10, epsilon = 0).fit(X_train, np.array(y_train).ravel()))



## Evaluate best models on test data

In [24]:
best_models

[RandomForestRegressor(criterion='mae', n_jobs=-1, verbose=2),
 RandomForestRegressor(max_features='log2', n_jobs=-1),
 RandomForestRegressor(criterion='mae', n_jobs=-1),
 Ridge(alpha=1),
 ElasticNet(alpha=0.1),
 Lasso(alpha=1),
 LinearSVR(C=0.1, epsilon=0),
 LinearSVR(C=0.1, epsilon=0),
 LinearSVR(C=10, epsilon=0)]

In [46]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

ev = pd.DataFrame(columns = ["Model", "MAE", "MSE", "R2", "Bias", "Variance", "RSS"])

for model in best_models:
    print(model)
    
    try:
        model.predict(X_test)
    except:
        poly_features = PolynomialFeatures(degree=2)
        model_test_X =  poly_features.fit_transform(X_test)
        model_train_X = poly_features.fit_transform(X_train)
    else:
        model_test_X = X_test
        model_train_X = X_train
    
    predictions = model.predict(model_test_X)
        
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(predictions, y_test)
    bias = mean_squared_error(model.predict(model_train_X),np.array(y_train))
    rss = np.sum((predictions-np.array(y_test))**2)
    variance = model.score(model_train_X, y_train)
    
    print("MAE: ", mae)
    print("MSE: ", mse)
    print("Bias:", bias)
    print("RSS:", rss)
    print("Variance:", variance)
    print("R-squared:", r2)
    
    ev = ev.append({"Model": str(model), 
                        "MAE": mae,
                        "MSE": mse,
                        "R2": r2,
                        "Bias": bias,
                        "Variance": variance,
                        "RSS": rss},
                        ignore_index = True)

    
    print("\n")
    
ev

RandomForestRegressor(criterion='mae', n_jobs=-1, verbose=2)


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    1.0s finished


MAE:  3.4395352261362375
MSE:  151.31379286448492
Bias: 28.005348365443382
RSS: 7070169278824.893
Variance: 0.9983280089520227
R-squared: 0.9860034233148023


RandomForestRegressor(max_features='log2', n_jobs=-1)
MAE:  3.2644205513575586
MSE:  133.1907369536771
Bias: 29.927885876837497
RSS: 7057432259727.804
Variance: 0.9982132285369925
R-squared: 0.9876348441492375


RandomForestRegressor(criterion='mae', n_jobs=-1)
MAE:  3.4483098122013556
MSE:  150.13919597733084
Bias: 26.84512951821036
RSS: 7069672953412.513
Variance: 0.9983972769897187
R-squared: 0.9861101092311761


Ridge(alpha=1)
MAE:  6.87074537029446
MSE:  580.5369190442858
Bias: 787.2850081891063
RSS: 6926396948917.307
Variance: 0.9529970679628048
R-squared: 0.9440009082746814


ElasticNet(alpha=0.1)
MAE:  7.628666559816443
MSE:  671.0196120810443
Bias: 910.2198050063205
RSS: 6874424930515.0205
Variance: 0.9456575456301022
R-squared: 0.9342551984116745


Lasso(alpha=1)
MAE:  6.655139470486812
MSE:  363.33751926134255
Bias: 43

Unnamed: 0,Model,MAE,MSE,R2,Bias,Variance,RSS
0,"RandomForestRegressor(criterion='mae', n_jobs=...",3.439535,151.313793,0.986003,28.005348,0.998328,7070169000000.0
1,"RandomForestRegressor(max_features='log2', n_j...",3.264421,133.190737,0.987635,29.927886,0.998213,7057432000000.0
2,"RandomForestRegressor(criterion='mae', n_jobs=-1)",3.44831,150.139196,0.98611,26.84513,0.998397,7069673000000.0
3,Ridge(alpha=1),6.870745,580.536919,0.944001,787.285008,0.952997,6926397000000.0
4,ElasticNet(alpha=0.1),7.628667,671.019612,0.934255,910.219805,0.945658,6874425000000.0
5,Lasso(alpha=1),6.655139,363.337519,0.964089,438.527049,0.973819,6845687000000.0
6,"LinearSVR(C=0.1, epsilon=0)",6.259563,634.618763,0.929387,952.810836,0.943115,6480377000000.0
7,"LinearSVR(C=0.1, epsilon=0)",6.258521,634.816966,0.929281,953.740204,0.943059,6476959000000.0
8,"LinearSVR(C=10, epsilon=0)",6.134381,587.378296,0.93724,868.715776,0.948135,6600468000000.0


In [47]:
ev.to_csv("Best_Models_Scores.csv")

In [48]:
ev.sort_values(by="MAE")

Unnamed: 0,Model,MAE,MSE,R2,Bias,Variance,RSS
1,"RandomForestRegressor(max_features='log2', n_j...",3.264421,133.190737,0.987635,29.927886,0.998213,7057432000000.0
0,"RandomForestRegressor(criterion='mae', n_jobs=...",3.439535,151.313793,0.986003,28.005348,0.998328,7070169000000.0
2,"RandomForestRegressor(criterion='mae', n_jobs=-1)",3.44831,150.139196,0.98611,26.84513,0.998397,7069673000000.0
8,"LinearSVR(C=10, epsilon=0)",6.134381,587.378296,0.93724,868.715776,0.948135,6600468000000.0
7,"LinearSVR(C=0.1, epsilon=0)",6.258521,634.816966,0.929281,953.740204,0.943059,6476959000000.0
6,"LinearSVR(C=0.1, epsilon=0)",6.259563,634.618763,0.929387,952.810836,0.943115,6480377000000.0
5,Lasso(alpha=1),6.655139,363.337519,0.964089,438.527049,0.973819,6845687000000.0
3,Ridge(alpha=1),6.870745,580.536919,0.944001,787.285008,0.952997,6926397000000.0
4,ElasticNet(alpha=0.1),7.628667,671.019612,0.934255,910.219805,0.945658,6874425000000.0


The best model is the RandomForestRegressor with max_features="log2"!

## Get feature importance of best model

In [50]:
best_models[1]

RandomForestRegressor(max_features='log2', n_jobs=-1)

In [39]:
feature_importance = pd.DataFrame({"features": X_train.columns.to_list(), 
                                   "importance": best_models[1].feature_importances_ })

feature_importance.sort_values(by="importance", ascending = False)

Unnamed: 0,features,importance
0,new_cases_7avg,0.181794
2,new_cases,0.165428
9,prev_day_adult_admit_7daysum,0.163725
14,prev_day_adult_admit_50-59_7daysum,0.133459
12,prev_day_adult_admit_80+_7daysum,0.102326
1,2weeksago_cases_7avg,0.074988
6,cumulative_cases,0.059072
5,age_35_44,0.033101
8,other_race,0.025334
4,white,0.020545
