# Feature Selection

## Import Packages

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, mutual_info_regression

# From: https://towardsdatascience.com/time-based-cross-validation-d259b13d42b8
import timesplit as ts

# import our pipeline file [TO DO: NEEDS TO BE UPDATED TO USE OURS]
import pipeline as pline

## Import cleaned data on a Fips-date level

In [2]:
df = pd.read_csv("../Data/merged.csv", parse_dates=["date"], dtype={'fips': str})
df.head()

Unnamed: 0,state,fips,county,date,cumulative_cases,cumulative_deaths,new_cases,new_deaths,new_cases_7avg,new_deaths_7avg,...,mask_mandate,retail_rec,grocery_pharm,parks,transit,workplace,residential,new doses,cumulative doses,cases_next_week
0,IL,17001,Adams,2020-03-20,1.0,0.0,1.0,0.0,1.0,0.0,...,,-33.0,11.0,,-7.0,-22.0,13.0,0.0,0.0,0.0
1,IL,17001,Adams,2020-03-21,1.0,0.0,0.0,0.0,0.0,0.0,...,,-55.0,-14.0,,-30.0,-15.0,,0.0,0.0,0.0
2,IL,17001,Adams,2020-03-22,1.0,0.0,0.0,0.0,0.0,0.0,...,,-63.0,-42.0,,,-25.0,,0.0,0.0,0.0
3,IL,17001,Adams,2020-03-23,1.0,0.0,0.0,0.0,0.0,0.0,...,,-49.0,-20.0,,-21.0,-29.0,14.0,0.0,0.0,0.0
4,IL,17001,Adams,2020-03-24,1.0,0.0,0.0,0.0,0.0,0.0,...,,-46.0,-20.0,,,-31.0,15.0,0.0,0.0,0.0


## Some gentle data processing to get rid of any remaining NAs

In [3]:
# filter down to dates with hospital
date_mask = (df["date"]>='7-31-2020')
df = df[date_mask]

# drop grocery_pharm, parks, transit because they have too much missingness
df.drop(columns=["grocery_pharm", "parks", "transit", "residential"], inplace = True)

# fill na's with mean for that state that date for retail_rec, workplace [TO DO: IS THIS REASONABLE?]
for var in ["retail_rec", "workplace"]:
    df[var] = df[["state", "date", var]].groupby(["state", "date"]).transform(lambda x: x.fillna(x.mean()))

# fill missing masks with 0 because the missings are from MO, who doesn't have a mask mandate
df["mask_mandate"] = df["mask_mandate"].fillna(0)

# drop na's (should just be the last week for each fips)
df = df.dropna()

df.describe(datetime_is_numeric=True)

Unnamed: 0,date,cumulative_cases,cumulative_deaths,new_cases,new_deaths,new_cases_7avg,new_deaths_7avg,2weeksago_cases_7avg,2weeksago_deaths_7avg,total_pop,...,prev_day_adult_admit_60-69_7daysum,prev_day_adult_admit_70-79_7daysum,prev_day_adult_admit_80+_7daysum,prev_day_adult_admit_unknown_7daysum,mask_mandate,retail_rec,workplace,new doses,cumulative doses,cases_next_week
count,89993,89993.0,89993.0,89993.0,89993.0,89993.0,89993.0,89993.0,89993.0,89993.0,...,89993.0,89993.0,89993.0,89993.0,89993.0,89993.0,89993.0,89993.0,89993.0,89993.0
mean,2020-12-24 18:13:46.864363008,5934.58589,106.605697,30.572793,0.492066,30.632838,0.436312,30.749825,0.436234,101256.2,...,3.831109,4.095941,3.817341,0.387475,0.538409,-2.677255,-18.272608,236.060027,14752.09,30.517885
min,2020-07-31 00:00:00,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2027.0,...,0.0,0.0,0.0,0.0,0.0,-97.0,-90.0,0.0,0.0,0.0
25%,2020-10-12 00:00:00,463.0,7.0,1.0,0.0,2.0,0.0,2.0,0.0,14640.0,...,0.0,0.0,0.0,0.0,0.0,-11.0,-24.0,0.0,0.0,2.0
50%,2020-12-25 00:00:00,1463.0,26.0,6.0,0.0,7.0,0.0,7.0,0.0,32450.0,...,0.0,0.0,0.0,0.0,1.0,-3.0,-16.0,1.0,86.0,7.0
75%,2021-03-08 00:00:00,3880.0,70.0,20.0,0.0,21.0,0.0,21.0,0.0,66371.0,...,3.0,3.0,3.0,0.0,1.0,6.0,-10.0,79.0,6241.0,21.0
max,2021-05-24 00:00:00,549205.0,10745.0,6697.0,277.0,4654.0,70.0,4654.0,70.0,5198275.0,...,369.0,326.0,300.0,634.0,1.0,213.0,33.0,37855.0,2242133.0,4654.0
std,,23957.879479,491.785697,131.828853,3.143472,125.012901,2.14631,125.080547,2.145375,336693.1,...,13.680621,13.812705,13.029344,6.085959,0.498525,15.997305,12.975186,1130.896904,72432.9,124.942295


## Print out some descriptive statistics

In [None]:
pd.set_option('display.float_format', lambda x: '%.0f' % x)

# from later down in the notebook
final_features = ['date',
 'new_cases_7avg',
 '2weeksago_cases_7avg',
 'new_cases',
 'age_45_54',
 'white',
 'age_35_44',
 'cumulative_cases',
 'asian',
 'other_race',
 'prev_day_adult_admit_7daysum',
 'p_white',
 'retail_rec',
 'prev_day_adult_admit_80+_7daysum',
 'p_age_65over',
 'prev_day_adult_admit_50-59_7daysum']

descriptive_stats = df[final_features].describe().transpose()

descriptive_stats.to_csv("Descriptive Statistics of Final Features.csv")

## Split Train Test

In [4]:
# split df into features and labels
X = df.drop(columns=["cases_next_week"])
y = df["cases_next_week"]

# split df into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

# df that's just training
df_train = pd.concat([X_train, y_train], axis=1)

## Pre-process data

In [5]:
# get list of numerical features for normalization
numerical_features = X_train.columns.tolist()[4:]
numerical_features.remove('mask_mandate')
numerical_features

# normalize numerical features
X_train, X_test = pline.normalize(X_train, X_test, numerical_features)

# one hot encode categorical variables
X_train = pline.one_hot_encode(X_train, ["state", "fips"])
X_test = pline.one_hot_encode(X_test, ["state", "fips"])

# drop county cause that's the same as fips
X_train = X_train.drop(columns=["county"])
X_test = X_test.drop(columns=["county"])

## Feature Selection

### Use Variance threshold to weed out features with zero variance
"This is one of the most simple approaches to feature selection. The scikit-learn library has a method called VarianceThreshold . This method takes a threshold value and when fitted to a feature set will remove any features below this threshold. The default value for the threshold is 0 and this will remove any features with zero variance, or in other words where all values are the same."


In [6]:
X = X_train
selector = VarianceThreshold()
print("Original feature shape:", X.iloc[:, 1:].shape)
new_X = selector.fit_transform(X.iloc[:, 1:])
print("Transformed feature shape:", new_X.shape)

Original feature shape: (71994, 387)
Transformed feature shape: (71994, 387)


Looks like there are no variables with zero variance, which is good!

### Narrow down to the top 50 features with SelectKBest

In [7]:
# get k highest scoring variables
fs = SelectKBest(score_func=mutual_info_regression, k="all")

# learn relationship from training data (drop non-numerical data)
fs.fit(X_train.drop(columns=["date"]), y_train.drop(columns=["date"]))

SelectKBest(k='all',
            score_func=<function mutual_info_regression at 0x7fae1a74c280>)

In [8]:
feature_scores = pd.DataFrame({'variables': X_train.drop(columns=["date"]).columns.tolist(), 
                               'score': fs.scores_})


In [9]:
# top 50 variables
top50 = feature_scores.sort_values(by="score", ascending=False).head(50)["variables"].tolist()

In [51]:
feature_scores.sort_values(by="score", ascending=False).head(50).reset_index()

Unnamed: 0,index,variables,score
0,4,new_cases_7avg,1.277417
1,6,2weeksago_cases_7avg,0.838105
2,2,new_cases,0.823374
3,8,total_pop,0.696498
4,32,white,0.69523
5,9,male,0.692669
6,11,female,0.692615
7,19,age_35_44,0.692159
8,46,housing_units,0.691549
9,54,below_500_pov,0.690535


In [10]:
X_train = X_train[["date"] + top50]
X_test = X_test[["date"] + top50]

In [11]:
X_train.columns

Index(['date', 'new_cases_7avg', '2weeksago_cases_7avg', 'new_cases',
       'total_pop', 'white', 'male', 'female', 'age_35_44', 'housing_units',
       'below_500_pov', 'age_under14', 'age_62over', 'age_45_54', 'age_55_59',
       'age_25_34', 'below_400_pov', 'age_15_19', 'age_20_24', 'age_60_64',
       'below_300_pov', 'age_65over', 'below_200_pov', 'below_185_pov',
       'below_125_pov', 'below_150_pov', 'below_pov', 'non_white',
       'female_below_pov', 'male_below_pov', 'below_50_pov', 'hispanic',
       'black', 'asian', 'cumulative_cases', 'total_adult_hospitalizations',
       'other_race', 'native', 'prev_day_adult_admit_7daysum', 'p_non_white',
       'p_white', 'cumulative_deaths', 'retail_rec',
       'prev_day_adult_admit_70-79_7daysum',
       'prev_day_adult_admit_80+_7daysum', 'p_age_62over', 'p_black',
       'prev_day_adult_admit_60-69_7daysum', 'age_median', 'p_age_65over',
       'prev_day_adult_admit_50-59_7daysum'],
      dtype='object')

In [12]:
X_train.shape

(71994, 51)

### Set up indices for time-based CV

In [13]:
# set up time-based CV indices
tscv = ts.TimeBasedCV(train_period=21,
                      test_period=7,
                      freq='days')

tscv_indices = []

for train_index, test_index in tscv.split(X_train, date_column='date'):
    tscv_indices.append((train_index, test_index))

### Perform some additional feature selection via Lasso regularization

In [14]:
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel

In [15]:
sel_ = SelectFromModel(LassoCV(alphas=[0.2], cv=tscv_indices, max_iter=10000))
sel_.fit(X_train.drop(columns=["date"]), y_train.drop(columns=["date"]))

SelectFromModel(estimator=LassoCV(alphas=[0.2],
                                  cv=[([11, 23, 42, 44, 73, 74, 96, 106, 110,
                                        117, 118, 120, 131, 151, 157, 161, 176,
                                        209, 223, 240, 251, 255, 260, 269, 302,
                                        308, 318, 332, 356, 359, ...],
                                       [14, 33, 40, 49, 64, 225, 238, 270, 317,
                                        346, 390, 442, 501, 607, 637, 653, 674,
                                        712, 757, 763, 783, 789, 812, 948, 987,
                                        1026, 1076, 1126, 1203, 1272, ...]),
                                      ([11, 14, 23, 33, 40, 49, 64, 73, 74, 110,
                                        117, 118, 120, 151, 176...
                                        375, 483, 518, 524, 555, 586, 738, 765,
                                        799, 836, 887, 912, 916, 919, 1000,
                      

In [41]:
X_train.drop(columns=["date"]).columns[(sel_.get_support())]

Index(['new_cases_7avg', '2weeksago_cases_7avg', 'new_cases', 'white',
       'age_35_44', 'asian', 'cumulative_cases', 'other_race',
       'prev_day_adult_admit_7daysum', 'p_non_white', 'retail_rec',
       'prev_day_adult_admit_80+_7daysum', 'p_age_65over',
       'prev_day_adult_admit_50-59_7daysum'],
      dtype='object')

In [42]:
sel_.estimator_.coef_[sel_.get_support()]

array([109.99070159, -20.44589901,   9.99019268,   7.61177975,
         3.97482376,   3.5782663 , -15.28742283,   6.44537931,
        11.35418614,  -0.32638226,  -0.30196209,   3.40873046,
        -0.1871869 ,   6.08616485])

In [47]:
pd.DataFrame({'features':X_train.drop(columns=["date"]).columns[(sel_.get_support())], 'coefficients': sel_.estimator_.coef_[sel_.get_support()]}).sort_values(by='coefficients', ascending=False)

Unnamed: 0,features,coefficients
0,new_cases_7avg,109.990702
8,prev_day_adult_admit_7daysum,11.354186
2,new_cases,9.990193
3,white,7.61178
7,other_race,6.445379
13,prev_day_adult_admit_50-59_7daysum,6.086165
4,age_35_44,3.974824
5,asian,3.578266
11,prev_day_adult_admit_80+_7daysum,3.40873
12,p_age_65over,-0.187187


In [37]:
np.abs(sel_.estimator_.coef_)[np.abs(sel_.estimator_.coef_) > 0.1]

array([109.99070159,  20.44589901,   9.99019268,   7.61177975,
         3.97482376,   3.5782663 ,  15.28742283,   6.44537931,
        11.35418614,   0.32638226,   0.30196209,   3.40873046,
         0.1871869 ,   6.08616485])

In [16]:
selected_feat = X_train.drop(columns=["date"]).columns[(sel_.get_support())]
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
      np.sum(sel_.estimator_.coef_ == 0)))

total features: 51
selected features: 14
features with coefficients shrank to zero: 35


In [22]:
selected_feat

Index(['new_cases_7avg', '2weeksago_cases_7avg', 'new_cases', 'white',
       'age_35_44', 'asian', 'cumulative_cases', 'other_race',
       'prev_day_adult_admit_7daysum', 'p_non_white', 'retail_rec',
       'prev_day_adult_admit_80+_7daysum', 'p_age_65over',
       'prev_day_adult_admit_50-59_7daysum'],
      dtype='object')

In [17]:
final_features = ["date"] + selected_feat.tolist() 
X_train_final = X_train[final_features]
X_test_final = X_test[final_features]

In [18]:
final_features

['date',
 'new_cases_7avg',
 '2weeksago_cases_7avg',
 'new_cases',
 'white',
 'age_35_44',
 'asian',
 'cumulative_cases',
 'other_race',
 'prev_day_adult_admit_7daysum',
 'p_non_white',
 'retail_rec',
 'prev_day_adult_admit_80+_7daysum',
 'p_age_65over',
 'prev_day_adult_admit_50-59_7daysum']

In [54]:
df[(df["state"]=="IL") & (df["date"]=="5-1-2021")].head()

Unnamed: 0,state,fips,county,date,cumulative_cases,cumulative_deaths,new_cases,new_deaths,new_cases_7avg,new_deaths_7avg,...,prev_day_adult_admit_60-69_7daysum,prev_day_adult_admit_70-79_7daysum,prev_day_adult_admit_80+_7daysum,prev_day_adult_admit_unknown_7daysum,mask_mandate,retail_rec,workplace,new doses,cumulative doses,cases_next_week
407,IL,17001,Adams,2021-05-01,8325.0,148.0,3.0,0.0,8.0,0.0,...,4.0,3.0,0.0,0.0,0.0,-12.0,-6.0,4.0,43509.0,11.0
814,IL,17003,Alexander,2021-05-01,464.0,11.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,13.301887,-7.480519,1.0,1443.0,0.0
1239,IL,17005,Bond,2021-05-01,2018.0,30.0,2.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,13.301887,-10.0,6.0,9333.0,2.0
1657,IL,17007,Boone,2021-05-01,6582.0,83.0,14.0,0.0,15.0,0.0,...,0.0,0.0,0.0,0.0,0.0,19.0,-8.0,73.0,33870.0,10.0
2049,IL,17009,Brown,2021-05-01,696.0,12.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,13.301887,-7.480519,0.0,2796.0,1.0


## Export final train and test sets

In [31]:
X_train_final.to_csv("../Data/Train-Test Set/X_train.csv", index=False)
y_train.to_csv("../Data/Train-Test Set/y_train.csv", index=False)

X_test_final.to_csv("../Data/Train-Test Set/X_test.csv", index=False)
y_test.to_csv("../Data/Train-Test Set/y_test.csv", index=False)

In [30]:
y_train.head()

116423     6.0
102619    13.0
63714      8.0
95950     10.0
31204      3.0
Name: cases_next_week, dtype: float64