# Feature Selection

## Import Packages

In [83]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, mutual_info_regression

# import our pipeline file [TO DO: NEEDS TO BE UPDATED TO USE OURS]
import pipeline as pline

## Import cleaned data on a Fips-date level

In [84]:
df = pd.read_csv("../Data/merged.csv", parse_dates=["date"], dtype={'fips': str})
df.head()

Unnamed: 0,state,fips,county,date,cumulative_cases,cumulative_deaths,new_cases,new_deaths,new_cases_7avg,new_deaths_7avg,...,mask_mandate,retail_rec,grocery_pharm,parks,transit,workplace,residential,new doses,cumulative doses,cases_next_week
0,IL,17001,Adams,2020-03-20,1.0,0.0,1.0,0.0,1.0,0.0,...,,-33.0,11.0,,-7.0,-22.0,13.0,0.0,0.0,0.0
1,IL,17001,Adams,2020-03-21,1.0,0.0,0.0,0.0,0.0,0.0,...,,-55.0,-14.0,,-30.0,-15.0,,0.0,0.0,0.0
2,IL,17001,Adams,2020-03-22,1.0,0.0,0.0,0.0,0.0,0.0,...,,-63.0,-42.0,,,-25.0,,0.0,0.0,0.0
3,IL,17001,Adams,2020-03-23,1.0,0.0,0.0,0.0,0.0,0.0,...,,-49.0,-20.0,,-21.0,-29.0,14.0,0.0,0.0,0.0
4,IL,17001,Adams,2020-03-24,1.0,0.0,0.0,0.0,0.0,0.0,...,,-46.0,-20.0,,,-31.0,15.0,0.0,0.0,0.0


## Some gentle data processing to get rid of any remaining NAs

In [85]:
# filter down to dates with hospital
date_mask = (df["date"]>='7-31-2020')
df = df[date_mask]

# drop grocery_pharm, parks, transit because they have too much missingness
df.drop(columns=["grocery_pharm", "parks", "transit", "residential"], inplace = True)

# fill na's with mean for that state that date for retail_rec, workplace [TO DO: IS THIS REASONABLE?]
for var in ["retail_rec", "workplace"]:
    df[var] = df[["state", "date", var]].groupby(["state", "date"]).transform(lambda x: x.fillna(x.mean()))

# fill missing masks with 0 because the missings are from MO, who doesn't have a mask mandate
df["mask_mandate"] = df["mask_mandate"].fillna(0)

# drop na's (should just be the last week for each fips)
df = df.dropna()

df.describe()

Unnamed: 0,cumulative_cases,cumulative_deaths,new_cases,new_deaths,new_cases_7avg,new_deaths_7avg,2weeksago_cases_7avg,2weeksago_deaths_7avg,total_pop,male,...,prev_day_adult_admit_60-69_7daysum,prev_day_adult_admit_70-79_7daysum,prev_day_adult_admit_80+_7daysum,prev_day_adult_admit_unknown_7daysum,mask_mandate,retail_rec,workplace,new doses,cumulative doses,cases_next_week
count,75868.0,75868.0,75868.0,75868.0,75868.0,75868.0,75868.0,75868.0,75868.0,75868.0,...,75868.0,75868.0,75868.0,75868.0,75868.0,75868.0,75868.0,75868.0,75868.0,75868.0
mean,5157.038956,93.936726,33.259596,0.545718,33.25261,0.490431,33.16306,0.488717,101064.9,49578.21,...,4.017741,4.449913,4.197409,0.431789,0.621079,-5.193752,-19.133926,164.652027,6151.541,33.395213
std,21473.510024,455.881761,139.108506,3.389662,132.495562,2.312191,132.189254,2.311772,336618.1,163332.4,...,14.216432,14.628187,13.88174,6.612432,0.485122,15.37665,13.507123,854.820695,32841.97,132.947069
min,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2027.0,976.0,...,0.0,0.0,0.0,0.0,0.0,-97.0,-90.0,0.0,0.0,0.0
25%,380.0,5.0,2.0,0.0,3.0,0.0,3.0,0.0,14640.0,7388.0,...,0.0,0.0,0.0,0.0,0.0,-12.530612,-26.0,0.0,0.0,3.0
50%,1231.0,21.0,7.0,0.0,8.0,0.0,8.0,0.0,32295.0,15723.0,...,0.0,0.0,0.0,0.0,1.0,-5.0,-17.0,0.0,0.0,8.0
75%,3351.0,60.0,22.0,0.0,23.0,0.0,23.0,0.0,66371.0,33474.0,...,3.0,3.0,3.0,0.0,1.0,2.833333,-10.057143,40.0,2062.0,23.0
max,503169.0,10257.0,6697.0,277.0,4654.0,70.0,4654.0,70.0,5198275.0,2522949.0,...,369.0,326.0,300.0,634.0,1.0,213.0,33.0,31335.0,1257623.0,4654.0


## Split Train Test

In [86]:
# split df into features and labels
X = df.drop(columns=["cases_next_week"])
y = df["cases_next_week"]

# split df into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

# df that's just training
df_train = pd.concat([X_train, y_train], axis=1)

## Pre-process data

In [88]:
# get list of numerical features for normalization
numerical_features = X_train.columns.tolist()[4:]
numerical_features.remove('mask_mandate')
numerical_features

# normalize numerical features
X_train, X_test = pline.normalize(X_train, X_test, numerical_features)

# one hot encode categorical variables
X_train = pline.one_hot_encode(X_train, ["state", "fips"])
X_test = pline.one_hot_encode(X_test, ["state", "fips"])

# drop county cause that's the same as fips
X_train = X_train.drop(columns=["county"])
X_test = X_test.drop(columns=["county"])

## Feature Selection

### Use Variance threshold to weed out features with zero variance
"This is one of the most simple approaches to feature selection. The scikit-learn library has a method called VarianceThreshold . This method takes a threshold value and when fitted to a feature set will remove any features below this threshold. The default value for the threshold is 0 and this will remove any features with zero variance, or in other words where all values are the same."


In [90]:
X = X_train
selector = VarianceThreshold()
print("Original feature shape:", X.iloc[:, 1:].shape)
new_X = selector.fit_transform(X.iloc[:, 1:])
print("Transformed feature shape:", new_X.shape)

Original feature shape: (60694, 387)
Transformed feature shape: (60694, 387)


Looks like there are no variables with zero variance, which is good!

### Narrow down to the top 50 features with SelectKBest

In [91]:
# get k highest scoring variables
fs = SelectKBest(score_func=mutual_info_regression, k="all")

# learn relationship from training data (drop non-numerical data)
fs.fit(X_train.drop(columns=["date"]), y_train.drop(columns=["date"]))

SelectKBest(k='all',
            score_func=<function mutual_info_regression at 0x123292158>)

In [102]:
feature_scores = pd.DataFrame({'variables': X_train.drop(columns=["date"]).columns.tolist(), 
                               'score': fs.scores_})


In [103]:
# top 50 variables
top50 = feature_scores.sort_values(by="score", ascending=False).head(50)["variables"].tolist()

In [105]:
X_train = X_train[["date"] + top50]
X_test = X_test[["date"] + top50]

In [106]:
X_train.columns

Index(['date', 'new_cases_7avg', 'new_cases', '2weeksago_cases_7avg',
       'total_pop', 'white', 'female', 'male', 'age_35_44', 'below_500_pov',
       'age_62over', 'age_45_54', 'below_400_pov', 'housing_units',
       'age_55_59', 'age_60_64', 'age_20_24', 'age_under14', 'age_25_34',
       'age_15_19', 'age_65over', 'below_300_pov', 'below_200_pov',
       'below_185_pov', 'below_150_pov', 'below_125_pov', 'non_white',
       'below_pov', 'female_below_pov', 'male_below_pov', 'below_50_pov',
       'hispanic', 'black', 'cumulative_cases', 'asian',
       'total_adult_hospitalizations', 'other_race', 'native',
       'prev_day_adult_admit_7daysum', 'p_white', 'p_non_white',
       'cumulative_deaths', 'p_age_62over',
       'prev_day_adult_admit_70-79_7daysum',
       'prev_day_adult_admit_60-69_7daysum', 'p_black',
       'prev_day_adult_admit_80+_7daysum', 'retail_rec', 'age_median',
       'p_age_65over', 'prev_day_adult_admit_50-59_7daysum'],
      dtype='object')

In [107]:
X_train.shape

(60694, 51)

### Perform some additional feature selection via Lasso regularization

In [138]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [139]:
sel_ = SelectFromModel(Lasso(alpha=0.2))
sel_.fit(X_train.drop(columns=["date"]), y_train.drop(columns=["date"]))

SelectFromModel(estimator=Lasso(alpha=0.2))

In [140]:
selected_feat = X_train.drop(columns=["date"]).columns[(sel_.get_support())]
print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
      np.sum(sel_.estimator_.coef_ == 0)))

total features: 51
selected features: 15
features with coefficients shrank to zero: 34


In [141]:
selected_feat

Index(['new_cases_7avg', 'new_cases', '2weeksago_cases_7avg', 'white',
       'age_35_44', 'age_45_54', 'cumulative_cases', 'asian',
       'total_adult_hospitalizations', 'other_race',
       'prev_day_adult_admit_7daysum', 'p_white', 'retail_rec', 'p_age_65over',
       'prev_day_adult_admit_50-59_7daysum'],
      dtype='object')

In [153]:
final_features = ["date"] + selected_feat.tolist() 
X_train_final = X_train[final_features]
X_test_final = X_test[final_features]

## Export final train and test sets

In [163]:
X_train_final.to_csv("../Data/Train-Test Set/X_train.csv", index=False)
y_train.to_csv("../Data/Train-Test Set/y_train.csv", index=False)

X_test_final.to_csv("../Data/Train-Test Set/X_test.csv", index=False)
y_test.to_csv("../Data/Train-Test Set/y_test.csv", index=False)