In [88]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt

# Default size (width, height) applied to every matplotlib figure in this notebook
plt.rcParams.update({'figure.figsize': (20, 10)})

In [138]:
def get_coefficients(X_encoded, model):
    """Pair each encoded feature with its fitted coefficient.

    Parameters
    ----------
    X_encoded : object exposing ``.columns``
        The encoded design matrix; its column labels become the 'Feature' column.
    model : object exposing ``.coef_``
        A fitted estimator. Only the first row of ``coef_`` (``coef_[0]``) is read.

    Returns
    -------
    pandas.DataFrame
        Columns 'Feature' and 'Coefficient', sorted by 'Coefficient' from
        largest to smallest. The original positional index is kept.
    """
    table = pd.DataFrame(
        {
            'Feature': X_encoded.columns,    # names after one-hot encoding
            'Coefficient': model.coef_[0],   # first (only, for binary targets) row of weights
        }
    )
    return table.sort_values(by='Coefficient', ascending=False)

In [4]:
# Load in data: training features, training labels, and the unlabeled test features
train_features, train_labels, test_features = map(
    pd.read_csv,
    (
        'data/training_set_features.csv',
        'data/training_set_labels.csv',
        'data/test_set_features.csv',
    ),
)

In [125]:
# Merge training features and labels on respondent_id, index the result by
# respondent, and drop the H1N1 label so 'seasonal_vaccine' is the only label left.
train_data = (
    train_features
    .merge(train_labels, on='respondent_id')
    .set_index('respondent_id')
    .drop(columns=['h1n1_vaccine'])
)

In [126]:
# Examine possible feature choices.
# Note: train_data still contains the target column ('seasonal_vaccine'), so it
# shows up in this listing alongside the candidate predictors.
possible_features = train_data.columns
print(possible_features)

Index(['h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation', 'seasonal_vaccine'],
      dtype='object')


In [127]:
# Choose features: a hand-picked subset of the columns printed above; these are the
# predictors used to build X for the baseline model.
# NOTE(review): the target is the seasonal vaccine, yet the H1N1 opinion column is
# selected while 'opinion_seas_vacc_effective' is also available — confirm this is intended.
chosen_features = [
    'doctor_recc_seasonal',
    'behavioral_avoidance', 
    'behavioral_face_mask', 
    'behavioral_wash_hands',
    'behavioral_large_gatherings',
    'behavioral_outside_home',
    'behavioral_touch_face',
    'health_worker',
    'health_insurance',
    'opinion_h1n1_vacc_effective',
    'education',
    'rent_or_own',
    'sex'
]

In [128]:
# Split the merged frame into the label (y) and the chosen predictors (X)
target = 'seasonal_vaccine'
y = train_data[target]
X = train_data.drop(columns=[target])[chosen_features]

# Share of missing values in each chosen feature
percent_null = X.isna().sum().div(X.shape[0])

# Show only the features where more than 30% of the values are missing
percent_null[percent_null > 0.3]

health_insurance    0.45958
dtype: float64

In [129]:
# Health insurance data is usually available when one has it, so we will assume missing insurance data means no insurance.
# The filled column is assigned back rather than calling fillna(inplace=True) on
# X['health_insurance']: the in-place form operates on a temporary Series, warns on
# recent pandas, and leaves X unchanged when Copy-on-Write is enabled.
X = X.assign(health_insurance=X['health_insurance'].fillna(0))

# Train-test-split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# Since all of our features are categorical features, we will fill the nulls with the mode of each feature
from sklearn.impute import SimpleImputer

def simple_impute(*Xs):
    """Fill missing values with the most frequent value of each column.

    The imputer is fitted once, on the first frame passed (expected to be the
    training split), and that same fitted imputer is then applied to every
    frame. This way held-out frames are filled with the training modes rather
    than with statistics computed from their own rows.

    Parameters
    ----------
    *Xs : pandas.DataFrame
        One or more frames with the same columns; the first one is the fit
        reference.

    Returns
    -------
    tuple of pandas.DataFrame
        One imputed frame per input, in the same order, each keeping the
        column labels and row index of its input. Empty tuple if no frames
        are given.
    """
    if not Xs:
        return ()

    imputer = SimpleImputer(strategy='most_frequent')
    imputer.fit(Xs[0])  # learn the per-column modes from the first (training) frame only

    return tuple(
        # Keep the original index so rows stay aligned with their labels.
        pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)
        for X in Xs
    )

# Fill the remaining nulls in both splits with simple_impute (most-frequent value per column).
X_train_imputed, X_test_imputed = simple_impute(X_train, X_test)

# Now that the nulls have been filled, we can one-hot encode X

# Define one hot encode function
def ohe(*Xs, features):
    """One-hot encode the given columns in every frame, with aligned outputs.

    Each frame is encoded with ``pd.get_dummies``. The first frame (expected to
    be the training split) defines the reference column layout; every later
    frame is reindexed to that layout so all outputs share identical columns in
    identical order. Dummy columns missing from a later frame are added and
    filled with 0; dummy columns that only exist in a later frame (categories
    unseen in the first frame) are dropped.

    Parameters
    ----------
    *Xs : pandas.DataFrame
        Frames to encode; the first one is the column reference.
    features : list of str
        Names of the columns to replace with dummy indicators (keyword-only).

    Returns
    -------
    tuple of pandas.DataFrame
        One encoded frame per input, in the same order. Empty tuple if no
        frames are given.
    """
    encoded = []
    reference_columns = None
    for X in Xs:
        dummies = pd.get_dummies(X, columns=features)
        if reference_columns is None:
            # The first frame fixes the column layout a fitted model will expect.
            reference_columns = dummies.columns
        else:
            dummies = dummies.reindex(columns=reference_columns, fill_value=0)
        encoded.append(dummies)
    return tuple(encoded)

# Columns of the un-imputed X whose dtype is object are the ones to dummy-encode
categoricals = [name for name, dtype in X.dtypes.items() if dtype == 'object']

X_train_encoded, X_test_encoded = ohe(
    X_train_imputed,
    X_test_imputed,
    features=categoricals,
)

In [145]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Baseline: logistic regression on the encoded chosen features (fit returns the estimator)
baseline_model = LogisticRegression(random_state=42).fit(X_train_encoded, y_train)
baseline_pred = baseline_model.predict(X_test_encoded)

# Score the held-out predictions with both metrics
baseline_acc, baseline_f1 = (
    metric(y_test, baseline_pred) for metric in (accuracy_score, f1_score)
)

print(f'Accuracy of baseline model: {baseline_acc:,.4f}')
print(f'F1 score of baseline model: {baseline_f1:,.4f}')

Accuracy of baseline model: 0.7050
F1 score of baseline model: 0.6413


In [146]:
# Coefficient table for the baseline model, largest coefficient first
baseline_model_coefs = get_coefficients(X_train_encoded, baseline_model)

print('Top 5 Determinants of Getting the Seasonal Flu Vaccine')
# The five largest (most positive) coefficients
baseline_model_coefs.head(5)

Top 5 Determinants of Getting the Seasonal Flu Vaccine


Unnamed: 0,Feature,Coefficient
0,doctor_recc_seasonal,1.546684
7,health_worker,0.641753
8,health_insurance,0.427955
9,opinion_h1n1_vacc_effective,0.350735
14,rent_or_own_Own,0.260141


In [134]:
# "af" variant: the same steps as the baseline, but every column of train_data
# except the target is used as a predictor.
X_af = train_data.drop(columns=[target])
# Assign the filled column back rather than calling fillna(inplace=True) on
# X_af['health_insurance']: the in-place form operates on a temporary Series, warns on
# recent pandas, and leaves X_af unchanged when Copy-on-Write is enabled.
X_af = X_af.assign(health_insurance=X_af['health_insurance'].fillna(0))
# Same y, test_size and random_state as the baseline split; note this rebinds y_train / y_test.
X_train_af, X_test_af, y_train, y_test = train_test_split(X_af, y, test_size=.2, random_state=42)
X_train_af_imputed, X_test_af_imputed = simple_impute(X_train_af, X_test_af)
# Object-dtype columns of the un-imputed frame are the ones to dummy-encode
categoricals_af = [f for f in X_af.columns if X_af[f].dtype == 'object']
X_train_af_encoded, X_test_af_encoded = ohe(X_train_af_imputed, X_test_af_imputed, features=categoricals_af)

# max_iter is raised to 1000 (scikit-learn's default is 100), presumably so the
# solver converges on the much wider feature matrix — TODO confirm.
model_af = LogisticRegression(random_state=42, max_iter=1000)
model_af.fit(X_train_af_encoded, y_train)
model_af_pred = model_af.predict(X_test_af_encoded)

model_af_acc = accuracy_score(y_test, model_af_pred)
model_af_f1 = f1_score(y_test, model_af_pred)

print(f'Accuracy of af model: {model_af_acc:,.4f}')
print(f'F1 score of af model: {model_af_f1:,.4f}')

Accuracy of af model: 0.7870
F1 score of af model: 0.7636


In [144]:
# Coefficient table for the af model, largest coefficient first
model_af_coefs = get_coefficients(X_train_af_encoded, model_af)

print('Top 5 Determinants of Getting the Seasonal Flu Vaccine')
# The five largest (most positive) coefficients
model_af_coefs.head(5)

Top 5 Determinants of Getting the Seasonal Flu Vaccine


Unnamed: 0,Feature,Coefficient
10,doctor_recc_seasonal,1.454063
66,employment_industry_haxffmxo,1.441403
85,employment_occupation_dcjcmpih,1.441403
27,age_group_65+ Years,0.942953
70,employment_industry_msuufmds,0.652543


In [102]:
# Peek at the raw employment_industry column
train_data['employment_industry']

respondent_id
0             NaN
1        pxcmvdjn
2        rucpziij
3             NaN
4        wxleyezf
           ...   
26702         NaN
26703    fcxhlnwr
26704         NaN
26705    fcxhlnwr
26706         NaN
Name: employment_industry, Length: 26707, dtype: object

In [73]:
from sklearn.ensemble import RandomForestClassifier

# Random forest trained on the same encoded chosen-feature matrices as the baseline
# logistic regression, for comparison.
model_2 = RandomForestClassifier(n_estimators=100, random_state=42)
model_2.fit(X_train_encoded, y_train)
model_2_pred = model_2.predict(X_test_encoded)

model_2_acc = accuracy_score(y_test, model_2_pred)
model_2_f1 = f1_score(y_test, model_2_pred)

# Labels name the random forest; they previously said "baseline model" (copy-paste
# from the logistic-regression cell), which mislabeled these scores.
print(f'Accuracy of random forest model: {model_2_acc:,.4f}')
print(f'F1 score of random forest model: {model_2_f1:,.4f}')

Accuracy of baseline model: 0.6001
F1 score of baseline model: 0.5459
