# Summer Analytics Hackathon - 2024

The final submission is `submission.csv`.

The procedure followed is as follows:
- Load the datasets into `train_set`, `test_set` and `train_labels`.
- Preprocess the data by removing the `respondent_id` column and dummy-encoding the categorical columns (`age_group`, `education`, `income_poverty`, `census_msa`).
- The columns `health_insurance`, `employment_occupation` and `employment_industry` had roughly 45-55% missing values, so they were dropped rather than imputed, to avoid introducing incorrect values.
- `race`, `marital_status` and `rent_or_own` were dropped because they contribute little or nothing to the probability of a person getting a vaccine. (The preprocessing code also drops `hhs_geo_region` and `employment_status`.)
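- The remaining columns were imputed by:
  - `median` for the household count columns `household_adults` and `household_children` (the median is robust to outliers)
  - `mode` for every other column, numeric survey answers and categorical columns alike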
- The model used was Logistic Regression, with one model trained per target (`xyz_vaccine` and `seasonal_vaccine`).
- The model was trained on the training set by dividing it into `train` and `validation` sets.
- The model was then used to predict the `test_set`.
- The predictions were saved in `submission.csv`.

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [38]:
# Labelled data: the training features and their matching vaccine labels.
train_set = pd.read_csv('datasets/training_set_features.csv')
train_labels = pd.read_csv('datasets/training_set_labels.csv')

# Unlabelled features that the final submission is predicted for.
test_set = pd.read_csv('datasets/test_set_features.csv')

In [39]:
# Preview the raw training features (pandas elides the middle rows/columns of a large frame).
train_set

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea
26704,26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,
26705,26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg


In [40]:
# Preview the training labels: respondent_id plus the xyz_vaccine and seasonal_vaccine target columns.
train_labels

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0,0
1,1,0,1
2,2,0,0
3,3,0,1
4,4,0,0
...,...,...,...
26702,26702,0,0
26703,26703,0,0
26704,26704,0,1
26705,26705,0,0


In [41]:
# List each column's dtype to see which features are numeric (float64) and which are text (object).
train_set.dtypes

respondent_id                    int64
xyz_concern                    float64
xyz_knowledge                  float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_xyz                float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_xyz_vacc_effective     float64
opinion_xyz_risk               float64
opinion_xyz_sick_from_vacc     float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                      

In [42]:
# Count the missing values in every column of the raw training features.
train_set.isnull().sum()

respondent_id                      0
xyz_concern                       92
xyz_knowledge                    116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_xyz_vacc_effective       391
opinion_xyz_risk                 388
opinion_xyz_sick_from_vacc       395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
m

In [43]:
# Remove the identifier so only the two target columns remain; features and labels
# are paired by row position from here on.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution no longer raises KeyError for the missing column.
train_labels = train_labels.drop(columns=['respondent_id'], errors='ignore')

In [44]:
def preprocess_data(df):
    """Return a cleaned, imputed and dummy-encoded copy of a raw feature frame.

    Steps, in order:
      1. Drop the identifier and the columns excluded from modelling.
      2. Fill missing ``household_adults`` / ``household_children`` with that
         column's median.
      3. Fill every remaining missing value with the column's mode.
      4. Encode ``sex`` as 1 for ``'Male'`` and 0 otherwise.
      5. Dummy-encode ``age_group``, ``education``, ``income_poverty`` and
         ``census_msa`` (first level dropped, float dtype).

    All imputation statistics are computed from ``df`` itself; the function
    reads no notebook-level variables and leaves ``df`` unmodified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw features containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame ready for scaling / modelling.

    Raises
    ------
    KeyError
        If any of the columns to drop or encode is missing from ``df``.
    """
    dropped_cols = [
        'respondent_id', 'employment_occupation', 'employment_industry',
        'hhs_geo_region', 'rent_or_own', 'race', 'marital_status',
        'health_insurance', 'employment_status',
    ]
    median_cols = ['household_adults', 'household_children']
    dummy_cols = ['age_group', 'education', 'income_poverty', 'census_msa']

    # drop() without inplace returns a new frame, so the caller's data is untouched
    # and the function can be called again on the same input.
    out = df.drop(columns=dropped_cols)

    # Assign the filled column back rather than using chained fillna(inplace=True),
    # which is deprecated and has no effect under pandas copy-on-write.
    for col in median_cols:
        out[col] = out[col].fillna(out[col].median())

    for col in out.columns:
        modes = out[col].mode()
        # mode() is empty for an all-NaN column; leave such a column as it is
        # instead of failing on modes[0].
        if not modes.empty:
            out[col] = out[col].fillna(modes.iloc[0])

    out['sex'] = np.where(out['sex'] == 'Male', 1, 0)
    return pd.get_dummies(out, drop_first=True, columns=dummy_cols, dtype=float)

In [45]:
# Re-check the dtypes of the training frame, which has not been preprocessed yet at this point.
train_set.dtypes

respondent_id                    int64
xyz_concern                    float64
xyz_knowledge                  float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_xyz                float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
health_insurance               float64
opinion_xyz_vacc_effective     float64
opinion_xyz_risk               float64
opinion_xyz_sick_from_vacc     float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                            object
sex                      

In [46]:
from sklearn.preprocessing import MinMaxScaler

# Run the same cleaning / encoding on both splits.
train_set = preprocess_data(train_set)
test_set = preprocess_data(test_set)

# Each split is dummy-encoded on its own, so a category that is absent from one of
# them would produce a different set (or order) of columns. Align the test frame to
# the training layout (a dummy missing from the test split becomes 0) so the scaler
# and the models always receive the features they were fitted on. This is a no-op
# when the columns already match.
test_set = test_set.reindex(columns=train_set.columns, fill_value=0.0)

# Fit the scaler on the training features only and reuse it for the test features,
# so no information from the test split leaks into the scaling.
scaler = MinMaxScaler()
train_set = pd.DataFrame(scaler.fit_transform(train_set), columns=train_set.columns)
test_set = pd.DataFrame(scaler.transform(test_set), columns=test_set.columns)

In [47]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_set,
    train_labels,
    test_size=0.2,
    random_state=69,
)

In [48]:
# Vaccine-specific feature views: the xyz view excludes every column whose name
# contains 'seas' (which also covers every name containing 'seasonal'), and the
# seasonal view excludes every column whose name contains 'xyz'.
# NOTE(review): the cells that fit and score the models use X_train / X_test directly,
# so these four frames are currently unused.
X_train_xyz = X_train.loc[:, ~X_train.columns.str.contains('seas')]
X_test_xyz = X_test.loc[:, ~X_test.columns.str.contains('seas')]
X_train_seas = X_train.loc[:, ~X_train.columns.str.contains('xyz')]
X_test_seas = X_test.loc[:, ~X_test.columns.str.contains('xyz')]

In [49]:
from sklearn.linear_model import LogisticRegression

# One independent binary classifier per target, both trained on the full feature set
# of the training split.
log_reg_xyz = LogisticRegression(max_iter=1000, random_state=69)
log_reg_xyz.fit(X_train, y_train['xyz_vaccine'])

log_reg_seas = LogisticRegression(max_iter=1000, random_state=69)
log_reg_seas.fit(X_train, y_train['seasonal_vaccine'])

In [50]:
# Accuracy on the held-out split as an (xyz_vaccine, seasonal_vaccine) pair.
(
    log_reg_xyz.score(X_test, y_test['xyz_vaccine']),
    log_reg_seas.score(X_test, y_test['seasonal_vaccine']),
)

(0.8335829277424186, 0.775926619243729)

In [51]:
from sklearn.metrics import roc_auc_score

# ROC AUC on the held-out split as an (xyz_vaccine, seasonal_vaccine) pair, scored
# from the predicted probability in column 1 of predict_proba.
(
    roc_auc_score(y_test['xyz_vaccine'], log_reg_xyz.predict_proba(X_test)[:, 1]),
    roc_auc_score(y_test['seasonal_vaccine'], log_reg_seas.predict_proba(X_test)[:, 1]),
)

(0.8347507492990627, 0.8502903526976788)

In [52]:
# Refit one classifier per target on every labelled row (training + validation)
# before predicting the unlabelled test set.
log_reg_xyz_final = LogisticRegression(max_iter=1000, random_state=69)
log_reg_xyz_final.fit(train_set, train_labels['xyz_vaccine'])

log_reg_seas_final = LogisticRegression(max_iter=1000, random_state=69)
log_reg_seas_final.fit(train_set, train_labels['seasonal_vaccine'])

# Vaccine-specific views of the test features ('seas' also matches 'seasonal').
# NOTE(review): the predictions below use the full test_set, so these two frames
# are currently unused.
test_set_xyz = test_set.loc[:, ~test_set.columns.str.contains('seas')]
test_set_seas = test_set.loc[:, ~test_set.columns.str.contains('xyz')]

# Keep column 1 of predict_proba, i.e. the probability of the second class.
xyz_vaccine = log_reg_xyz_final.predict_proba(test_set)[:, 1]
seasonal_vaccine = log_reg_seas_final.predict_proba(test_set)[:, 1]

In [53]:
# Assemble the submission table. The id column is generated from the number of
# predictions instead of a hard-coded row count, so it always matches their length.
# NOTE(review): this assumes the test file's respondent_id values are 0..n-1 in row
# order (the real ids were dropped during preprocessing) — confirm against the raw CSV.
ans = pd.DataFrame({
    'respondent_id': np.arange(len(xyz_vaccine)),
    'xyz_vaccine': xyz_vaccine,
    'seasonal_vaccine': seasonal_vaccine,
})
ans

Unnamed: 0,respondent_id,xyz_vaccine,seasonal_vaccine
0,0,0.093692,0.328595
1,1,0.057466,0.058763
2,2,0.464884,0.682178
3,3,0.459198,0.882436
4,4,0.240094,0.525984
...,...,...,...
26703,26703,0.318293,0.496942
26704,26704,0.086785,0.279328
26705,26705,0.125073,0.186816
26706,26706,0.050472,0.326506


In [54]:
# Write the predictions to submission.csv, leaving out the DataFrame's index column.
ans.to_csv('submission.csv', index=False)