In [196]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [197]:
 # Folder that holds the three input CSVs. Change this one constant to run the
 # notebook elsewhere; the default reproduces the original hard-coded location.
 DATA_DIR = '/content'

 # Raw inputs: training features, training labels, and the features to score.
 training_set_features = pd.read_csv(f'{DATA_DIR}/training_set_features.csv')
 training_set_labels = pd.read_csv(f'{DATA_DIR}/training_set_labels.csv')
 test_set_features = pd.read_csv(f'{DATA_DIR}/test_set_features.csv')

In [198]:
# Remove the two employment columns from the training features.
# The frame is rebound (no inplace mutation) and missing labels are ignored, so
# re-running this cell after the columns are already gone does not raise KeyError.
training_set_features = training_set_features.drop(
    columns=['employment_industry', 'employment_occupation'],
    errors='ignore',
)

In [199]:
# Remove the two employment columns from the test features.
# The frame is rebound (no inplace mutation) and missing labels are ignored, so
# re-running this cell after the columns are already gone does not raise KeyError.
test_set_features = test_set_features.drop(
    columns=['employment_industry', 'employment_occupation'],
    errors='ignore',
)

In [200]:
# Model inputs: every training-feature column except the first one.
# NOTE(review): column 0 is presumably the respondent id — confirm against the CSV.
X = training_set_features.iloc[:, 1:]

# Targets are picked by position from the labels frame.
y_xyz = training_set_labels.iloc[:, 1]       # xyz_vaccine
y_seasonal = training_set_labels.iloc[:, 2]  # seasonal_vaccine

In [201]:
# Test-set inputs: every column except the first one (column 0 is later written
# out as 'respondent_id' when the results frame is built).
X_testset = test_set_features.iloc[:, 1:]

In [202]:
# Categorical columns, grouped by the encoder each group is routed to.
ordinal_features = ['age_group', 'education', 'income_poverty']
nominal_features = [
    'race', 'sex', 'marital_status', 'rent_or_own',
    'employment_status', 'hhs_geo_region', 'census_msa',
]

# Numeric columns: every int64/float64 column of X that is not listed as ordinal.
numeric_features = [
    column
    for column in X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if column not in ordinal_features
]

In [203]:
# Numeric columns: fill missing entries with the column's most frequent value.
# This pipeline has no other step (in particular, no scaling).
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

In [204]:
# Ordinal columns: fill missing entries with the most frequent category, then
# replace each category with an integer code.
# NOTE(review): OrdinalEncoder is used with its default settings, so the order of
# the integer codes is whatever the encoder derives from the data — confirm it
# matches the intended ranking of these features.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder()),
])

In [205]:
# Nominal columns: missing entries become their own 'missing' category, then the
# column is one-hot encoded. handle_unknown='ignore' lets transform() accept
# categories that were not present when the encoder was fitted.
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [206]:
# Route each column group through its own sub-pipeline.
# Each entry is (name, transformer, columns).
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('ord', ordinal_transformer, ordinal_features),
    ('nom', nominal_transformer, nominal_features),
])

In [207]:
# Fit the preprocessor on the full training feature frame and encode it.
# NOTE(review): this fit runs before the train/validation split, so the held-out
# rows also contribute to the fitted imputation values and encoder categories;
# the validation AUC may therefore be slightly optimistic.
X_preprocessed = preprocessor.fit_transform(X)

In [208]:
# Encode the test features with the already-fitted preprocessor
# (transform only, so nothing is re-learned from the test set).
X_testset_preprocessed = preprocessor.transform(X_testset)

In [209]:
# Hold out 25% of the rows to validate the xyz_vaccine model; the fixed seed
# makes the split repeatable.
(X_train_xyz, X_test_xyz,
 y_train_xyz, y_test_xyz) = train_test_split(
    X_preprocessed,
    y_xyz,
    test_size=0.25,
    random_state=42,
)

In [210]:
# Logistic regression for the xyz_vaccine target, with the iteration cap set to 1000.
lr1 = LogisticRegression(max_iter=1000)
# Kept as the cell's last expression so the fitted estimator is displayed.
lr1.fit(X_train_xyz, y_train_xyz)

In [211]:
# Class probabilities for the held-out xyz_vaccine rows (one column per class).
pred_prob1 = lr1.predict_proba(X_test_xyz)

In [212]:
# ROC AUC on the hold-out set, scored with the second probability column
# (presumably the positive class — verify against lr1.classes_).
auc_score_xyz = roc_auc_score(
    y_test_xyz,
    pred_prob1[:, 1],
)
print(auc_score_xyz)

0.8297326596381259


In [213]:
# xyz_vaccine class probabilities for the preprocessed test set.
prob_testset_xyz = lr1.predict_proba(X_testset_preprocessed)

In [214]:
# Hold out 25% of the rows to validate the seasonal_vaccine model; the fixed
# seed makes the split repeatable.
(X_train_sea, X_test_sea,
 y_train_sea, y_test_sea) = train_test_split(
    X_preprocessed,
    y_seasonal,
    test_size=0.25,
    random_state=42,
)

In [215]:
# Logistic regression for the seasonal_vaccine target, with the iteration cap set to 1000.
lr2 = LogisticRegression(max_iter=1000)
# Kept as the cell's last expression so the fitted estimator is displayed.
lr2.fit(X_train_sea, y_train_sea)

In [216]:
# Class probabilities for the held-out seasonal_vaccine rows (one column per class).
pred_prob2 = lr2.predict_proba(X_test_sea)

In [217]:
# ROC AUC on the hold-out set, scored with the second probability column
# (presumably the positive class — verify against lr2.classes_).
auc_score_sea = roc_auc_score(
    y_test_sea,
    pred_prob2[:, 1],
)
print(auc_score_sea)

0.8522735308676896


In [218]:
# seasonal_vaccine class probabilities for the preprocessed test set.
prob_testset_seasonal = lr2.predict_proba(X_testset_preprocessed)

In [219]:
# Results table: the first column of the raw test features as the id, plus the
# second probability column from each model.
result = pd.DataFrame(
    {
        'respondent_id': test_set_features.iloc[:, 0],
        'xyz_vaccine_received': prob_testset_xyz[:, 1],
        'seasonal_vaccine_received': prob_testset_seasonal[:, 1],
    }
)

In [220]:
# Write the predictions to the working directory, without the DataFrame index column.
result.to_csv(
    'results.csv',
    index=False,
)