In [179]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [180]:
 training_set_features = pd.read_csv('/content/training_set_features.csv')
 training_set_labels = pd.read_csv('/content/training_set_labels.csv')
test_set_features = pd.read_csv('/content/test_set_features.csv')

In [181]:
training_set_features.drop(['employment_industry','employment_occupation'], axis=1,inplace=True)

In [182]:
test_set_features.drop(['employment_industry','employment_occupation'], axis=1,inplace=True)

In [183]:
X=training_set_features.iloc[:,1:]
y_xyz=training_set_labels.iloc[:,1]#xyz_vaccine
y_seasonal=training_set_labels.iloc[:,2]#seasonal_vaccine

In [184]:
X_testset=test_set_features.iloc[:,1:]

In [185]:
ordinal_features = [ 'age_group','education', 'income_poverty']
nominal_features = [ 'race', 'sex', 'marital_status',
                    'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa']
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features = [feature for feature in numeric_features if feature not in ordinal_features]

In [186]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

In [187]:
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder())
])

In [188]:
nominal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [189]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('ord', ordinal_transformer, ordinal_features),
        ('nom', nominal_transformer, nominal_features)])

In [190]:
X_preprocessed=preprocessor.fit_transform(X)

In [191]:
X_train_xyz,X_test_xyz,y_train_xyz,y_test_xyz=train_test_split(X_preprocessed,y_xyz,test_size=0.25,random_state=42)

In [192]:
lr1=LogisticRegression()
lr1.fit(X_train_xyz,y_train_xyz)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [193]:
pred_prob1 = lr1.predict_proba(X_test_xyz)

In [194]:
auc_score_xyz = roc_auc_score(y_test_xyz, pred_prob1[:,1])
print(auc_score_xyz)

0.8297525162945333


In [195]:
y_testset_xyz=lr1.predict(X_testset_preprocessed)

In [196]:
X_testset_preprocessed=preprocessor.transform(X_testset)

In [197]:
X_train_sea,X_test_sea,y_train_sea,y_test_sea=train_test_split(X_preprocessed,y_seasonal,test_size=0.25,random_state=42)

In [198]:
lr2=LogisticRegression()
lr2.fit(X_train_sea,y_train_sea)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [199]:
pred_prob2= lr2.predict_proba(X_test_sea)

In [200]:
auc_score_sea = roc_auc_score(y_test_sea, pred_prob2[:,1])
print(auc_score_sea)

0.8522750681797916


In [201]:
y_testset_seasonal=lr2.predict(X_testset_preprocessed)

In [202]:
result = pd.DataFrame({'respondent_id':test_set_features.iloc[:,0],'xyz_vaccine': y_testset_xyz, 'seasonal_vaccine':y_testset_seasonal })

In [203]:
result.to_csv('results.csv', index=False)