In [2]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
import lightgbm as lgbm

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC

In [3]:
train_features = pd.read_csv("data/training_set_features.csv", index_col="respondent_id")
train_labels = pd.read_csv("data/training_set_labels.csv", index_col="respondent_id")
test_features = pd.read_csv("data/test_set_features.csv", index_col="respondent_id")
submission = pd.read_csv("data/submission_format.csv", index_col="respondent_id")
df = train_features.join(train_labels)

### Preparing the data

In [4]:
# Measure of association between h1n1 and seasonal vaccines
train_labels["h1n1_vaccine"].corr(train_labels["seasonal_vaccine"], method="pearson")

0.37714265306144684

In [5]:
num_features = list(train_features.select_dtypes(exclude=["object"]))
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_features = list(train_features.select_dtypes(include=["object"]))
cat_transformer = Pipeline([
    #("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

In [6]:
preprocessor = ColumnTransformer([
    ("num", num_transformer, num_features),
    ("cat", cat_transformer, cat_features),
])

In [7]:
train_features_prepared = preprocessor.fit_transform(train_features)

In [8]:
estimators = MultiOutputClassifier(estimator=LogisticRegression(penalty="l2", C=1))

In [9]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

In [10]:
voting_clf = VotingClassifier(estimators=[("lr", log_clf), ("rf", rnd_clf), ("svc", svm_clf)], voting="hard")

In [11]:
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("votting", voting_clf)
    #("classifier", KNeighborsClassifier()),
    #("estimators", estimators)
])

In [12]:
X_train, X_eval, y_train, y_eval = train_test_split(
    train_features_prepared,
    train_labels,
    test_size=0.2,
    shuffle=True,
    stratify=train_labels,
    random_state=6
)

In [13]:
full_pipeline.fit(X_train, y_train)

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [None]:
print("model score: %.3f" % full_pipeline.score(X_eval, y_eval))

In [None]:
preds = full_pipeline.predict_proba(X_eval)

### Evaluation

In [14]:
def plot_roc(y_true, y_score, label_name, ax):
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    ax.plot(fpr, tpr)
    ax.plot([0, 1], [0, 1], color="grey", linestyle="--")
    ax.set_ylabel("TPR")
    ax.set_xlabel("FPR")
    ax.set_title(f"{label_name}: \n AUC = {roc_auc_score(y_true, y_score):.4f}")

In [None]:
y_preds = pd.DataFrame({
    "h1n1_vaccine": preds[0][:, 1],
    "seasonal_vaccine": preds[1][:, 1]},
    index=y_eval.index)

In [None]:
test_probas = full_pipeline.predict_proba(test_features)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(7, 3.5))

plot_roc(y_eval["h1n1_vaccine"], y_preds["h1n1_vaccine"], "h1n1_vaccine", ax=ax[0])
plot_roc(y_eval["seasonal_vaccine"], y_preds["seasonal_vaccine"], "seasonal_vaccine", ax=ax[1])

fig.tight_layout()

In [None]:
roc_auc_score(y_eval, y_preds)

In [None]:
full_pipeline.fit(train_features, train_labels)

### Prediction

In [None]:
test_probas = full_pipeline.predict_proba(test_features)

In [None]:
np.testing.assert_array_equal(test_features.index.values, submission.index.values)
submission["h1n1_vaccine"] = test_probas[0][:, 1]
submission["seasonal_vaccine"] = test_probas[1][:, 1]
submission.to_csv("prediction/submission.csv", index=True)