### Implement Logistic Regression

In [6]:
import pandas as pd
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [7]:
X_train = pd.read_csv('./data/X_train.csv')
y_train = pd.read_csv('./data/y_train.csv')

X_test = pd.read_csv('./data/X_test.csv')
y_test = pd.read_csv('./data/y_test.csv')

In [8]:
display(X_train.head())
display(y_train.head())
display(X_test.head())
display(y_test.head())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,-0.679436,1.193048,-0.22661,0.614936,-0.272201,0.459586,-0.747788,-0.71345,-1.448623,-0.531568,...,0.465442,-0.827124,0.641301,-0.930622,-0.56272,-0.860782,1.129841,0.202248,-0.837116,-1.385296
1,-0.679436,-0.424715,4.412877,0.614936,-0.272201,0.459586,-0.747788,-0.71345,0.690311,-0.531568,...,0.465442,1.209008,0.641301,-0.930622,-0.56272,-0.860782,1.129841,1.416875,1.559663,0.777763
2,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,-0.747788,-0.71345,0.690311,1.881227,...,-2.784094,-0.827124,-0.786206,-0.930622,1.777084,-0.860782,-0.652099,1.416875,-0.237921,-0.664276
3,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,-0.747788,-0.71345,-1.448623,1.881227,...,0.465442,-0.827124,0.641301,-0.930622,-0.56272,0.794763,0.417065,-1.012379,1.559663,1.210374
4,-1.777982,1.193048,-0.22661,0.614936,-0.272201,0.459586,-0.747788,-0.71345,0.690311,-0.531568,...,0.465442,1.209008,-0.786206,-0.930622,1.777084,-0.860782,1.129841,1.416875,-0.687317,-1.241092


Unnamed: 0,h1n1_vaccine,seasonal_vaccine
0,1,1
1,0,1
2,0,1
3,1,1
4,1,1


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,-0.679436,-2.042478,-0.22661,0.614936,-0.272201,0.459586,1.337277,1.401639,0.690311,-0.531568,...,-1.700915,-0.827124,2.068808,1.074551,1.777084,2.450308,0.060677,1.416875,-1.286512,0.921966
1,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,-0.747788,1.401639,0.690311,1.881227,...,0.465442,-0.827124,0.641301,-0.930622,-0.56272,0.794763,1.129841,-1.012379,0.361274,1.06617
2,1.517658,1.193048,-0.22661,-1.626185,-0.272201,0.459586,-0.747788,-0.71345,0.690311,1.881227,...,0.465442,-0.827124,0.641301,-0.930622,-0.56272,-0.860782,-0.652099,-1.012379,-0.837116,-1.385296
3,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,-0.747788,-0.71345,0.690311,-0.531568,...,0.465442,1.209008,-0.786206,1.074551,-0.56272,0.794763,-1.008487,0.202248,-0.837116,-1.385296
4,1.517658,1.193048,-0.22661,0.614936,-0.272201,0.459586,1.337277,1.401639,0.690311,-0.531568,...,0.465442,-0.827124,-0.786206,-0.930622,1.777084,0.794763,0.773453,1.416875,0.81067,-1.096888


Unnamed: 0,h1n1_vaccine,seasonal_vaccine
0,0,0
1,1,1
2,1,1
3,1,1
4,0,0


In [9]:
X_train.shape

(16024, 35)

In [10]:
X_train, y_train = make_multilabel_classification(n_features=35, 
                                                  n_classes=2, 
                                                  random_state=2)

In [11]:
pipeline = Pipeline([
               ('clf',MultiOutputClassifier(LogisticRegression(penalty='l2',
                                                      dual=False,
                                                      tol=0.0001,
                                                      fit_intercept=True,
                                                      intercept_scaling=1,
                                                      class_weight=None,
                                                      max_iter=900,
                                                      multi_class='auto',
                                                      warm_start=False,
                                                      n_jobs=-1,
                                                      random_state=None,
                                                      l1_ratio=None)
                                  )),
                ])

In [12]:
from sklearn.model_selection import GridSearchCV

grid_values = {
    'clf__estimator__solver':['newton-cg', 'lbfgs', 'sag', 'saga'],
    'clf__estimator__C':[1.0, 10.0, 100.0, 200, 300, 1000.0]
}

grid_model = GridSearchCV(pipeline, param_grid=grid_values)

best_model = grid_model.fit(X_train, y_train)

In [13]:
print('Best solver:', best_model.best_estimator_.get_params()['clf__estimator__solver'])
print('Best C:', best_model.best_estimator_.get_params()['clf__estimator__C'])

Best solver: newton-cg
Best C: 1.0


In [14]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,-0.679436,-2.042478,-0.22661,0.614936,-0.272201,0.459586,1.337277,1.401639,0.690311,-0.531568,...,-1.700915,-0.827124,2.068808,1.074551,1.777084,2.450308,0.060677,1.416875,-1.286512,0.921966
1,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,-0.747788,1.401639,0.690311,1.881227,...,0.465442,-0.827124,0.641301,-0.930622,-0.56272,0.794763,1.129841,-1.012379,0.361274,1.06617
2,1.517658,1.193048,-0.22661,-1.626185,-0.272201,0.459586,-0.747788,-0.71345,0.690311,1.881227,...,0.465442,-0.827124,0.641301,-0.930622,-0.56272,-0.860782,-0.652099,-1.012379,-0.837116,-1.385296
3,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,-0.747788,-0.71345,0.690311,-0.531568,...,0.465442,1.209008,-0.786206,1.074551,-0.56272,0.794763,-1.008487,0.202248,-0.837116,-1.385296
4,1.517658,1.193048,-0.22661,0.614936,-0.272201,0.459586,1.337277,1.401639,0.690311,-0.531568,...,0.465442,-0.827124,-0.786206,-0.930622,1.777084,0.794763,0.773453,1.416875,0.81067,-1.096888


In [19]:
y_preds = best_model.predict(X_test)

In [20]:
print(predictions)

[[1 0]
 [1 0]
 [1 0]
 ...
 [1 0]
 [1 1]
 [0 0]]


In [18]:
#Import roc_curve, auc
from sklearn.metrics import roc_curve, auc

#Calculate the probability scores of each point in the training set
y_train_score = model_log.decision_function(X_train)

# Calculate the fpr, tpr, and thresholds for the training set
train_fpr, train_tpr, thresholds = roc_curve(y_train, y_train_score)

#Calculate the probability scores of each point in the test set
y_test_score = model_log.decision_function(X_test)

#Calculate the fpr, tpr, and thresholds for the test set
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, y_test_score)

NameError: name 'model_log' is not defined

In [23]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_preds)

0.23298698867359355

In [29]:
display(y_test)
display(y_preds)


Unnamed: 0,h1n1_vaccine,seasonal_vaccine
0,0,0
1,1,1
2,1,1
3,1,1
4,0,0
...,...,...
10678,0,0
10679,0,0
10680,0,0
10681,0,0


array([[1, 0],
       [1, 0],
       [1, 0],
       ...,
       [1, 0],
       [1, 1],
       [0, 0]])

In [26]:
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt

# y_test = # true labels
# y_preds = # predicted results
fpr, tpr, thresholds = metrics.roc_curve(np.squeeze(y_test), np.squeeze(y_preds), pos_label=0)

# Print ROC curve
plt.plot(fpr,tpr)
plt.show() 

# Print AUC
auc = np.trapz(tpr,fpr)
print('AUC:', auc)

ValueError: multilabel-indicator format is not supported

In [None]:
y_pred_proba = clf.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": predictions[0:1],
        "seasonal_vaccine": predictions[1:],
    },
    index = y_test.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.value_counts()