### Implement RandomForest

In [4]:
import pandas as pd
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

In [5]:
X_train = pd.read_csv('./data/X_train.csv')
y_train = pd.read_csv('./data/y_train.csv')

X_test = pd.read_csv('./data/X_test.csv')
y_test = pd.read_csv('./data/y_test.csv')

In [6]:
X_train = X_train.iloc[:,1:]

In [7]:
X_test = X_test.iloc[:,1:]

In [8]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,-0.679436,1.193048,-0.22661,0.614936,-0.272201,0.459586,-0.747788,-0.71345,-1.448623,-0.531568,...,-0.827124,1.155399,0.641301,-0.930622,-0.56272,-0.860782,1.129841,0.202248,-0.837116,-1.385296
1,-0.679436,-0.424715,4.412877,0.614936,-0.272201,0.459586,-0.747788,-0.71345,0.690311,-0.531568,...,1.209008,0.247673,0.641301,-0.930622,-0.56272,-0.860782,1.129841,1.416875,1.559663,0.777763
2,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,-0.747788,-0.71345,0.690311,1.881227,...,-0.827124,-1.567779,-0.786206,-0.930622,1.777084,-0.860782,-0.652099,1.416875,-0.237921,-0.664276
3,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,-0.747788,-0.71345,-1.448623,1.881227,...,-0.827124,0.247673,0.641301,-0.930622,-0.56272,0.794763,0.417065,-1.012379,1.559663,1.210374
4,-1.777982,1.193048,-0.22661,0.614936,-0.272201,0.459586,-0.747788,-0.71345,0.690311,-0.531568,...,1.209008,1.155399,-0.786206,-0.930622,1.777084,-0.860782,1.129841,1.416875,-0.687317,-1.241092


In [9]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,-0.679436,-2.042478,-0.22661,0.614936,-0.272201,0.459586,1.337277,1.401639,0.690311,-0.531568,...,-0.827124,-1.567779,2.068808,1.074551,1.777084,2.450308,0.060677,1.416875,-1.286512,0.921966
1,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,-0.747788,1.401639,0.690311,1.881227,...,-0.827124,1.155399,0.641301,-0.930622,-0.56272,0.794763,1.129841,-1.012379,0.361274,1.06617
2,1.517658,1.193048,-0.22661,-1.626185,-0.272201,0.459586,-0.747788,-0.71345,0.690311,1.881227,...,-0.827124,1.155399,0.641301,-0.930622,-0.56272,-0.860782,-0.652099,-1.012379,-0.837116,-1.385296
3,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,-0.747788,-0.71345,0.690311,-0.531568,...,1.209008,0.247673,-0.786206,1.074551,-0.56272,0.794763,-1.008487,0.202248,-0.837116,-1.385296
4,1.517658,1.193048,-0.22661,0.614936,-0.272201,0.459586,1.337277,1.401639,0.690311,-0.531568,...,-0.827124,0.247673,-0.786206,-0.930622,1.777084,0.794763,0.773453,1.416875,0.81067,-1.096888


In [10]:
y_train.head()

Unnamed: 0,h1n1_vaccine,seasonal_vaccine
0,1,1
1,0,1
2,0,1
3,1,1
4,1,1


In [11]:
y_test.head()

Unnamed: 0,h1n1_vaccine,seasonal_vaccine
0,0,0
1,1,1
2,1,1
3,1,1
4,0,0


In [12]:
X_train.shape

(16024, 37)

In [13]:
X_train, y_train = make_multilabel_classification(n_features=37, 
                                                  n_classes=2, 
                                                  random_state=2)

In [14]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
               ('clf',MultiOutputClassifier(RandomForestClassifier(
                                                random_state=20,
                                                n_estimators=200,
                                                max_depth=15
                                            )
                                  )),
                ])

In [None]:
from sklearn.model_selection import GridSearchCV

grid_values = {
    'clf__estimator__bootstrap': [True],
    'clf__estimator__max_depth': [80, 90, 100, 110],
    'clf__estimator__max_features': [2, 3],
    'clf__estimator__min_samples_leaf': [3, 4, 5],
    'clf__estimator__min_samples_split': [8, 10, 12],
    'clf__estimator__n_estimators': [100, 200, 300, 1000]
}

grid_model = GridSearchCV(pipeline, param_grid=grid_values)

best_model = grid_model.fit(X_train, y_train)

In [17]:
print('Best max_depth:', best_model.best_estimator_.get_params()['clf__estimator__max_depth'])
print('Best max_features:', best_model.best_estimator_.get_params()['clf__estimator__max_features'])
print('Best min_samples_leaf:', best_model.best_estimator_.get_params()['clf__estimator__min_samples_leaf'])
print('Best min_samples_split:', best_model.best_estimator_.get_params()['clf__estimator__min_samples_split'])
print('Best n_estimators:', best_model.best_estimator_.get_params()['clf__estimator__n_estimators'])

Best max_depth: 80
Best max_features: 3
Best min_samples_leaf: 3
Best min_samples_split: 8
Best n_estimators: 300


In [18]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27,28,29,30,31,32,33,34,35,36
0,-0.679436,-2.042478,-0.22661,0.614936,-0.272201,0.459586,1.337277,1.401639,0.690311,-0.531568,...,-0.827124,-1.567779,2.068808,1.074551,1.777084,2.450308,0.060677,1.416875,-1.286512,0.921966
1,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,-0.747788,1.401639,0.690311,1.881227,...,-0.827124,1.155399,0.641301,-0.930622,-0.56272,0.794763,1.129841,-1.012379,0.361274,1.06617
2,1.517658,1.193048,-0.22661,-1.626185,-0.272201,0.459586,-0.747788,-0.71345,0.690311,1.881227,...,-0.827124,1.155399,0.641301,-0.930622,-0.56272,-0.860782,-0.652099,-1.012379,-0.837116,-1.385296
3,0.419111,-0.424715,-0.22661,0.614936,-0.272201,0.459586,-0.747788,-0.71345,0.690311,-0.531568,...,1.209008,0.247673,-0.786206,1.074551,-0.56272,0.794763,-1.008487,0.202248,-0.837116,-1.385296
4,1.517658,1.193048,-0.22661,0.614936,-0.272201,0.459586,1.337277,1.401639,0.690311,-0.531568,...,-0.827124,0.247673,-0.786206,-0.930622,1.777084,0.794763,0.773453,1.416875,0.81067,-1.096888


In [19]:
predictions = best_model.predict(X_test)

In [20]:
print(predictions)

[[1 0]
 [1 0]
 [1 0]
 ...
 [1 0]
 [1 0]
 [1 0]]


In [21]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,predictions)

0.04081250585041655

y_preds = pd.DataFrame(
    {
        "h1n1_vaccine": predictions[0:1],
        "seasonal_vaccine": predictions[1:],
    },
    index = y_test.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.value_counts()