#### Use the make_classification() function to create a synthetic binary classification problem with 1,000 examples and 20 input features. Use this synthetic dataset to build a classification model using Random forest classifier. Evaluate your model using stratified cross fold validation. 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score

In [2]:
X, y = make_classification(n_samples=100, n_features=20, random_state=42)

In [3]:
np.bincount(y)

array([50, 50])

In [4]:
X_train, X_test, y_train, y_test  = train_test_split(X, y, test_size=0.2,
                                                     random_state=42)

In [5]:
pca = PCA(n_components=17, random_state=42)

X_train = pca.fit_transform(X_train)
pca.explained_variance_ratio_.sum()

0.9825652811186271

In [6]:
rfc = RandomForestClassifier(random_state=42, n_jobs=-1)
params = {'n_estimators':[200, 400],'max_features': [13,17]}

gs = GridSearchCV(estimator=rfc, param_grid=params, scoring='f1_micro', cv=4)

In [7]:
gs.fit(X_train, y_train)

GridSearchCV(cv=4, estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             param_grid={'max_features': [13, 17], 'n_estimators': [200, 400]},
             scoring='f1_micro')

In [8]:
print(gs.best_params_)
print(gs.best_estimator_)
print(gs.best_score_)

{'max_features': 13, 'n_estimators': 200}
RandomForestClassifier(max_features=13, n_estimators=200, n_jobs=-1,
                       random_state=42)
0.8875


In [9]:
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

f1_kf = cross_val_score(gs, X_train, y_train, cv=skf, scoring='f1')
acc_kf = cross_val_score(gs, X_train, y_train, cv=skf,scoring='accuracy')

In [10]:
print('KFold on Train set')
print('F1 Score {0:.4f}'.format(f1_kf.mean()))
print('Accuracy Score {0:.4f}'.format(acc_kf.mean()))

KFold on Train set
F1 Score 0.9534
Accuracy Score 0.9500


In [11]:
X_test = pca.fit_transform(X_test)
y_pred = gs.predict(X_test)
print('Test set')
print('F1 Score {0:.4f}'.format(f1_score(y_test, y_pred)))
print('Accuracy Score {0:.4f}'.format(accuracy_score(y_test, y_pred)))

Test set
F1 Score 0.6667
Accuracy Score 0.7000


### Observing:
* F1 Score as 0.9435 and 6667 on StratifiedKFold(train set) and Test set respectively.
* Accuracy Score as 0.9375 and 0.7000 on StratifiedKFold(train set) and Test set respectively.