In [None]:

import numpy as np 
import pandas as pd 
from sklearn import datasets, ensemble, model_selection
import matplotlib.pyplot as plt
import pandas_profiling as pp
import seaborn as sns

In [None]:
# Load scikit-learn's bundled breast-cancer dataset. `data` is reused by later cells
# for its description, feature names, feature matrix and target vector.
data=datasets.load_breast_cancer()

In [None]:
# Print the dataset's built-in description text.
print(data.DESCR)

In [None]:
# Show the feature names; these are also used as the DataFrame column labels.
data.feature_names

In [None]:
# x is the feature matrix and y the target vector; both feed the train/validation split.
x,y=data.data,data.target

In [None]:
# Tabular view of the dataset: one column per feature plus the label in 'Target'.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(Target=y)

In [None]:
# Summary statistics for the numeric columns of the frame.
df.describe()

In [None]:
# Build a pandas_profiling report for the whole DataFrame; as the cell's last
# expression it is shown as the cell output.
pp.ProfileReport(df)

In [None]:
# Pairwise plots restricted to the first five columns of the frame.
cols = df.columns[:5].tolist()
sns.pairplot(df.loc[:, cols])


In [None]:
# Pair plot of the first five columns, coloured by the 'Target' label.
sns.pairplot(df,hue='Target',vars=df.columns[:5])

In [None]:
# Hold out 30% of the samples for validation; the fixed seed keeps the split reproducible.
xtr, xval, ytr, yval = model_selection.train_test_split(
    x, y, test_size=0.3, random_state=11
)

In [None]:
# Baseline forest: 10 trees with out-of-bag scoring enabled and a fixed seed.
# The module is imported as `ensemble` (no `ens` alias exists), so the original
# `ens.RandomForestClassifier` raised NameError on a fresh kernel.
model = ensemble.RandomForestClassifier(n_estimators=10, oob_score=True, random_state=25)

In [None]:
# Fit on the training split, then score the model on that same training data.
model.fit(xtr,ytr)
model.score(xtr,ytr)

In [None]:
# Out-of-bag score of the fitted forest (requires oob_score=True at construction).
model.oob_score_

In [None]:
# Score on the held-out validation split.
model.score(xval,yval)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Predict the validation split and tabulate true labels against predicted labels.
# `pred` is reused by the confusion-matrix plot further down.
pred=model.predict(xval)
confusion_matrix(yval,pred)

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes,
                          title=None,
                          cmap=plt.cm.Blues,
                          normalize=False):
    """
    Plot the confusion matrix of `y_pred` against `y_true` as an annotated heat map.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    classes : sequence of str
        Tick labels used for both axes, in the row/column order of the matrix.
    title : str, optional
        Title of the plot.
    cmap : matplotlib colormap, optional
        Colormap used to shade the cells.
    normalize : bool, optional
        If True, every row is divided by its total so each cell shows the
        fraction of that true class. If False (the default), raw counts are shown.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows with no true samples stay at 0 instead of producing NaN/inf.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # Show one tick per class and label the ticks with the class names.
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the x tick labels so longer class names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; switch to white text on dark cells for contrast.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
# Tick labels are given in class-index order. NOTE(review): this assumes target 0 is
# malignant and target 1 is benign — confirm against data.target_names.
plot_confusion_matrix(yval,pred,['Malignant','Benign'])

In [None]:
# Out-of-bag score of the current model, shown again as a reference point for the
# experiments that follow.
model.oob_score_

What would happen if we increase the number of trees?

In [None]:

# Refit with 30 trees; fit() returns the estimator itself, so it can be chained.
model = ensemble.RandomForestClassifier(
    n_estimators=30, oob_score=True, random_state=25
).fit(xtr, ytr)
# (training score, out-of-bag score, validation score)
model.score(xtr, ytr), model.oob_score_, model.score(xval, yval)

What would happen if we set a value for max_features? By default it is set to 'auto', which uses the square root of the total number of features (newer scikit-learn releases call this default 'sqrt').

In [None]:
# 10 trees, each split considering 5 candidate features.
model = ensemble.RandomForestClassifier(
    n_estimators=10, max_features=5, oob_score=True, random_state=25
).fit(xtr, ytr)
# (training score, out-of-bag score, validation score)
model.score(xtr, ytr), model.oob_score_, model.score(xval, yval)

Let's change other parameters in the model. This time we set max_depth to 3. If this value is not set, each tree keeps growing until all leaves are pure or contain fewer than min_samples_split samples.

In [None]:
# 10 shallow trees: depth capped at 3.
model = ensemble.RandomForestClassifier(
    n_estimators=10, max_depth=3, oob_score=True, random_state=25
).fit(xtr, ytr)
# (training score, out-of-bag score, validation score)
model.score(xtr, ytr), model.oob_score_, model.score(xval, yval)

What if we use max_depth of 3 and more trees?

In [None]:
# Same depth cap of 3, but with 50 trees.
model = ensemble.RandomForestClassifier(
    n_estimators=50, max_depth=3, oob_score=True, random_state=25
).fit(xtr, ytr)
# (training score, out-of-bag score, validation score)
model.score(xtr, ytr), model.oob_score_, model.score(xval, yval)

# Hyperparameter Optimization
As you can see, the performance of the model relies on the values of the model parameters. How do we know what values we should choose? The parameters that define the model are called hyperparameters. To find the best values we need to perform hyperparameter optimization. There are various methods for hyperparameter optimization. The one we are using here is called grid search. Basically, we choose a few values for some of the parameters and try every combination. Obviously, this method is not practical if we have many hyperparameters and a wide range of values for each.

In [None]:
# Candidate values to try: 4 x 4 x 3 = 48 combinations, each scored with 4-fold CV.
params = {
    'max_features': [4, 5, 6, 7],
    'max_depth': [2, 3, 4, 5],
    'n_estimators': [20, 50, 100],
}
grid_search = model_selection.GridSearchCV(
    estimator=ensemble.RandomForestClassifier(random_state=21, oob_score=True),
    param_grid=params,
    cv=4,
)

So, we created a dictionary containing the parameters we want to set and the values we want to try. We passed the model we want to train, as well as the parameter values we want to try and cv! What is cv? Cross Validation.
Cross validation splits the data into n folds. The model is then trained on n-1 folds and tested (validated) on the fold that was held out. We repeat this n times (once for each fold) and take the average of the validation scores. This is a useful way to make sure that a good score wasn't obtained just by chance.
We are using cv=4 here, which means we are splitting the data into 4 folds.


In [None]:
# Depending on the number of parameter combinations, this may take some time to run.
grid_search.fit(xtr,ytr)

In [None]:
# Parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Use the estimator chosen by the grid search and report the same three scores as
# the earlier experiments: (training, out-of-bag, validation).
model=grid_search.best_estimator_
model.score(xtr,ytr),model.oob_score_,model.score(xval,yval)

In [None]:
# Bar chart of the tuned forest's feature importances, one bar per feature.
# Uses the explicit fig/ax interface and labels the figure so it stands on its own.
fig, ax = plt.subplots()
ax.bar(data.feature_names, model.feature_importances_)
ax.set(xlabel='Feature',
       ylabel='Importance',
       title='Feature importances of the tuned random forest')
# Feature names are long, so rotate them to keep the x axis readable.
ax.tick_params(axis='x', labelrotation=90);

We managed to slightly improve the model. Is there room for more improvement? Certainly. We could do some feature engineering, which sometimes significantly improves the model. As we saw in the plots, some of the features are highly correlated. That means all the information we need can be obtained from just one of them. Having both of them in the dataset is unnecessary and most of the time even lowers the model accuracy. Also, removing features that have low importance (according to the plot above) can sometimes improve the model.