In [33]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from classifier_test import classifier_test
import plotly.express as px


In [2]:
training_data = pd.read_csv('Resources/training_data.csv')
y_train = training_data.pop('malignant')
X_train = training_data

testing_data = pd.read_csv('Resources/testing_data.csv')
y_test = testing_data.pop('malignant')
X_test = testing_data

## Total Number of Trees

We will first investigate total number of trees allowed in the model. Due to the high default accuracy, we will run each level at numerous random states in order to reduce random noise. 

In [9]:
score_by_trees = []
random_state = []
number_of_trees = []
for i in range(1, 101, 2):
    for j in range(1, 100):
        model = RandomForestClassifier(n_estimators=i, random_state=j, n_jobs=-1)
        model.fit(X_train, y_train)
        random_state.append(j)
        number_of_trees.append(i)
        score_by_trees.append(model.score(X_test, y_test))

In [17]:
df = pd.DataFrame({'score': score_by_trees, 'random_state': random_state, 'number_of_trees': number_of_trees})

import plotly.express as px
fig = px.line(df, x=df.groupby('number_of_trees')['score'].mean().index, y=df.groupby('number_of_trees')['score'].mean())
fig.add_hline(y=0.990, line_dash='dash', line_color='red')
fig.update_xaxes(title_text='Number of Trees')
fig.update_yaxes(title_text='Accuracy Score')
fig.update_layout(title_text='Accuracy Score by Number of Trees')
fig.show()



### Optimal Number of Trees

There appears to be a continual upward trend as number of trees increases. In order to avoid over fitting, we define a somewhat arbitrary bound of 99% accuracy and see that the model crosses this bound with 21 trees.

In [67]:
optimal_number_of_trees = 23
model = RandomForestClassifier(n_estimators=optimal_number_of_trees, random_state=1)
classifier_test("Random Forest: Optimized Number of Trees", model, X_train, y_train, X_test, y_test)

Classifier: Random Forest: Optimized Number of Trees
Training Data Score: 99.87%
Testing Data Score: 98.00%

Confusion Matrix
          Predicted 0  Predicted 1
Actual 0          127            1
Actual 1            4          118

---------------------------------------------------



(0.9986666666666667, 0.98)

## Both Number of Trees and Max Depth

In [18]:
score_by_trees_3D = []
random_state_3D = []
number_of_trees_3D = []
max_depth_3D = []
for i in range(11, 31, 2):
    for j in range(1, 101):
        for k in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]:
            model = RandomForestClassifier(n_estimators=i, random_state=j, max_depth=k)
            model.fit(X_train, y_train)
            random_state_3D.append(j)
            number_of_trees_3D.append(i)
            max_depth_3D.append(k)
            score_by_trees_3D.append(model.score(X_test, y_test))

In [27]:
df_3d = pd.DataFrame({'score': score_by_trees_3D, 'random_state': random_state_3D, 'number_of_trees': number_of_trees_3D, "max_depth": max_depth_3D})
df_3d_mean = df_3d.groupby(['number_of_trees', 'max_depth'])['score'].mean().reset_index()

In [29]:

fig = px.scatter_3d(df_3d_mean, x='number_of_trees', y='max_depth', z='score', color='score')

fig.show()
fig.write_html('Plots/Random_Forest_3D.html')

In [30]:
df_3d_mean_optimal = df_3d_mean[df_3d_mean['number_of_trees'] == optimal_number_of_trees]
fig = px.line(df_3d_mean_optimal, x='max_depth', y='score')
fig.add_hline(y=0.990, line_dash='dash', line_color='red')
fig.update_xaxes(title_text='Max Depth')
fig.update_yaxes(title_text='Accuracy Score')
fig.update_layout(title_text='Accuracy Score by Max Depth')
fig.show()

Repeat for the region between 5 and 10

In [37]:
score_by_trees_3D = []
random_state_3D = []
number_of_trees_3D = []
max_depth_3D = []
for i in range(19, 25, 2):
    for j in range(1, 101):
        for k in [5, 6, 7, 8, 9, 10, 11, 12]:
            model = RandomForestClassifier(n_estimators=i, random_state=j, max_depth=k, n_jobs=-1)
            model.fit(X_train, y_train)
            random_state_3D.append(j)
            number_of_trees_3D.append(i)
            max_depth_3D.append(k)
            score_by_trees_3D.append(model.score(X_test, y_test))

In [38]:
df_3d = pd.DataFrame({'score': score_by_trees_3D, 'random_state': random_state_3D, 'number_of_trees': number_of_trees_3D, "max_depth": max_depth_3D})
df_3d_mean = df_3d.groupby(['number_of_trees', 'max_depth'])['score'].mean().reset_index()

In [39]:
fig = px.scatter_3d(df_3d_mean, x='number_of_trees', y='max_depth', z='score', color='score')
fig.show()

It appears that there is a peak at max depth of 9.

In [41]:
optimal_max_depth = 9
model = RandomForestClassifier(n_estimators=optimal_number_of_trees, random_state=42, max_depth=optimal_max_depth)

classifier_test("Random Forest: Optimized Number of Trees and Max Depth", model, X_train, y_train, X_test, y_test)

Classifier: Random Forest: Optimized Number of Trees and Max Depth
Training Data Score: 100.00%
Testing Data Score: 98.80%

Confusion Matrix
          Predicted 0  Predicted 1
Actual 0          127            1
Actual 1            2          120

---------------------------------------------------



(1.0, 0.988)

## Ensembling Different Models

In [58]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier

lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_pred = knn.predict(X_test)

svc = SVC()
svc.fit(X_train, y_train)
svc_pred = svc.predict(X_test)

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)

et = ExtraTreeClassifier()
et.fit(X_train, y_train)
et_pred = et.predict(X_test)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

pred_df = pd.DataFrame({'Logistic Regression': lr_pred, 'K Nearest Neighbors': knn_pred, 'Support Vector Machine': svc_pred, 'Decision Tree': dt_pred, 'Extra Tree': et_pred, 'Random Forest': rf_pred})
pred_df.head()


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



Unnamed: 0,Logistic Regression,K Nearest Neighbors,Support Vector Machine,Decision Tree,Extra Tree,Random Forest
0,1,1,1,1,1,1
1,1,1,1,1,1,1
2,1,1,1,1,1,1
3,0,0,0,0,0,0
4,0,0,0,0,0,0


In [68]:
# create a column that is True if at least two of the models predicted malignant
ensemble_pred = pd.DataFrame({
    'At least 1': pred_df.sum(axis=1) >= 1,
    'At least 2': pred_df.sum(axis=1) >= 2,
    'At least 3': pred_df.sum(axis=1) >= 3,
    'At least 4': pred_df.sum(axis=1) >= 4,
    'At least 5': pred_df.sum(axis=1) >= 5,
    'At least 6': pred_df.sum(axis=1) >= 6,
    'Actual': y_test})
ensemble_pred.head()

# print confusion matrix for each ensemble vs actual
from sklearn.metrics import confusion_matrix
for col in ensemble_pred.columns[:-1]:
    print(col)
    print(f'Accuracy: {100*sum(ensemble_pred[col] == ensemble_pred["Actual"])/len(ensemble_pred):.2f}%')
    print(pd.DataFrame(confusion_matrix(ensemble_pred['Actual'], ensemble_pred[col]), index=['Actual Benign', 'Actual Malignant'], columns=['Predicted Benign', 'Predicted Malignant']))
    print('-'*60)
    print()

At least 1
Accuracy: 94.00%
                  Predicted Benign  Predicted Malignant
Actual Benign                  113                   15
Actual Malignant                 0                  122
------------------------------------------------------------

At least 2
Accuracy: 97.20%
                  Predicted Benign  Predicted Malignant
Actual Benign                  121                    7
Actual Malignant                 0                  122
------------------------------------------------------------

At least 3
Accuracy: 98.40%
                  Predicted Benign  Predicted Malignant
Actual Benign                  126                    2
Actual Malignant                 2                  120
------------------------------------------------------------

At least 4
Accuracy: 96.00%
                  Predicted Benign  Predicted Malignant
Actual Benign                  127                    1
Actual Malignant                 9                  113
------------------------------

In [64]:
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
import pydotplus
import six
from sklearn import tree
dotfile = six.StringIO()
i_tree = 0
for tree_in_forest in clf.estimators_:
    if (i_tree <1):        
        tree.export_graphviz(tree_in_forest, out_file=dotfile, feature_names=X_train.columns)
        pydotplus.graph_from_dot_data(dotfile.getvalue()).write_png('Plots/dtree'+ str(i_tree) +'.png')
        i_tree = i_tree + 1