Create a multi-layer perceptron neural network model to predict on a labeled dataset of your choosing. Compare this model to either a boosted tree or a random forest model and describe the relative tradeoffs between complexity and accuracy. Be sure to vary the hyperparameters of your MLP!

In [49]:
import os
os.chdir('/Users/sophiaperides/Desktop/Thinkful')
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.datasets import fetch_openml
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


In [33]:
print('Breast Cancer Columns')
br = pd.read_csv('breast_cancer.csv')
br = pd.DataFrame(br)
print(br.columns)

Breast Cancer Columns
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')


In [36]:
br.info()
br.diagnosis.value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
id                         569 non-null int64
diagnosis                  569 non-null object
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non

B    357
M    212
Name: diagnosis, dtype: int64

In [26]:
y = br.diagnosis
X = br.drop(columns=['diagnosis', 'id'])


# The Models

## Multiple Layer Perceptrons



In [68]:
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X, y)
print('MLP Score: ', mlp.score(X, y))
      
print('Cross Validation Scores: ', cross_val_score(mlp, X, y, cv=10))
print('Mean Cross Validation Score: ', cross_val_score(mlp, X, y, cv=10).mean())

MLP Score:  0.9367311072056239
Cross Validation Scores:  [0.87931034 0.89655172 0.92982456 0.94736842 0.92982456 0.9122807
 0.94736842 0.92857143 0.76785714 0.875     ]
Mean Cross Validation Score:  0.923069527266442


In [69]:
mlp = MLPClassifier(hidden_layer_sizes=(100, 100))
mlp.fit(X, y)
print('MLP Score: ', mlp.score(X, y))

cross_val = cross_val_score(mlp, X, y, cv=10)
print('Cross Validation Score: ', cross_val)
print('Mean Cross Validation Score: ', cross_val.mean())
 

MLP Score:  0.9332161687170475
Cross Validation Score:  [0.9137931  0.84482759 0.92982456 0.94736842 0.9122807  0.87719298
 0.94736842 0.91071429 0.92857143 0.91071429]
Mean Cross Validation Score:  0.9122655777374471


In [70]:
mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100))
mlp.fit(X, y)
print('MLP Score: ', mlp.score(X, y))
      
cross_val = cross_val_score(mlp, X, y, cv=10)
print('Cross Validation Score: ', cross_val)
print('Mean Cross Validation Score: ', cross_val.mean())

MLP Score:  0.9209138840070299
Cross Validation Score:  [0.9137931  0.9137931  0.9122807  0.94736842 0.9122807  0.92982456
 0.85964912 0.92857143 0.92857143 0.94642857]
Mean Cross Validation Score:  0.919256114423991


In [71]:
mlp = MLPClassifier(hidden_layer_sizes=(50, 50))
mlp.fit(X, y)
print('MLP Score: ', mlp.score(X, y))
      
cross_val = cross_val_score(mlp, X, y, cv=10)
print('Cross Validation Score: ', cross_val)
print('Mean Cross Validation Score: ', cross_val.mean())

MLP Score:  0.9279437609841827
Cross Validation Score:  [0.93103448 0.93103448 0.89473684 0.94736842 0.9122807  0.92982456
 0.9122807  0.92857143 0.85714286 0.875     ]
Mean Cross Validation Score:  0.9119274479301703


In [72]:
mlp = MLPClassifier(hidden_layer_sizes=(1000,))
mlp.fit(X, y)
cross_val = cross_val_score(mlp, X, y, cv=10)
print('Cross Validation Score: ', cross_val)
print('Mean Cross Validation Score: ', cross_val.mean())

Cross Validation Score:  [0.94827586 0.89655172 0.92982456 0.92982456 0.87719298 0.92982456
 0.94736842 0.91071429 0.82142857 0.82142857]
Mean Cross Validation Score:  0.9012434102497624


In [73]:
mlp = MLPClassifier(hidden_layer_sizes=(10, 10))
mlp.fit(X, y)
print('MLP Score: ', mlp.score(X, y))
      
cross_val = cross_val_score(mlp, X, y, cv=10)
print('Cross Validation Score: ', cross_val)
print('Mean Cross Validation Score: ', cross_val.mean())



MLP Score:  0.9279437609841827




Cross Validation Score:  [0.93103448 0.84482759 0.9122807  0.94736842 0.92982456 0.89473684
 0.94736842 0.91071429 0.85714286 0.91071429]
Mean Cross Validation Score:  0.9086012444905368




In [56]:
mlp_gsc = GridSearchCV(
        estimator=MLPClassifier(),
        param_grid={
            'hidden_layer_sizes': [(20, 20), (10), (50, 10)],
            'activation': ['identity', 'logistic', 'tanh', 'relu'],
            'learning_rate': ['constant', 'invscaling', 'adaptive'],
            'max_iter': [100, 200, 300]
        },
        cv=10, scoring='accuracy', verbose=1, n_jobs=-1)

mlp_gsc.fit(X_top_features, y)
best_params = mlp_gsc.best_params_
# svr_gsc.best_estimator_
best_mlp = MLPClassifier(hidden_layer_sizes=best_params['hidden_layer_sizes'], activation=best_params["activation"],
                                         learning_rate=best_params["learning_rate"], max_iter=best_params["max_iter"],
                                         verbose=False)

print('Parameters for the best Multiple Layer Perceptron Classifier: ', best_mlp)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   35.2s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed:  3.2min finished


Parameters for the best Multiple Layer Perceptron Classifier:  MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(50, 10), learning_rate='constant',
              learning_rate_init=0.001, max_iter=300, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)




In [75]:
best_mlp.fit(X, y)
print('MLP Score: ', best_mlp.score(X, y))
      
cross_val = cross_val_score(best_mlp, X, y, cv=10)
print('Cross Validation Score: ', cross_val)
print('Mean Cross Validation Score: ', cross_val.mean())

MLP Score:  0.9543057996485061
Cross Validation Score:  [0.94827586 0.93103448 0.9122807  0.92982456 0.98245614 0.9122807
 0.9122807  0.89285714 0.875      0.92857143]
Mean Cross Validation Score:  0.9224861723273703


## Random Forest Classifier

In [80]:
# Updating the target to a numerical variable.
br['diagnosis'] = br['diagnosis'].apply(lambda x: 1 if 'M' else 0)
br.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
id                         569 non-null int64
diagnosis                  569 non-null int64
radius_mean                569 non-null float64
texture_mean               569 non-null float64
perimeter_mean             569 non-null float64
area_mean                  569 non-null float64
smoothness_mean            569 non-null float64
compactness_mean           569 non-null float64
concavity_mean             569 non-null float64
concave points_mean        569 non-null float64
symmetry_mean              569 non-null float64
fractal_dimension_mean     569 non-null float64
radius_se                  569 non-null float64
texture_se                 569 non-null float64
perimeter_se               569 non-null float64
area_se                    569 non-null float64
smoothness_se              569 non-null float64
compactness_se             569 non-null float64
concavity_se               569 non-

In [77]:
# Create a random forest classifier
rfc = ensemble.RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
rfc.fit(X, y)
# Train the classifier
cross_val = cross_val_score(rfc, X, y, cv=10)
print('Cross Validation Score: ', cross_val)
print('Mean Cross Validation Score: ', cross_val.mean())
print('\n Feature Importances: ', rfc.feature_importances_)

Cross Validation Score:  [0.98275862 0.9137931  0.94736842 0.96491228 0.98245614 0.98245614
 0.92982456 0.98214286 0.96428571 1.        ]
Mean Cross Validation Score:  0.9649997839426151

 Feature Importances:  [0.02429854 0.01419575 0.0539929  0.04540104 0.00795465 0.00345576
 0.07186229 0.08852713 0.00558817 0.0021412  0.01900904 0.00490146
 0.01459376 0.0394196  0.00374584 0.00366822 0.00424091 0.00508478
 0.00371126 0.0059118  0.12198315 0.0191726  0.17398717 0.07498058
 0.01269208 0.01023589 0.03039391 0.11858904 0.00796357 0.0082979 ]


In [78]:
# Create a random forest classifier with all variables with >0.5 feature importance. 
rfc = ensemble.RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
X_top_features = X[['radius_mean', 'smoothness_mean', 'compactness_mean', 'symmetry_se', 'radius_worst', 'texture_worst', 'smoothness_worst']]
rfc.fit(X_top_features, y)
# Train the classifier
cross_val = cross_val_score(rfc, X_top_features, y, cv=10)
print('Cross Validation Scores: ', cross_val)
print('Mean Cross Validation Score: ', cross_val.mean())

Cross Validation Scores:  [0.96551724 0.9137931  0.92982456 0.94736842 1.         0.98245614
 0.98245614 0.98214286 0.94642857 0.94642857]
Mean Cross Validation Score:  0.9596415607985481


In [53]:


rfc_gsc = GridSearchCV(
        estimator=ensemble.RandomForestClassifier(),
        param_grid={
            'max_depth': [50, 100],
            'max_features': [2, 3, 4],
            'min_samples_leaf': [2, 3, 4],
            'min_samples_split': [8, 10, 12],
            'n_estimators': [50, 100, 500]
        },
        cv=10, scoring='accuracy', verbose=1, n_jobs=-1)

rfc_gsc.fit(X_top_features, y)
best_params = rfc_gsc.best_params_
# svr_gsc.best_estimator_
best_rfc = ensemble.RandomForestClassifier(max_depth=best_params['max_depth'], max_features=best_params["max_features"],
                                         min_samples_leaf=best_params["min_samples_leaf"], min_samples_split=best_params["min_samples_split"],
                                         n_estimators=best_params['n_estimators'],verbose=False)

print('Parameters for the best Random Forest Classifier: ', best_rfc)

Fitting 10 folds for each of 162 candidates, totalling 1620 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   39.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1620 out of 1620 | elapsed:  6.3min finished


Parameters for the best Random Forest Classifier:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=50, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=False, warm_start=False)


In [79]:
rfc = ensemble.RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=50, max_features=2, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=8,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=False, warm_start=False)
rfc.fit(X_top_features, y)
# Train the classifier
cross_val = cross_val_score(rfc, X_top_features, y, cv=10)
print('Cross Validation Scores: ', cross_val)
print('Mean Cross Validation Score: ', cross_val.mean())

Cross Validation Scores:  [0.96551724 0.89655172 0.92982456 0.94736842 1.         0.96491228
 0.94736842 0.96428571 0.92857143 0.96428571]
Mean Cross Validation Score:  0.9508685506870623


Create a multi-layer perceptron neural network model to predict on a labeled dataset of your choosing. Compare this model to either a boosted tree or a random forest model and describe the relative tradeoffs between complexity and accuracy. Be sure to vary the hyperparameters of your MLP!