In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the breast-cancer CSV into a DataFrame and preview its first five rows.
cancer = pd.read_csv(filepath_or_buffer='../input/breast-cancer-wisconsin-data/data.csv')
cancer.head(5)

Examine and Clean the Dataset

In [None]:
# (rows, columns) of the frame as loaded, before any columns are dropped.
cancer.shape

In [None]:
# Per-column non-null counts and dtypes, to spot empty or non-numeric columns.
cancer.info()

In [None]:
# Number of missing entries in the 'Unnamed: 32' column.
cancer['Unnamed: 32'].isnull().sum() # author's note: the column is all null, so it is dropped in the next step

In [None]:
# Discard the 'id' column and the empty 'Unnamed: 32' column, then preview the result.
cancer = cancer.drop(columns=['id', 'Unnamed: 32'])
cancer.head()

In [None]:
# Inspect the dtype of every remaining column.
cancer.dtypes # the diagnosis column's values will need to be encoded

In [None]:
# Number of rows per diagnosis value, i.e. the class balance of the target.
cancer['diagnosis'].value_counts()

In [None]:
# Bar chart of how many rows fall in each diagnosis class.
# The column is passed as `x=` explicitly: current seaborn treats the first
# positional argument as `data`, so a bare Series is no longer read as the
# variable to count.
sns.countplot(x=cancer['diagnosis'], label = 'count of diagnoses')

In [None]:
#encode diagnosis column values as integers
from sklearn.preprocessing import LabelEncoder
labelencoder_Y = LabelEncoder()
# Assign by column name so the column is replaced outright and takes the
# encoder's integer dtype. Writing through `.iloc[:, 0]` can update the existing
# object-dtype column in place on recent pandas, leaving the integer labels
# stored as `object`, which scikit-learn classifiers reject as an unknown
# label type.
cancer['diagnosis'] = labelencoder_Y.fit_transform(cancer['diagnosis'])
cancer.head()

Exploratory Data Analysis (EDA)

In [None]:
# Pairwise scatter plots of every column, coloured by diagnosis, to eyeball correlations.
sns.pairplot(data=cancer, hue='diagnosis')

In [None]:
# Pairwise correlation matrix of the columns (pandas default method).
cancer.corr()

In [None]:
# Annotated correlation heatmap; the large canvas makes the heatmap bigger so
# the per-cell percentages stay readable.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(cancer.corr(), annot=True, fmt='.0%', ax=ax)

Prepare Data

In [None]:
# Separate the target (Y, the diagnosis column) from the predictors (X, everything else).
Y = cancer['diagnosis']
X = cancer.drop(columns=['diagnosis'])

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Sanity check: (rows, feature columns) of the training features.
X_train.shape

In [None]:
# Sanity check: (rows, feature columns) of the held-out test features.
X_test.shape

In [None]:
# Sanity check: number of training labels; should match the row count of X_train.
Y_train.shape

In [None]:
# Sanity check: number of test labels; should match the row count of X_test.
Y_test.shape

In [None]:
#Feature scaling
#from sklearn.preprocessing import StandardScaler
#sc = StandardScaler()
#X_train = sc.fit_transform(X_train)
#X_test = sc.transform(X_test)  # apply the scaler fitted on X_train; re-fitting on X_test would scale the two splits inconsistently

Hyperparameter Tuning and Model Selection

In [None]:
from sklearn.model_selection import GridSearchCV
import joblib
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [None]:
def print_results(results):
    """Print the best parameter set, then one summary line per candidate.

    `results` must expose `best_params_` and a `cv_results_` mapping with the
    keys 'mean_test_score', 'std_test_score' and 'params'. Each summary line
    shows the mean score and twice the standard deviation, both rounded to
    three decimals, followed by the candidate's parameters.
    """
    print(f'BEST PARAMS: {results.best_params_}\n')

    table = results.cv_results_
    rows = zip(table['mean_test_score'], table['std_test_score'], table['params'])
    for avg, spread, candidate in rows:
        print(f'{round(avg, 3)} (+/-{round(spread * 2, 3)}) for {candidate}')

In [None]:
from sklearn.linear_model import LogisticRegression

# The features reach the model unscaled (the scaling cell is commented out), so
# the default iteration budget (max_iter=100) is likely to run out before the
# solver converges. Raise it so the C values are compared on converged fits.
lr = LogisticRegression(random_state = 0, max_iter = 10000)
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

# 5-fold cross-validated grid search over the inverse regularisation strength C.
cv = GridSearchCV(lr, parameters, cv=5)
cv.fit(X_train, Y_train)

print_results(cv)

In [None]:
# Display the estimator configured with the best-scoring parameters of the most recent search.
cv.best_estimator_

In [None]:
# Persist the best estimator of the most recent search as LR_model.pkl for the later comparison step.
joblib.dump(cv.best_estimator_, '../../../LR_model.pkl')

In [None]:
from sklearn.svm import SVC

# Support-vector classifier: search kernel type and penalty C with 5-fold CV.
svc = SVC()
parameters = dict(
    kernel=['linear', 'rbf'],
    C=[0.1, 1, 10],
)

cv = GridSearchCV(estimator=svc, param_grid=parameters, cv=5)
cv.fit(X_train, Y_train)

print_results(cv)

In [None]:
# Display the estimator configured with the best-scoring parameters of the most recent search.
cv.best_estimator_

In [None]:
# Persist the best estimator of the most recent search as SVM_model.pkl for the later comparison step.
joblib.dump(cv.best_estimator_, '../../../SVM_model.pkl')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (seeded): search forest size and tree depth with 5-fold CV.
# max_depth=None lets trees grow without a depth limit.
rf = RandomForestClassifier(random_state=0)
parameters = dict(
    n_estimators=[5, 50, 250],
    max_depth=[2, 4, 8, 16, 32, None],
)

cv = GridSearchCV(estimator=rf, param_grid=parameters, cv=5)
cv.fit(X_train, Y_train)

print_results(cv)

In [None]:
# Persist the best estimator of the most recent search as RF_model.pkl for the later comparison step.
joblib.dump(cv.best_estimator_, '../../../RF_model.pkl')

In [None]:
from sklearn.neural_network import MLPRegressor, MLPClassifier

# Seed the network so weight initialisation, and therefore the CV scores and the
# selected model, are repeatable across runs. This matches the seeded LR and RF
# estimators in this notebook.
mlp = MLPClassifier(random_state = 0)
# NOTE(review): per the scikit-learn docs `learning_rate` only takes effect with
# solver='sgd'; with the default solver the three settings are expected to give
# identical fits — confirm whether this axis of the grid is wanted.
parameters = {
    'hidden_layer_sizes': [(10,), (50,), (100,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'learning_rate': ['constant', 'invscaling', 'adaptive']
}

# 5-fold cross-validated grid search over layer width, activation and learning-rate schedule.
cv = GridSearchCV(mlp, parameters, cv=5)
cv.fit(X_train, Y_train)

print_results(cv)

In [None]:
# Display the estimator configured with the best-scoring parameters of the most recent search.
cv.best_estimator_

In [None]:
# Persist the best estimator of the most recent search as MLP_model.pkl for the later comparison step.
joblib.dump(cv.best_estimator_, '../../../MLP_model.pkl')

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

# Seed the booster so repeated runs select the same model. This matches the
# seeded LR and RF estimators in this notebook.
gb = GradientBoostingClassifier(random_state = 0)
# NOTE(review): learning rates of 10 and 100 are far above the usual range for
# boosting — confirm they are intended; they add 50 of the 100 candidates.
parameters = {
    'n_estimators': [5, 50, 250, 500],
    'max_depth': [1, 3, 5, 7, 9],
    'learning_rate': [0.01, 0.1, 1, 10, 100]
}

# 5-fold cross-validated grid search over ensemble size, tree depth and shrinkage.
cv = GridSearchCV(gb, parameters, cv=5)
cv.fit(X_train, Y_train)

print_results(cv)

In [None]:
# Display the estimator configured with the best-scoring parameters of the most recent search.
cv.best_estimator_

In [None]:
# Persist the best estimator of the most recent search as GB_model.pkl for the later comparison step.
joblib.dump(cv.best_estimator_, '../../../GB_model.pkl')

In [None]:
#from sklearn.naive_bayes import GaussianNB
#gnb = GaussianNB()
#gnb.fit(X_train, Y_train)

#from sklearn.tree import DecisionTreeClassifier
#dtc = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
#dtc.fit(X_train, Y_train)

#from sklearn.neighbors import KNeighborsClassifier
#knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
#knn.fit(X_train, Y_train)

Read in Models

In [None]:
# Reload each saved estimator into a dict keyed by its short name.
# NOTE: joblib.load unpickles the file, so only load files this notebook wrote itself.
models = {
    mdl: joblib.load('../../../{}_model.pkl'.format(mdl))
    for mdl in ['LR', 'SVM', 'MLP', 'RF', 'GB']
}

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from time import time

def evaluate_model(name, model, features, labels):
    """Print accuracy, precision, recall and prediction latency for a fitted model.

    Parameters:
        name: label printed at the start of the report line.
        model: fitted estimator exposing `predict(features)`.
        features: inputs passed to `model.predict`.
        labels: true targets compared against the predictions.

    The three scores are rounded to three decimals. Latency is the wall-clock
    time of the `predict` call in milliseconds, rounded to one decimal.
    """
    start = time()
    pred = model.predict(features)
    # Stop the clock only after predict() returns; taking `end` before the call
    # would always report a latency of about zero.
    end = time()
    accuracy = round(accuracy_score(labels, pred), 3)
    precision = round(precision_score(labels, pred), 3)
    recall = round(recall_score(labels, pred), 3)
    print('{} -- Accuracy: {} / Precision: {} / Recall: {} / Latency: {}ms'.format(name,
                                                                                   accuracy,
                                                                                   precision,
                                                                                   recall,
                                                                                   round((end - start)*1000, 1)))

In [None]:
# Report metrics for every reloaded model on the training split.
# NOTE(review): these are the same rows the grid searches were fitted on, so the
# scores are in-sample and will look optimistic for flexible models.
for name in models:
    mdl = models[name]
    evaluate_model(name, mdl, X_train, Y_train)

Evaluate Best Model on Test Set

In [None]:
# Score the reloaded SVM on the held-out test split.
evaluate_model('svc', models['SVM'], X_test, Y_test)

In [None]:
# Check whether RF and GB are overfitting: score the random forest on the
# held-out test split so it can be compared with its training-split metrics.
evaluate_model('rf', models['RF'], X_test, Y_test)

In [None]:
# Score the gradient-boosting model on the held-out test split (same overfitting check as for RF).
evaluate_model('gb', models['GB'], X_test, Y_test)