### Goal:

Predict whether the cancer is benign or malignant. 

### Short Description:

"Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image."

UCI ML Repository Link: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd 
import xgboost
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, confusion_matrix
import os

def encodeBinaryLabel(val, one_val):
    """Map a label to 1 when it equals ``one_val`` and to 0 otherwise.

    Parameters
    ----------
    val : the raw label to encode.
    one_val : the label value that should become 1.

    Raises
    ------
    ValueError
        If ``val`` is null according to ``pd.isna``.
    """
    # Guard clause: nulls are rejected rather than silently encoded as 0.
    if pd.isna(val):
        raise ValueError('Null value found!')
    return 1 if val == one_val else 0

def evaluateBinaryClassifier(x_array, y_array, clf_model, use_cross_val=False, folds = 5):
    """Score a binary classifier and return its confusion matrix, precision and recall.

    Parameters
    ----------
    x_array : feature matrix passed to the model.
    y_array : true labels that the predictions are compared against.
    clf_model : classifier to evaluate. When ``use_cross_val`` is false its
        ``predict`` method is called directly, so it is expected to be fitted
        already.
    use_cross_val : when true, predictions come from ``cross_val_predict``
        instead of ``clf_model.predict``.
    folds : value forwarded as ``cv`` to ``cross_val_predict``.

    Returns
    -------
    dict
        Keys ``'conf_mtrx'``, ``'precision'`` and ``'recall'``.

    Raises
    ------
    Exception
        Whatever the prediction or metric step raised. A diagnostic is printed
        first and the original exception is then re-raised, so the caller sees
        the real cause instead of a later unbound-variable error.
    """
    try:
        if use_cross_val:
            clf_y_pred = cross_val_predict(clf_model, x_array, y_array, cv = folds)
        else:
            clf_y_pred = clf_model.predict(x_array)
    except Exception as e:
        print(e)
        print("An error occurred while trying to execute the classification model's predict method.")
        # Without predictions nothing below can succeed; surface the real error.
        raise
    try:
        conf_mtrx = confusion_matrix(y_array, clf_y_pred)
        precision = precision_score(y_array, clf_y_pred)
        recall = recall_score(y_array, clf_y_pred)
    except Exception as e:
        print(e)
        print("An error occurred while calling the metric methods.")
        # Do not fall through to building a result from undefined metrics.
        raise
    return {'conf_mtrx': conf_mtrx, 'precision': precision, 'recall': recall}

In [None]:
# Read the dataset from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')

# Normalise the headers: lower-case, with spaces turned into underscores.
df.columns = [header.lower().replace(' ', '_') for header in df.columns]

# Row count, followed by an empty line.
print('count(*): ' + str(df.shape[0]) + '\n')

# Bar chart of the number of rows per diagnosis value.
sns.countplot(data=df, x='diagnosis')
plt.title('Counts by Diagnosis')
plt.show()

## Binary Classification

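I start with four baseline classifiers: random forest, stochastic gradient descent, logistic regression, and a support vector machine. Each is evaluated with 3-fold cross-validation on the training set. The baselines with the best evaluation results are then fitted to the whole training dataset and compared with the results obtained from ensembling methods.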

#### Data Prep

In [None]:
# Report every column that contains nulls; the counts are computed once for
# the whole frame instead of twice per column.
null_counts = df.isnull().sum()
for col, null_count in null_counts[null_counts > 0].items():
    print(str(col) + ' null count: ' + str(null_count))

# check for duplicate id's
print("Row count matches distinct count of id's?", len(df.index) == len(df['id'].unique()))

# convert diagnosis values to binary ('M' -> 1, any other non-null value -> 0)
df['labels'] = df['diagnosis'].apply(lambda x: encodeBinaryLabel(x, 'M'))

# Presumably an empty artefact column from the CSV export -- TODO confirm.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df = df.drop(labels = 'unnamed:_32', axis = 1, errors = 'ignore')


In [None]:
# Features: every column except the identifier, the raw diagnosis and the encoded label.
x = df.drop(labels=['id', 'diagnosis', 'labels'], axis = 1).values
y = df['labels'].values

# Separate the data into train (70%), test (15%) and validation (15%) sets.
# Both splits are stratified so each subset keeps the class ratio of the
# labels it was drawn from; the held-out sets are small, so an unlucky
# unstratified draw would skew precision/recall.
seed = 7
np.random.seed(seed)
x_train, x_test_val, y_train, y_test_val = train_test_split(
    x, y, test_size=0.3, random_state=seed, stratify=y)
x_test, x_val, y_test, y_val = train_test_split(
    x_test_val, y_test_val, test_size=0.5, random_state=seed, stratify=y_test_val)

#### Binary Classification Models

In [None]:
# Baseline classifiers, each seeded with 42.
rf_clf = RandomForestClassifier(random_state=42)              # Random Forest
sgd_clf = SGDClassifier(random_state = 42)                    # Stochastic Gradient Descent
log_reg = LogisticRegression(random_state=42, max_iter=3000)  # Logistic Regression
sup_vec = SVC(random_state=42)                                # SVM

# NOTE(review): the features are passed in unscaled; SGD, logistic regression
# and SVC are usually sensitive to feature scale -- consider confirming
# whether a scaler should precede them.

# Evaluate every baseline with 3-fold cross-validated predictions on the
# training split and print each entry of the returned score dict.
cv_reports = []
for banner, model in (
    ('---- Random Forest ----', rf_clf),
    ('---- Stochastic Gradient Descent ----', sgd_clf),
    ('---- Logistic Regression ----', log_reg),
    ('---- SVM Classifier ----', sup_vec),
):
    print(banner)
    report = evaluateBinaryClassifier(x_array=x_train, y_array=y_train, clf_model=model, use_cross_val=True, folds=3)
    for key, value in report.items():
        print(str(key) + ': \n', value)
    cv_reports.append(report)

# Keep the per-model result names available to later cells.
rf_scores, sgd_scores, log_reg_scores, sup_vec_scores = cv_reports

In [None]:
# Fit each chosen baseline on the full training split, then predict the
# validation split (fit returns the estimator, so the calls are chained).
rf_y_val_pred = rf_clf.fit(x_train, y_train).predict(x_val)
lr_y_val_pred = log_reg.fit(x_train, y_train).predict(x_val)

## Utilizing Ensemble Methods

#### AdaBoost

In [None]:
# AdaBoost over random-forest base estimators.
# random_state is set so this model is reproducible like the seeded baselines;
# per the scikit-learn docs it controls the seed handed to each base estimator
# at every boosting iteration.
ada_clf = AdaBoostClassifier(
    RandomForestClassifier(),
    n_estimators=200,
    learning_rate=0.5,
    random_state=42,
)

ada_clf.fit(x_train, y_train)

In [None]:
# Predict the validation split with the fitted AdaBoost model.
ada_y_val_pred = ada_clf.predict(x_val)

# Print each metric under its caption, in a fixed order.
print('------- AdaBoost -------')
for caption, metric in (
    ('Confusion Matrix: \n', confusion_matrix),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
):
    print(caption, metric(y_val, ada_y_val_pred))

#### XGBoost

In [None]:
# XGBoost random-forest classifier with default settings: fit on the training
# split, then predict the validation split.
xg_rf = xgboost.XGBRFClassifier()
xg_y_val_pred = xg_rf.fit(x_train, y_train).predict(x_val)

## Conclusion

In [None]:
# F1 score on the validation split for every fitted model.
for model_label, val_pred in (
    ('Random Forest: ', rf_y_val_pred),
    ('Logistic Regression: ', lr_y_val_pred),
    ('AdaBoost w/ RF: ', ada_y_val_pred),
    ('XGBoost RF: ', xg_y_val_pred),
):
    print(model_label, f1_score(y_val, val_pred))

In [None]:
# Predictions on the held-out test split.
rf_y_test_pred = rf_clf.predict(x_test)
lr_y_test_pred = log_reg.predict(x_test)
ada_y_test_pred = ada_clf.predict(x_test)

# F1 score on the test split for each of those models.
for model_label, test_pred in (
    ('Random Forest: ', rf_y_test_pred),
    ('Logistic Regression: ', lr_y_test_pred),
    ('AdaBoost w/ RF: ', ada_y_test_pred),
):
    print(model_label, f1_score(y_test, test_pred))

#### Exporting Model

In [None]:
# exporting model 
import pickle
# preferred for sklearn
from joblib import dump, load
from datetime import datetime

# Date stamp (digits only) used in the export file name.
current_dt = datetime.today().strftime('%Y-%m-%d').replace('-', '')

# Build the target path once; write the model only if no file exists there yet.
export_path = '/kaggle/working/ada_breast_cancer_clf_{today}.joblib'.format(today = current_dt)
if os.path.isfile(export_path):
    print('Export already exists!')
else:
    dump(ada_clf, export_path)


In [None]:
# IPython shell escape: long-format listing of files matching *.joblib in the
# current working directory (presumably /kaggle/working, where the export is
# written -- confirm).
! ls -l *.joblib

In [None]:
# Quick example of how the exported model could be used later: reload the
# file written for today's date stamp and predict with it.
# NOTE(review): joblib files are pickle-based -- only load exports you trust.
saved_model_path = '/kaggle/working/ada_breast_cancer_clf_{today}.joblib'.format(today = current_dt)
old_model = load(saved_model_path)

old_model.predict(x_test)