In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import auc, accuracy_score, mean_squared_error
from sklearn.impute  import KNNImputer
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from time import time
from scipy.stats import uniform, randint
import warnings
warnings.filterwarnings('ignore')

In [68]:
data = pd.read_csv('../data/Breast_Cancer.csv')
print(f'Original dataset: {data.shape}')

def check_binary(df):
    return [column for column in df.columns if df[column].isin(['Yes','No']).any()]

def check_pseudobi(df):
    return [column for column in df.columns if df[column].isin(np.arange(0.0, 5.0)).any()]

def check_marital(df):
    return [column for column in df.columns if df[column].isin(np.unique(data['marital'])).any()]

def getcolumns_rate(dataframe, rate = 0.10): 
  less_than_50pct_nonnull = (dataframe.isnull().sum() / dataframe.shape[0]) < rate
  return dataframe.columns[less_than_50pct_nonnull]

def get_objectColumns(dataframe, type = 'object'):
    return list(dataframe.select_dtypes(include = type).columns)

def variance_inflation(df):
    vif = pd.DataFrame()
    vif['VIF'] = [variance_inflation_factor(data.values, i) for i in range(data.shape[1])]
    vif['Features'] = data.columns
    vif.sort_values(by = 'VIF', ascending = False)

def get_columnsnull(df):
    null_columns = df.columns[df.isnull().any()]
    return null_columns.tolist()

def display_scores(scores):
    print("Scores: {0}\nMean: {1:.3f}\nStd: {2:.3f}".format(scores, np.mean(scores), np.std(scores)))

def impute_knn(df, column_name, n_neighbors = 5):
    df_imputed = df.copy()
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputer.fit(df_imputed[[column_name]])
    df_imputed[column_name] = imputer.transform(df_imputed[[column_name]])
    return df_imputed

def impute_average(df, columns, impute_type = 'median'):
    df_imputed = df.copy()
    for col in columns:
        if impute_type == 'mean':
            impute_val = df_imputed[col].mean()
        elif impute_type == 'median':
            impute_val = df_imputed[col].median()
        else:
            raise ValueError('Invalid imputation type')
        df_imputed[col].fillna(impute_val, inplace=True)
    return df_imputed

def report_best_scores(results, n_top = 3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate], results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

remove_columns = getcolumns_rate(data)
data = data[remove_columns]
# data = data.dropna()
encoder = LabelEncoder()

data['bdate'] = data['bdate'].str[-4:]
data['age']   = 2565 - data['bdate'].astype('int')
data = data.drop('bdate', axis = 1)

encode_columns = get_objectColumns(data)
for column in encode_columns:
    data[column] = encoder.fit_transform(data[column])

data['noova'] = data['noova'].replace([9.0], [3.0], inplace = False)
encode_columns = ['nobreast', 'nosecon', 'noova', 'inj', 'noparity']
for column in encode_columns:
    data[column] = encoder.fit_transform(data[column])

data = impute_knn(data, 'agemen')
data = impute_knn(data, 'dur_brefed')
data = impute_average(data, ['weight'], 'median')
data = impute_average(data, ['height'], 'median')


print(f'Preprocessed dataset: {data.shape}')
X = data.drop('diag_cancer', axis = 1).to_numpy()
y = data['diag_cancer'].to_numpy()

Original dataset: (15718, 46)
Preprocessed dataset: (15718, 36)


In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)
smote = SMOTE(sampling_strategy = 'minority', k_neighbors = 10, random_state = 42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [76]:
def model_train(model):
    start = time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    class_names = ['Cancer', 'No Cancer']
    
    confusion = confusion_matrix(y_test, y_pred)
    
    print("Accuracy Score: ", np.round(accuracy_score(y_test, y_pred), 3))
    print('Confusion Matrix : \n', confusion)
    
    print(classification_report(y_test, y_pred, target_names = class_names))
    print(f"\t Fit and predict time: {np.round(time() - start, 3)} seconds")

In [80]:
param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
              'ccp_alpha'   : np.arange(0.001, 0.1, 0.01),
              'max_depth'   : np.arange(1, 5),
              'criterion'   : ['gini', 'entropy']}

tree_class = DecisionTreeClassifier(max_depth = 5, random_state = 1024)
grid_search = GridSearchCV(estimator = tree_class, 
                           param_grid = param_grid, 
                           cv = 5, 
                           verbose = True,
                           scoring = 'accuracy')
grid_search = grid_search.fit(X_train, y_train)

adaboost_dct = AdaBoostClassifier(grid_search.best_estimator_, 
                                  n_estimators  = 10, 
                                  random_state  = 42, 
                                  learning_rate = 0.001)
model_train(adaboost_dct)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Accuracy Score:  0.984
Confusion Matrix : 
 [[  21    6]
 [  69 4620]]
              precision    recall  f1-score   support

      Cancer       0.23      0.78      0.36        27
   No Cancer       1.00      0.99      0.99      4689

    accuracy                           0.98      4716
   macro avg       0.62      0.88      0.68      4716
weighted avg       0.99      0.98      0.99      4716

	 Fit and predict time: 0.478 seconds


In [78]:
param_grid = {'learning_rate': [0.1, 0.01, 0.001, 0.0001],
              'subsample': [1.0, 0.5],
              'max_features': np.arange(0, 10), 
              'max_depth': np.arange(0, 5)}

gradient_boosting = GradientBoostingClassifier(n_estimators = 10, 
                                 learning_rate = 0.1, 
                                 random_state = 0)

grid_search = GridSearchCV(estimator = gradient_boosting, 
                           param_grid = param_grid, 
                           cv = 5, 
                           verbose = True,
                           scoring = 'accuracy')
model_train(grid_search)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits
Accuracy Score:  0.988
Confusion Matrix : 
 [[  10   17]
 [  40 4649]]
              precision    recall  f1-score   support

      Cancer       0.20      0.37      0.26        27
   No Cancer       1.00      0.99      0.99      4689

    accuracy                           0.99      4716
   macro avg       0.60      0.68      0.63      4716
weighted avg       0.99      0.99      0.99      4716

	 Fit and predict time: 257.308 seconds


In [79]:
xgboost = xgb.XGBClassifier(n_estimators = 10, 
                            objective = 'binary:logistic',
                            reg_lambda = 0,
                            gamma  = 1,
                            max_depth = 6,
                            eta = 0.3)
xgboost.fit(X_train, y_train)
model_train(xgboost)

Accuracy Score:  0.994
Confusion Matrix : 
 [[  21    6]
 [  22 4667]]
              precision    recall  f1-score   support

      Cancer       0.49      0.78      0.60        27
   No Cancer       1.00      1.00      1.00      4689

    accuracy                           0.99      4716
   macro avg       0.74      0.89      0.80      4716
weighted avg       1.00      0.99      0.99      4716

	 Fit and predict time: 0.611 seconds
