In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm 
from scipy.stats import randint
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import KFold # This library will help split our data into KFolds
from sklearn.model_selection import KFold 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier #Decision Tree Classifier



# Data: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29
df = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')

# Show the (rows, columns) shape of the dataframe. A bare `df.shape` in the
# middle of a cell is evaluated and thrown away (only a cell's last expression
# is displayed), so print it explicitly.
print(df.shape)

# Summarise the columns: dtypes and non-null counts.
df.info()

In [None]:
# Check for ids that show up multiple times in the dataset.
# Repeated ids could skew any results, so every row with such an id is removed.
#
# Count on the Series directly. Wrapping value_counts() in a DataFrame and
# filtering on `counts.id` depends on the counts column being named after the
# source column, which is no longer true in pandas >= 2.0 (it is named "count").
id_counts = df['id'].value_counts()
duplicates = id_counts[id_counts > 1].index.tolist()
print(f'Duplicate IDs: {len(duplicates)}')

# Keep only rows whose id is unique. `.copy()` makes the result an independent
# frame so later column edits do not act on a slice of the raw data.
df = df[~df['id'].isin(duplicates)].copy()

In [None]:
# Remove the columns that are not used as features: 'id' (no longer needed
# once the duplicate check is done) and 'Unnamed: 32'.
# NOTE(review): 'Unnamed: 32' is presumably an empty artefact column produced
# when the CSV was parsed -- confirm against the raw file.
unused_columns = ['Unnamed: 32', 'id']
df = df.drop(columns=unused_columns)
print(df.columns)

In [None]:
# Encode the diagnosis column as 0/1 for benign/malignant:
# 'M' becomes 1, every other label becomes 0.
#
# Only encode while the column still holds the raw (non-numeric) labels.
# Without this guard, re-running the cell would compare the already-encoded
# integers with 'M' and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = (df['diagnosis'] == 'M').astype('int64')


df.info()


In [None]:
# Describe the dataset: display summary statistics for its columns.
df.describe()

In [None]:
# Check the distribution of the features, starting with radius_mean.
df['radius_mean'].plot(kind='hist')

In [None]:
# Distribution of texture_mean.
df['texture_mean'].plot(kind='hist')


In [None]:
# Distribution of perimeter_mean.
df['perimeter_mean'].plot(kind='hist')

In [None]:
# How many rows fall into each diagnosis class?
df['diagnosis'].value_counts()

In [None]:
# Separate the target (diagnosis) from the predictor columns.
y = df['diagnosis']
X = df.drop(columns='diagnosis')


# Hold out 30% of the rows as a test set. The remaining 70% is the training
# set, which is also what the cross-validation below is run on.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)


In [None]:
# # Logistic Regression

start = time.time()
# Instead of tuning manually, use GridSearchCV to find the best parameters for
# a scale -> L1-penalised logistic regression pipeline.
estimator = Pipeline([
    ('scale', StandardScaler()),
    ('clf', LogisticRegression(penalty='l1', solver='liblinear', random_state=1)),
])
# max_iter has to be an integer. np.linspace returns floats unless told
# otherwise, and scikit-learn's parameter validation rejects a float max_iter,
# so ask for an integer grid (the same way the tree grids build n_estimators).
logit_grid = {
    'clf__C': np.logspace(-2, 1, 20),
    'clf__max_iter': np.linspace(1000, 10000, 10, dtype=int),
}

# 10-fold cross-validated search, scored on accuracy, using all CPU cores.
model_logit = GridSearchCV(estimator, logit_grid, cv=10, scoring='accuracy', n_jobs=-1)
model_logit.fit(X_train, y_train)

end = time.time()
Ttime = end - start
print(Ttime)

# Last expression of the cell, so the winning parameters are displayed.
model_logit.best_params_



In [None]:
start = time.time()
# Tune the probability threshold used to turn the logistic model's predicted
# probabilities into class labels, using 5-fold CV on the training set.
thresholds = np.linspace(0, 1, 50)
# Remove the first and last candidate values (0 and 1).
thresholds = thresholds[1:-1]
# Per-threshold running totals, accumulated over the folds.
FPR_list = np.zeros(len(thresholds))
Accuracy_list = np.zeros(len(thresholds))
Youden_list = np.zeros(len(thresholds))
kfold = KFold(n_splits=5, shuffle=True, random_state=1)  # instantiate the folds

# Loop over the folds, then over every candidate threshold.
for train_index, valid_index in kfold.split(X_train):
    X_training, y_training = X_train.iloc[train_index], y_train.iloc[train_index]
    X_valid, y_valid = X_train.iloc[valid_index], y_train.iloc[valid_index]
    # Fit a fresh copy of the tuned pipeline on this fold's training part.
    # Calling model_logit.fit here instead would re-run the entire grid search
    # once per fold and leave model_logit trained on only the last fold's
    # subset of X_train, which is the model later cells evaluate on X_test.
    fold_model = clone(model_logit.best_estimator_)
    fold_model.fit(X_training, y_training)
    pred_prob = fold_model.predict_proba(X_valid)
    for i, value in enumerate(thresholds):
        # Predict class 1 when its probability exceeds the candidate threshold.
        yhat = np.where(pred_prob[:, 1] > value, 1, 0)
        # labels=[1, 0] puts the positive class first:
        # row 0 = actual 1, row 1 = actual 0; col 0 = predicted 1, col 1 = predicted 0.
        confmat = confusion_matrix(y_valid, yhat, labels=[1, 0])
        TP = confmat[0, 0]
        FN = confmat[0, 1]
        FP = confmat[1, 0]
        TN = confmat[1, 1]
        sensitivity = TP / (TP + FN)
        specificity = TN / (TN + FP)
        FPR = FP / (FP + TN)  # False Positive Rate
        Acc = (TP + TN) / confmat.sum()
        Youden = sensitivity + specificity - 1  # Youden index
        FPR_list[i] += FPR
        Accuracy_list[i] += Acc
        Youden_list[i] += Youden


# Pick the thresholds that satisfy each criterion. The totals are sums over the
# same five folds for every threshold, so they rank exactly like the means.
thresh_1 = thresholds[np.argmin(FPR_list)]       # lowest false positive rate
thresh_2 = thresholds[np.argmax(Accuracy_list)]  # highest accuracy
thresh_3 = thresholds[np.argmax(Youden_list)]    # highest Youden index

end = time.time()
Ttime = end - start
print(Ttime)

In [None]:
# Confusion matrix for the logistic model on the hold-out set, using the
# threshold that maximised the Youden index (thresh_3).
y_prob_logit = model_logit.predict_proba(X_test)[:, 1]
# Label a row 1 when its class-1 probability is above the threshold, else 0.
yhat_logit = (y_prob_logit > thresh_3).astype(int)
# labels=[1, 0] lists class 1 first in both rows (actual) and columns (predicted).
confmat_logit = confusion_matrix(y_test, yhat_logit, labels=[1, 0])
confmat_logit

In [None]:
# Random forest
# Tune a random forest model. Tuning `bootstrap` as well proved too slow, so
# its default is used.
start = time.time()

# max_features: 'auto' was removed from scikit-learn (and for classifiers it
# was only an alias of 'sqrt', so the old ['auto', 'sqrt'] grid searched the
# same setting twice). Search two genuinely different options instead.
rf_grid = {
    'n_estimators': np.linspace(100, 500, 5, dtype=int),
    'max_leaf_nodes': np.arange(5, 10),
    'min_samples_leaf': range(2, 10),
    'max_features': ['sqrt', 'log2'],
}
# 10-fold cross-validated search, scored on accuracy, using all CPU cores.
model_RF = GridSearchCV(RandomForestClassifier(random_state=1),
                        param_grid=rf_grid, cv=10, n_jobs=-1, scoring='accuracy')
model_RF.fit(X_train, y_train)

end = time.time()
Ttime = end - start
print(Ttime)

# Kept as the cell's last expression so the winning parameters are actually
# displayed; in the middle of the cell the value was computed and discarded.
model_RF.best_params_

In [None]:
start = time.time()
# Tune the probability threshold used to turn the random forest's predicted
# probabilities into class labels, using 5-fold CV on the training set.
thresholds = np.linspace(0, 1, 50)
# Remove the first and last candidate values (0 and 1).
thresholds = thresholds[1:-1]
# Per-threshold running totals, accumulated over the folds.
FPR_list = np.zeros(len(thresholds))
Accuracy_list = np.zeros(len(thresholds))
Youden_list = np.zeros(len(thresholds))
kfold = KFold(n_splits=5, shuffle=True, random_state=1)  # instantiate the folds

# Loop over the folds, then over every candidate threshold.
for train_index, valid_index in kfold.split(X_train):
    X_training, y_training = X_train.iloc[train_index], y_train.iloc[train_index]
    X_valid, y_valid = X_train.iloc[valid_index], y_train.iloc[valid_index]
    # Use the tuned random forest here. This loop previously fitted and
    # predicted with model_logit, so the threshRF_* values were really
    # logistic-regression thresholds. A fresh clone of the best estimator is
    # fitted per fold so model_RF itself stays trained on the full X_train.
    fold_model = clone(model_RF.best_estimator_)
    fold_model.fit(X_training, y_training)
    pred_prob = fold_model.predict_proba(X_valid)
    for i, value in enumerate(thresholds):
        # Predict class 1 when its probability exceeds the candidate threshold.
        yhat = np.where(pred_prob[:, 1] > value, 1, 0)
        # labels=[1, 0] puts the positive class first:
        # row 0 = actual 1, row 1 = actual 0; col 0 = predicted 1, col 1 = predicted 0.
        confmat = confusion_matrix(y_valid, yhat, labels=[1, 0])
        TP = confmat[0, 0]
        FN = confmat[0, 1]
        FP = confmat[1, 0]
        TN = confmat[1, 1]
        sensitivity = TP / (TP + FN)
        specificity = TN / (TN + FP)
        FPR = FP / (FP + TN)  # False Positive Rate
        Acc = (TP + TN) / confmat.sum()
        Youden = sensitivity + specificity - 1  # Youden index
        FPR_list[i] += FPR
        Accuracy_list[i] += Acc
        Youden_list[i] += Youden

end = time.time()
Ttime = end - start
print(Ttime)

# Pick the thresholds that satisfy each criterion. The totals are sums over the
# same five folds for every threshold, so they rank exactly like the means.
threshRF_1 = thresholds[np.argmin(FPR_list)]       # lowest false positive rate
threshRF_2 = thresholds[np.argmax(Accuracy_list)]  # highest accuracy
threshRF_3 = thresholds[np.argmax(Youden_list)]    # highest Youden index



In [None]:
# Confusion matrix for the random forest on the hold-out set, using the
# threshold that maximised accuracy (threshRF_2).
y_prob_RF = model_RF.predict_proba(X_test)[:, 1]
# Label a row 1 when its class-1 probability is above the threshold, else 0.
yhat_RF = (y_prob_RF > threshRF_2).astype(int)
# labels=[1, 0] lists class 1 first in both rows (actual) and columns (predicted).
confmat_RF = confusion_matrix(y_test, yhat_RF, labels=[1, 0])
confmat_RF

In [None]:
# Bagging
start = time.time()
# Tune a bagged tree over three parameters. The bagged model is a random
# forest with max_features=None, i.e. no feature subsampling at the splits.
bag_grid = {
    'n_estimators': np.linspace(100, 500, 5, dtype=int),
    'max_depth': range(2, 10),
    'min_samples_leaf': range(2, 15),
}

# 10-fold cross-validated search, scored on accuracy, using all CPU cores.
model_bag = GridSearchCV(RandomForestClassifier(max_features=None, random_state=1),
                         param_grid=bag_grid, cv=10, n_jobs=-1, scoring='accuracy')
model_bag.fit(X_train, y_train)

end = time.time()
Ttime = end - start
print(Ttime)

# Kept as the cell's last expression so the winning parameters are actually
# displayed; in the middle of the cell the value was computed and discarded.
model_bag.best_params_

In [None]:
start = time.time()
# Tune the probability threshold used to turn the bagged model's predicted
# probabilities into class labels, using 5-fold CV on the training set.
thresholds = np.linspace(0, 1, 50)
# Remove the first and last candidate values (0 and 1).
thresholds = thresholds[1:-1]
# Per-threshold running totals, accumulated over the folds.
FPR_list = np.zeros(len(thresholds))
Accuracy_list = np.zeros(len(thresholds))
Youden_list = np.zeros(len(thresholds))
kfold = KFold(n_splits=5, shuffle=True, random_state=1)  # instantiate the folds

# Loop over the folds, then over every candidate threshold.
for train_index, valid_index in kfold.split(X_train):
    X_training, y_training = X_train.iloc[train_index], y_train.iloc[train_index]
    X_valid, y_valid = X_train.iloc[valid_index], y_train.iloc[valid_index]
    # Fit a fresh copy of the tuned model on this fold's training part.
    # Calling model_bag.fit here instead would re-run the entire grid search
    # once per fold and leave model_bag trained on only the last fold's subset
    # of X_train, which is the model later cells evaluate on X_test.
    fold_model = clone(model_bag.best_estimator_)
    fold_model.fit(X_training, y_training)
    pred_prob = fold_model.predict_proba(X_valid)
    for i, value in enumerate(thresholds):
        # Predict class 1 when its probability exceeds the candidate threshold.
        yhat = np.where(pred_prob[:, 1] > value, 1, 0)
        # labels=[1, 0] puts the positive class first:
        # row 0 = actual 1, row 1 = actual 0; col 0 = predicted 1, col 1 = predicted 0.
        confmat = confusion_matrix(y_valid, yhat, labels=[1, 0])
        TP = confmat[0, 0]
        FN = confmat[0, 1]
        FP = confmat[1, 0]
        TN = confmat[1, 1]
        sensitivity = TP / (TP + FN)
        specificity = TN / (TN + FP)
        FPR = FP / (FP + TN)  # False Positive Rate
        Acc = (TP + TN) / confmat.sum()
        Youden = sensitivity + specificity - 1  # Youden index
        FPR_list[i] += FPR
        Accuracy_list[i] += Acc
        Youden_list[i] += Youden

end = time.time()
Ttime = end - start
print(Ttime)

# Pick the thresholds that satisfy each criterion. The totals are sums over the
# same five folds for every threshold, so they rank exactly like the means.
threshbag_1 = thresholds[np.argmin(FPR_list)]       # lowest false positive rate
threshbag_2 = thresholds[np.argmax(Accuracy_list)]  # highest accuracy
threshbag_3 = thresholds[np.argmax(Youden_list)]    # highest Youden index

In [None]:
# Confusion matrix for the bagged model on the hold-out set, using the
# threshold that minimised the false positive rate (threshbag_1).
y_prob_bag = model_bag.predict_proba(X_test)[:, 1]
# Label a row 1 when its class-1 probability is above the threshold, else 0.
yhat_bag = (y_prob_bag > threshbag_1).astype(int)
# labels=[1, 0] lists class 1 first in both rows (actual) and columns (predicted).
confmat_bag = confusion_matrix(y_test, yhat_bag, labels=[1, 0])
confmat_bag

In [None]:
# Boosted trees: tune gradient boosting over three parameters
# (n_estimators, max_depth and learning_rate).
start = time.time()

gboost_grid = {
    'n_estimators': np.linspace(100, 1000, 10, dtype=int),
    'max_depth': [1, 2, 3, 4],
    'learning_rate': np.arange(0.01, 0.1, 0.01),
}
# 10-fold cross-validated search, scored on accuracy, using all CPU cores.
model_gboost = GridSearchCV(GradientBoostingClassifier(min_samples_leaf=10, random_state=1),
                            param_grid=gboost_grid, cv=10, n_jobs=-1, scoring='accuracy')
model_gboost.fit(X_train, y_train)

# Print explicitly: more code follows in this cell, so a bare
# `model_gboost.best_params_` expression here would be computed and discarded.
print(model_gboost.best_params_)
end = time.time()
Ttime = end - start
print(Ttime)

# In[48]:

start = time.time()
# Tune the probability threshold used to turn the boosted model's predicted
# probabilities into class labels, using 5-fold CV on the training set.
thresholds = np.linspace(0, 1, 50)
# Remove the first and last candidate values (0 and 1).
thresholds = thresholds[1:-1]
# Per-threshold running totals, accumulated over the folds.
FPR_list = np.zeros(len(thresholds))
Accuracy_list = np.zeros(len(thresholds))
Youden_list = np.zeros(len(thresholds))
kfold = KFold(n_splits=5, shuffle=True, random_state=1)  # instantiate the folds

# Loop over the folds, then over every candidate threshold.
for train_index, valid_index in kfold.split(X_train):
    X_training, y_training = X_train.iloc[train_index], y_train.iloc[train_index]
    X_valid, y_valid = X_train.iloc[valid_index], y_train.iloc[valid_index]
    # Fit a fresh copy of the tuned model on this fold's training part.
    # Calling model_gboost.fit here instead would re-run the entire grid search
    # once per fold and leave model_gboost trained on only the last fold's
    # subset of X_train, which is the model the code below evaluates on X_test.
    fold_model = clone(model_gboost.best_estimator_)
    fold_model.fit(X_training, y_training)
    pred_prob = fold_model.predict_proba(X_valid)
    for i, value in enumerate(thresholds):
        # Predict class 1 when its probability exceeds the candidate threshold.
        yhat = np.where(pred_prob[:, 1] > value, 1, 0)
        # labels=[1, 0] puts the positive class first:
        # row 0 = actual 1, row 1 = actual 0; col 0 = predicted 1, col 1 = predicted 0.
        confmat = confusion_matrix(y_valid, yhat, labels=[1, 0])
        TP = confmat[0, 0]
        FN = confmat[0, 1]
        FP = confmat[1, 0]
        TN = confmat[1, 1]
        sensitivity = TP / (TP + FN)
        specificity = TN / (TN + FP)
        FPR = FP / (FP + TN)  # False Positive Rate
        Acc = (TP + TN) / confmat.sum()
        Youden = sensitivity + specificity - 1  # Youden index
        FPR_list[i] += FPR
        Accuracy_list[i] += Acc
        Youden_list[i] += Youden

end = time.time()
Ttime = end - start
print(Ttime)

# Pick the thresholds that satisfy each criterion. The totals are sums over the
# same five folds for every threshold, so they rank exactly like the means.
threshgb_1 = thresholds[np.argmin(FPR_list)]       # lowest false positive rate
threshgb_2 = thresholds[np.argmax(Accuracy_list)]  # highest accuracy
threshgb_3 = thresholds[np.argmax(Youden_list)]    # highest Youden index

# In[48]:
# Confusion matrix for the boosted model on the hold-out set, using the
# threshold that maximised the Youden index (threshgb_3).
y_prob_gboost = model_gboost.predict_proba(X_test)[:, 1]
# Label a row 1 when its class-1 probability is above the threshold, else 0.
yhat_gboost = (y_prob_gboost > threshgb_3).astype(int)
# labels=[1, 0] lists class 1 first in both rows (actual) and columns (predicted).
confmat_gboost = confusion_matrix(y_test, yhat_gboost, labels=[1, 0])
confmat_gboost

In [None]:
# #### Accuracy of every tuned model on the training set and the hold-out set
# Each score comes from the fitted search object's own `score` method, so it
# reflects the model's default predictions rather than the tuned thresholds.

# Logistic regression
logit_train_acc = model_logit.score(X_train, y_train)
print(f'Train Accuracy for logit model : {logit_train_acc:.4f}')
logit_test_acc = model_logit.score(X_test, y_test)
print(f'Test Accuracy for logist model - : {logit_test_acc:.4f}')

# Bagged trees
bag_train_acc = model_bag.score(X_train, y_train)
print(f'Train Accuracy for bagged model : {bag_train_acc:.4f}')
bag_test_acc = model_bag.score(X_test, y_test)
print(f'Test Accuracy for bagged model - : {bag_test_acc:.4f}')

# Random forest
rf_train_acc = model_RF.score(X_train, y_train)
print(f'Train Accuracy for Random Forest model : {rf_train_acc:.4f}')
rf_test_acc = model_RF.score(X_test, y_test)
print(f'Test Accuracy for Randon Forest model - : {rf_test_acc:.4f}')

# Gradient boosting
gboost_train_acc = model_gboost.score(X_train, y_train)
print(f'Train Accuracy for Boosting model : {gboost_train_acc:.4f}')
gboost_test_acc = model_gboost.score(X_test, y_test)
print(f'Test Accuracy for Boosting model - : {gboost_test_acc:.4f}')

In [None]:
# AUC of every model on the hold-out set, computed from the class-1
# probabilities predicted for X_test.
#
# The bagged model's probabilities are stored in y_prob_bag; the name
# y_prob_tree that used to be passed here is not created by any of the cells
# above and raised a NameError.
auc_inputs = [
    ('logit', y_prob_logit),
    ('bagged', y_prob_bag),
    ('Random Forest', y_prob_RF),
    ('Boosting', y_prob_gboost),
]
# Label each value so the four numbers can be told apart in the output.
for model_name, test_probs in auc_inputs:
    print(f'AUC for {model_name} model : {roc_auc_score(y_test, test_probs):.4f}')

In [None]:
# Classification report (per-class precision / recall / F1) for every model,
# built from the thresholded hold-out predictions made earlier.
report_inputs = [
    ("classification report for logistics model", yhat_logit),
    ('classification report of random forest model is', yhat_RF),
    ('classification report for bagged tree', yhat_bag),
    ('classification report for gradient boosting is', yhat_gboost),
]

# Print each heading followed by its report, in the order listed above.
for heading, test_predictions in report_inputs:
    print(heading)
    print(classification_report(y_test, test_predictions))


In [None]:
# Feature importances of the three tree-based models, side by side:
# importance_2 = bagged model, importance_3 = random forest,
# importance_4 = gradient boosting.
importance_columns = {
    'feature': X.columns.values,
    'importance_2': model_bag.best_estimator_.feature_importances_,
    'importance_3': model_RF.best_estimator_.feature_importances_,
    'importance_4': model_gboost.best_estimator_.feature_importances_,
}
importance = pd.DataFrame(importance_columns)
# Show the features ranked by the bagged model's importance, highest first.
importance.sort_values(by=['importance_2'], ascending=False)