# Breast Cancer Prediction Model

## Importing primary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
import plotly.express as px
import pandas_profiling as pp
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the breast-cancer CSV from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")
# Bare expression: the notebook renders the DataFrame as the cell output.
df

In [None]:
# Print the column names, dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Percentage of missing values per column, shown with the worst columns first.
missing_pct = df.isna().sum() * 100 / len(df)
missing_pct.round(2).sort_values(ascending=False)

In [None]:
# Remove the "Unnamed: 32" and "id" columns in place; neither is used as a feature.
df.drop(columns=["Unnamed: 32", "id"], inplace=True)

## Basic Preprocessing and EDA

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the diagnosis labels with integer codes. LabelEncoder numbers the
# classes in sorted label order; the fitted encoder is kept in `lb` so the
# mapping can be inspected or inverted later.
lb = LabelEncoder()
lb.fit(df["diagnosis"])
df["diagnosis"] = lb.transform(df["diagnosis"])

In [None]:
# Pie chart of how many columns there are of each dtype.
plt.figure(figsize=(15, 6))
dtype_counts = df.dtypes.value_counts()
# One explode offset per wedge: a fixed two-element list raises as soon as the
# frame holds anything other than exactly two distinct dtypes.
dtype_counts.plot.pie(explode=[0.3] * len(dtype_counts), autopct='%1.2f%%', shadow=True)

In [None]:
# Bar chart of the number of rows per encoded diagnosis class.
sns.countplot(data = df , x = "diagnosis" )

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = df.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")

### Dropping Highly correlated columns / Features 

In [None]:
# Drop the two perimeter columns in place (see the correlation heatmap above).
df.drop(columns=["perimeter_mean", "perimeter_worst"], inplace=True)

In [None]:
# One histogram per column; the trailing semicolon suppresses the text output
# of the cell so that only the figure is shown.
df.hist(edgecolor = "black" , figsize = (15 , 15));

## Outliers Treatment
1. Skewness in the range of [-3 , 3]
2. Kurtosis in the range of [-10 , 10]

In [None]:
# Report every column whose skewness lies outside [-3, 3] and plot its
# distribution split by diagnosis.
# The skewness is computed once: calling df.skew() on every access rescans the
# whole frame several times per iteration.
skewness = df.skew()

# Iterating over (label, value) pairs avoids positional indexing on a
# label-indexed Series and guarantees the plotted column is the reported one.
for column, value in skewness.items():
    if value > 3 or value < -3:
        print(f"{column} with skewness of {value : >{20}}")
        print("\n")
        plt.figure(figsize = (15 , 6))
        sns.histplot(data = df , x = column , hue = "diagnosis" , kde = True)
        plt.show()
        print("\n\n")

In [None]:
# Report every column whose kurtosis lies outside [-10, 10] and plot its
# distribution split by diagnosis.
# The kurtosis is computed once: calling df.kurtosis() on every access rescans
# the whole frame several times per iteration.
kurt = df.kurtosis()

# Iterating over (label, value) pairs avoids positional indexing on a
# label-indexed Series and guarantees the plotted column is the reported one.
for column, value in kurt.items():
    if value > 10 or value < -10:
        print(f"{column} with kurtosis of {value : >{20}}")
        print("\n")
        plt.figure(figsize = (15 , 6))
        sns.histplot(data = df , x = column , hue = "diagnosis" , kde = True)
        plt.show()
        print("\n\n")

The columns flagged above contain outliers.

In [None]:
# Keep an independent copy of the data as it is before the outlier treatment
# below modifies `df`.
df_temp = df.copy()

### Treating Outliers

In [None]:
# Number of rows per encoded diagnosis class.
df["diagnosis"].value_counts()

In [None]:
# Percentile cutoff method: cap extreme values of the heavy-tailed columns.

outs = ["radius_se" , "perimeter_se" , "area_se" , "smoothness_se" , "concavity_se" , "fractal_dimension_se"]

for col in outs:
    # Values below 0.3x the 1st percentile are raised to the 1st percentile.
    # df.loc[mask, col] writes into `df` itself; the chained form
    # df[col].loc[mask] = ... may assign to a temporary copy and is a silent
    # no-op under pandas Copy-on-Write.
    low_cut = np.percentile(df[col], 1)
    df.loc[df[col] < low_cut * 0.3, col] = low_cut

    # Values above 3x the 99th percentile are lowered to the 99th percentile.
    # The percentile is taken after the lower cap, matching the original order.
    high_cut = np.percentile(df[col], 99)
    df.loc[df[col] > high_cut * 3, col] = high_cut

In [None]:
# Log transform: replace each treated column x with log(1 + x) to reduce the
# influence of its largest values. np.log1p is the numerically accurate form
# of np.log(x + 1) for small x, and it is applied to all columns at once.
df[outs] = np.log1p(df[outs])

### Rechecking for outliers after the Treatment

In [None]:
# Re-run the skewness scan on the treated data: report every column whose
# skewness is still outside [-3, 3] and plot its distribution by diagnosis.
# The skewness is computed once instead of on every access.
skewness_after = df.skew()

# Iterating over (label, value) pairs avoids positional indexing on a
# label-indexed Series and guarantees the plotted column is the reported one.
for column, value in skewness_after.items():
    if value > 3 or value < -3:
        print(f"{column} with skewness of {value : >{20}}")
        print("\n")
        plt.figure(figsize = (15 , 6))
        sns.histplot(data = df , x = column , hue = "diagnosis" , kde = True)
        plt.show()
        print("\n\n")

In [None]:
# Re-run the kurtosis scan on the treated data: report every column whose
# kurtosis is still outside [-10, 10] and plot its distribution by diagnosis.
# The kurtosis is computed once instead of on every access.
kurt_after = df.kurtosis()

# Iterating over (label, value) pairs avoids positional indexing on a
# label-indexed Series and guarantees the plotted column is the reported one.
for column, value in kurt_after.items():
    if value > 10 or value < -10:
        print(f"{column} with kurtosis of {value : >{20}}")
        print("\n")
        plt.figure(figsize = (15 , 6))
        sns.histplot(data = df , x = column , hue = "diagnosis" , kde = True)
        plt.show()
        print("\n\n")

Most of the outliers have been treated (capped and log-transformed).

### Checking for multicollinearity

In [None]:
from sklearn.feature_selection import mutual_info_classif as mif

# Mutual information between each feature and the diagnosis target.
# The feature frame is built once and reused for both the scores and the index.
mi_features = df.drop(columns=["diagnosis"])

# The estimator adds a small amount of random noise to continuous features;
# fixing random_state makes the scores reproducible between runs.
mif_values = mif(mi_features, df["diagnosis"], random_state=42)

# Features ranked from the most to the least informative about the target.
pd.DataFrame(mif_values, index=mi_features.columns).sort_values(by=0, ascending=False)

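Therefore, no issues of multicollinearity were found. Note that mutual information scores each feature's dependence on the target; it does not measure correlation between features.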

## Train Test Split

In [None]:
# Separate the target column from the feature matrix.
y = df["diagnosis"]
X = df.drop(columns=["diagnosis"])

In [None]:
# Class balance of the target.
y.value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show the size of each part as (X_train, X_test, y_train, y_test).
tuple(len(part) for part in split_parts)

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# Every feature column is scaled.
ints = X.columns

# Fit the scaler on the training rows only, then apply the same statistics to
# the test rows. New frames are built (keeping the row index and column names)
# instead of assigning columns into the frames returned by train_test_split,
# which is a SettingWithCopy pattern.
X_train = pd.DataFrame(scaler.fit_transform(X_train[ints]), columns=ints, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test[ints]), columns=ints, index=X_test.index)

## Model Fitting

In [None]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix , roc_auc_score , f1_score , accuracy_score , classification_report , roc_curve , auc , plot_roc_curve
from sklearn.model_selection import cross_val_score

In [None]:
# Candidate classifiers as (display name, estimator) pairs. The order matters:
# results are collected in this order further down.
models = [
    # 'logloss' is the binary metric matching objective='binary:logistic'
    # ('mlogloss' is its multiclass counterpart).
    ("XGBClassifier", XGBClassifier(objective='binary:logistic', random_state=42, eval_metric='logloss')),
    ("CatBoostClassifier", CatBoostClassifier(random_state=42, verbose=0)),
    ("RandomForest", RandomForestClassifier(random_state=42, n_estimators=200)),
    # Label corrected: the estimator is a classifier, not a regressor.
    ("ExtraTreesClassifier", ExtraTreesClassifier(random_state=42, n_estimators=200)),
    ("Gradient Boosting Classifier", GradientBoostingClassifier(random_state=42)),
    ("LightGBM", LGBMClassifier(random_state=42, n_estimators=200)),
    ("Logistic Regression", LogisticRegression(random_state=42)),
    ("KNeigbors", KNeighborsClassifier()),
]

In [None]:
def metrics(model , X_train , y_train , X_test , y_test , params = False):
    """Fit one named estimator, print an evaluation report and return its scores.

    Parameters
    ----------
    model : sequence
        ``(display_name, estimator)``. The estimator is fitted in place on the
        training data.
    X_train, y_train
        Data used for fitting, for the training score and for 10-fold
        cross-validation.
    X_test, y_test
        Held-out data used for every test metric and plot.
    params : bool, default False
        When true, also print ``estimator.best_params_``; the estimator must
        then expose that attribute.

    Returns
    -------
    tuple
        ``(row, fitted)`` where ``row`` is the list
        ``[name, train score, test score, CV mean, CV std, ROC AUC, F1, AUC]``
        and ``fitted`` is whatever ``estimator.fit`` returned.

    Notes
    -----
    ``ROC AUC`` is computed from predicted probabilities, while ``AUC`` is
    computed from the hard class predictions (a single operating point), so the
    two values generally differ.
    """
    name, estimator = model[0], model[1]

    fitted = estimator.fit(X_train , y_train)
    preds = estimator.predict(X_test)
    accuracies = cross_val_score(estimator = estimator, X = X_train , y = y_train, cv = 10)
    cm = confusion_matrix(y_test , preds)
    cf = classification_report(y_test , preds)
    roc = roc_auc_score(y_test , estimator.predict_proba(X_test)[: , 1])
    fpr, tpr, _ = roc_curve(y_test, preds)
    ac = auc(fpr, tpr)
    f1 = f1_score(y_test , preds)

    # Each score is computed once and reused for both the report and the row.
    train_score = estimator.score(X_train , y_train)
    test_score = estimator.score(X_test , y_test)
    cv_mean = np.mean(accuracies)
    cv_std = np.std(accuracies)

    print("\n")
    print(name)

    print("\n")
    if params:
        print(f"Best Parameters are : \n" , estimator.best_params_)
        print("\n")

    print(f"Confusion matrix : \n")
    plt.figure(figsize = (8, 5))
    sns.heatmap(cm, cmap = 'coolwarm', annot = True, annot_kws = {'fontsize': 20})
    plt.show()
    print("\n")

    print(f"Training score : {train_score:.4f}")
    print("\n")

    print(f"Test Score : {test_score:.4f}")
    print("\n")

    print(f"K-fold accuracy : {cv_mean:.4f}")
    print("\n")

    print(f"Standard Deviation of Accuracies in k-fold : {cv_std:.4f}")
    print("\n")

    print(f"ROC AUC Score: {roc:.4f}")
    print('\n')

    print(f"F1 Score: {f1:.4f}")
    print("\n")

    print(f"AUC : {ac:.4f}")
    print("\n")

    print(f"Classification report : \n\n{cf}")
    print("\n")

    plt.figure(figsize = (8, 5))
    # plot_roc_curve opens a figure of its own unless it is handed an axes,
    # which would leave the 8x5 figure above empty; draw on that figure.
    # NOTE: plot_roc_curve was removed in scikit-learn 1.2 in favour of
    # RocCurveDisplay.from_estimator.
    plot_roc_curve(estimator, X_test, y_test , ax = plt.gca() , color = '#FF4500')
    plt.plot([0, 1], [0, 1], linestyle = '--', color = '#7CFC00')
    plt.show()
    print("\n")
    print("*"*100)

    print("\n\n")

    sam = [name , train_score , test_score , cv_mean , cv_std , roc , f1 , ac]

    return sam , fitted

In [None]:
%%time

pre_final = []

for i in models:
    sam = metrics(i , X_train , y_train , X_test , y_test)
    pre_final.append(sam)

In [None]:
# Keep only the score rows, dropping the fitted estimators.
data_pre_final = [row for row, _ in pre_final]

## Model Evaluation and Visualization

In [None]:
# Results table: one row per model.
score_columns = ["Model", "Train Score", "Test Score", "K-fold Accuracy",
                 "K-fold Std", "ROC_AUC_Score", "F1 Score", "AUC"]
me = pd.DataFrame(data_pre_final, columns=score_columns)

# Rank the models: higher is better for every key except the k-fold standard
# deviation, where lower is better.
sort_keys = ["F1 Score", "AUC", "ROC_AUC_Score", "K-fold Std",
             "K-fold Accuracy", "Test Score", "Train Score"]
sort_ascending = [False, False, False, True, False, False, False]
me = me.sort_values(by=sort_keys, ascending=sort_ascending).reset_index(drop=True)
me

In [None]:
# Horizontal bar chart of the F1 score of each model.
plt.figure(figsize=(10, 6))
f1_axes = sns.barplot(data=me, x="F1 Score", y="Model")
f1_axes.set_title("Model Comparision based on F1 Score");

In [None]:
# Horizontal bar chart of the AUC column of each model.
plt.figure(figsize=(10, 6))
auc_axes = sns.barplot(data=me, x="AUC", y="Model")
auc_axes.set_title("Model Comparision based on AUC");

In [None]:
# Horizontal bar chart of the ROC AUC score of each model.
plt.figure(figsize=(10, 6))
roc_axes = sns.barplot(data=me, x="ROC_AUC_Score", y="Model")
roc_axes.set_title("Model Comparision based on ROC_AUC_Score");

In [None]:
# Horizontal bar chart of the mean k-fold accuracy of each model.
plt.figure(figsize=(10, 6))
kfold_axes = sns.barplot(data=me, x="K-fold Accuracy", y="Model")
kfold_axes.set_title("Model Comparision based on K-fold Accuracy");

## Model Evaluation with Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

# Reuse the full (name, estimator) list for the ensemble. This binds a second
# name to the same list object; it is not a copy.
voting_models = models

In [None]:
# Soft-voting ensemble over every estimator listed in voting_models.
voting_soft = VotingClassifier(estimators = voting_models , voting = "soft")

In [None]:
# Fit the ensemble on the training split.
voting_soft.fit(X_train , y_train)

In [None]:
def metrics_others(model , X_train , y_train , X_test , y_test , params = False):
    """Print an evaluation report for an already-fitted classifier and return its scores.

    Unlike ``metrics``, this function neither fits the model nor runs
    cross-validation.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``, ``predict_proba`` and ``score``.
    X_train, y_train
        Training data, used only for the training score.
    X_test, y_test
        Held-out data used for every test metric and plot.
    params : bool, default False
        Unused; kept so the signature mirrors ``metrics``.

    Returns
    -------
    list
        ``[train score, test score, ROC AUC, F1, AUC]``. ``ROC AUC`` is computed
        from predicted probabilities, ``AUC`` from the hard class predictions.
    """
    preds = model.predict(X_test)
    cm = confusion_matrix(y_test , preds)
    cf = classification_report(y_test , preds)
    roc = roc_auc_score(y_test , model.predict_proba(X_test)[: , 1])
    fpr, tpr, _ = roc_curve(y_test, preds)
    ac = auc(fpr, tpr)
    f1 = f1_score(y_test , preds)

    # Each score is computed once and reused for both the report and the result.
    train_score = model.score(X_train , y_train)
    test_score = model.score(X_test , y_test)

    print(f"Confusion matrix : \n")
    plt.figure(figsize = (8, 5))
    sns.heatmap(cm, cmap = 'coolwarm', annot = True, annot_kws = {'fontsize': 20})
    plt.show()
    print("\n")

    print(f"Training score : {train_score:.4f}")
    print("\n")

    print(f"Test Score : {test_score:.4f}")
    print("\n")

    print(f"ROC AUC Score: {roc:.4f}")
    print('\n')

    print(f"F1 Score: {f1:.4f}")
    print("\n")

    print(f"AUC : {ac:.4f}")
    print("\n")

    print(f"Classification report : \n\n{cf}")
    print("\n")

    plt.figure(figsize = (8, 5))
    # plot_roc_curve opens a figure of its own unless it is handed an axes,
    # which would leave the 8x5 figure above empty; draw on that figure.
    # NOTE: plot_roc_curve was removed in scikit-learn 1.2 in favour of
    # RocCurveDisplay.from_estimator.
    plot_roc_curve(model, X_test, y_test , ax = plt.gca() , color = '#FF4500')
    plt.plot([0, 1], [0, 1], linestyle = '--', color = '#7CFC00')
    plt.show()
    print("\n")
    print("*"*100)

    print("\n\n")

    return [train_score , test_score , roc , f1 , ac]

In [None]:
# Report for the soft-voting ensemble; `soft` holds its list of scores.
soft = metrics_others(voting_soft , X_train , y_train , X_test , y_test)

## Model Evaluation with Catboost

In [None]:
from catboost import CatBoostClassifier
# Hand-configured CatBoost model. It is set up with the multiclass loss and
# classes_count = 2, and its evaluation metric is the total F1 score.
# od_type / od_wait configure CatBoost's overfitting detector, which only has
# an effect when an evaluation set is passed to fit().
# NOTE(review): presumably od_wait = 50 stops training after 50 iterations
# without improvement on the evaluation set; confirm against the CatBoost docs.
cat = CatBoostClassifier(loss_function = "MultiClass", 
                         eval_metric = "TotalF1",
                         random_seed = 42 , 
                         classes_count = 2 ,
                         depth = 10 ,
                         iterations = 3500 , 
                         learning_rate = 0.1 ,
                         leaf_estimation_iterations = 1 ,
                         l2_leaf_reg = 1 ,
                         bootstrap_type = "Bayesian" , 
                         bagging_temperature = 1 , 
                         random_strength = 1 ,
                         od_type = "Iter", 
                         border_count = 100 ,
                         od_wait = 50)

In [None]:
%%time

cat.fit(X_train , y_train , use_best_model = True , eval_set=[(X_test , y_test)] , verbose = True)

In [None]:
# Class predictions of the CatBoost model on the test split.
cat_preds = cat.predict(X_test)

In [None]:
# F1 score of the CatBoost predictions on the test split.
f1_score(y_test , cat_preds)

In [None]:
# Report for the CatBoost model; `final_cat` holds its list of scores.
final_cat = metrics_others(cat , X_train , y_train , X_test , y_test)

## Final model can be Logistic Regression / CatBoost Classifier

### CatBoost

In [None]:
# F1 score of the CatBoost model on the test split.
f1_score(y_test , cat.predict(X_test))

In [None]:
# ROC AUC of the CatBoost model on the test split, computed from the predicted
# probability in column 1 of predict_proba.
roc_auc_score(y_test , cat.predict_proba(X_test)[: , 1])

### Linear Model

In [None]:
# Select the fitted Logistic Regression by its display name rather than by
# position: entry 5 of the evaluation results is LightGBM, not the linear model.
linear = next(fitted for row, fitted in pre_final if row[0] == "Logistic Regression")

In [None]:
# F1 score on the test split of the model bound to `linear`.
f1_score(y_test , linear.predict(X_test))

In [None]:
# ROC AUC on the test split of the model bound to `linear`, computed from the
# predicted probability in column 1 of predict_proba.
roc_auc_score(y_test , linear.predict_proba(X_test)[: , 1])

## Since CatBoost has the higher F1 score and ROC AUC score, we use CatBoost

In [None]:
# Final reported metric: F1 score of the CatBoost model on the test split.
f1_score(y_test , cat.predict(X_test))

# Don't forget to upvote if you like the notebook. Thank you.