In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
np.set_printoptions(precision=4)
from sklearn.metrics import accuracy_score, recall_score, precision_score , confusion_matrix, f1_score, roc_auc_score
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from IPython.display import display,Markdown,HTML
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw CSV from the Kaggle input directory and preview the first five rows.
# NOTE(review): the path is absolute and only resolves inside a Kaggle kernel.
df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')
df.head(5)

In [None]:
# List the column names as loaded, before any columns are dropped.
df.columns

In [None]:
# Remove the columns that are not used in the analysis.
# errors='ignore' keeps the cell re-runnable: without it, a second execution
# raises KeyError because the columns are already gone from `df`.
df = df.drop(['id', 'Unnamed: 32'], axis=1, errors='ignore')

In [None]:
# Confirm which columns remain after the drop.
df.columns

In [None]:
# Encode the target: 'M' -> 1, every other value -> 0.
# The dtype guard makes the cell idempotent. Without it, re-running the cell
# compares the already-encoded integers against 'M' and silently turns every
# label into 0.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = [1 if x == 'M' else 0 for x in df['diagnosis']]
df['diagnosis']

In [None]:
# Display labels for the two classes and the name of the target column.
text_negative, text_positive = "Negative", "Positive"
target_column = "diagnosis"

# Independent copy of the full frame; the plotting helpers read from it.
df_all = df.copy()

# Per-class subsets: target == 1 is the positive class, target == 0 the negative.
df_positive = df.loc[df[target_column] == 1]
df_negative = df.loc[df[target_column] == 0]

In [None]:
def plot_pie(column, title="All Group/Class"):
    """Draw a pie chart of the value frequencies of `column` in `df_all`.

    Args:
        column: Name of the column in the module-level `df_all` frame.
        title: Title shown above the chart.
    """
    _, ax = plt.subplots()
    counts = df_all[column].value_counts()
    # Each wedge is labelled with its value and its share to two decimals.
    ax.pie(counts, autopct='%1.2f%%', labels=counts.index)
    ax.set_title(title)
    plt.show()
    
def plot_hist(column, title="All Group/Class"):
    """Draw a density-normalised histogram of `column` from `df_all`.

    Args:
        column: Name of the column in the module-level `df_all` frame.
        title: Title shown above the chart.
    """
    ax = plt.gca()
    ax.hist(df_all[column], density=True)
    ax.set_title(title)
    plt.show()

def plot_bar(column, sort=False, title="All Group/Class"):
    """Draw a bar chart of the value frequencies of `column` in `df_all`.

    Args:
        column: Name of the column in the module-level `df_all` frame.
        sort: When truthy, bars are ordered by value (index order) rather than
            in the order `value_counts()` returns them.
        title: Title shown above the chart.
    """
    counts = df_all[column].value_counts()
    if sort:
        counts = counts.sort_index()
    ax = plt.gca()
    # Values are cast to str for use as the bar labels.
    ax.bar(counts.index.astype(str), counts)
    ax.set_title(title)
    plt.show()
    
def plot_bar_compare(column, sort=False):
    """Draw stacked bar charts of `column` frequencies: negative class on top, positive below.

    Args:
        column: Name of a column present in `df_negative` and `df_positive`.
        sort: When truthy, bars are ordered by value (index order) rather than
            in the order `value_counts()` returns them.
    """
    def _tally(frame):
        counts = frame[column].value_counts()
        return counts.sort_index() if sort else counts

    positive_counts = _tally(df_positive)
    negative_counts = _tally(df_negative)

    fig, axs = plt.subplots(2, 1)
    fig.subplots_adjust(left=0, bottom=0, right=1, top=2, wspace=0, hspace=0.2)
    panels = ((text_negative, negative_counts), (text_positive, positive_counts))
    for ax, (label, counts) in zip(axs, panels):
        ax.bar(counts.index.astype(str), counts)
        ax.title.set_text(label)
    plt.show()

def plot_hist_compare(column, bins=5):
    """Draw side-by-side histograms of `column` for the negative and positive classes.

    Args:
        column: Name of a column present in `df_negative` and `df_positive`.
        bins: Number of bins (or bin specification) forwarded to `plt.hist`.
            Previously this argument was accepted but never used, so the
            histogram always fell back to matplotlib's default binning.
    """
    plt.hist(
        [df_negative[column], df_positive[column]],
        bins=bins,
        color=['c', 'r'],
    )
    # Legend order matches the order of the datasets passed to hist().
    plt.legend((text_negative, text_positive))
    plt.show()
    
def plot_pie_compare(column):
    """Draw stacked pie charts of `column` frequencies: negative class on top, positive below.

    Args:
        column: Name of a column present in `df_negative` and `df_positive`.
    """
    positive_counts = df_positive[column].value_counts()
    negative_counts = df_negative[column].value_counts()

    fig, axs = plt.subplots(2, 1)
    fig.subplots_adjust(left=0, bottom=0, right=1, top=2, wspace=0, hspace=0.2)
    panels = ((text_negative, negative_counts), (text_positive, positive_counts))
    for ax, (label, counts) in zip(axs, panels):
        # Each wedge is labelled with its value and its share to two decimals.
        ax.pie(counts, autopct='%1.2f%%', labels=counts.index)
        ax.title.set_text(label)
    plt.show()

def plot_boxplot(column, title=""):
    """Draw a box plot of `column` from `df_all`, one box per target class.

    Args:
        column: Name of the column plotted on the y axis.
        title: Title shown above the chart (empty by default).
    """
    axes = sns.boxplot(
        data=df_all,
        x=target_column,
        y=column,
        hue=target_column,
        palette=["c", "r"],
    )
    axes.set_title(title, fontsize=15)
    plt.show()

def check_median(column):
    """Print the median (the '50%' entry of `describe()`) of `column` for each class.

    Args:
        column: Name of a column present in `df_negative` and `df_positive`.
    """
    summaries = [
        (text_negative, df_negative[column].describe()),
        (text_positive, df_positive[column].describe()),
    ]
    print("Median:")
    for label, summary in summaries:
        print('{}: {}'.format(label, summary['50%']))

def check_most(column):
    """Print the first value returned by `value_counts()` of `column` for each class.

    Args:
        column: Name of a column present in `df_negative` and `df_positive`.
    """
    tallies = [
        (text_negative, df_negative[column].value_counts()),
        (text_positive, df_positive[column].value_counts()),
    ]
    print("Most:")
    for label, tally in tallies:
        # value_counts() orders by descending frequency, so index[0] is the mode.
        print('{}: {}'.format(label, tally.index[0]))

In [None]:
def eda(df_all):
    """Render a per-column exploratory report for every non-target column.

    For each column of `df_all` other than `target_column`:
      * int64/float64 or object column with more than 10 distinct values:
        numeric -> box plot per class plus per-class medians;
        object  -> the five most frequent values are displayed.
      * int64/float64 or object column with at most 10 distinct values:
        bar chart, pie chart, per-class pie charts and per-class mode.
      * any other dtype: only the column header is shown.

    Args:
        df_all: Frame whose columns are inspected.

    Note:
        The plotting/summary helpers called here read the module-level
        `df_all`, `df_positive` and `df_negative` frames rather than this
        argument, so the argument should be that same frame.
    """
    display(HTML('<h1>Exploratory Data Analysis</h1>'))

    for column in df_all.columns:
        if column == target_column:
            continue
        display(HTML('<h2>{}</h2>'.format(column)))

        # Inspect the frame that was passed in, not the global `df`.
        series = df_all[column]
        is_numeric = series.dtype == 'int64' or series.dtype == 'float64'
        is_text = series.dtype == 'object'
        if not (is_numeric or is_text):
            continue

        if len(series.unique()) > 10:
            if is_numeric:
                plot_boxplot(column)
                check_median(column)
            else:
                # Inside a function a bare expression is discarded, so the
                # table must be displayed explicitly.
                display(series.value_counts().head(5))
        else:
            plot_bar(column)
            plot_pie(column)
            plot_pie_compare(column)
            check_most(column)

In [None]:
# Class distribution of the encoded target (1 = 'M', 0 = everything else).
df['diagnosis'].value_counts()

In [None]:
# Pie chart of the target class proportions.
plot_pie('diagnosis')

In [None]:
# Run the per-column exploratory report on the full data set.
eda(df_all)

In [None]:
# Pairwise correlation matrix of all columns, including the encoded target.
data = df.corr()

In [None]:
# Order the rows by their correlation with the target, strongest positive first,
# then show just that column.
data = data.sort_values(by='diagnosis',ascending=False)
data['diagnosis']

In [None]:
# Names of the columns whose correlation with the target exceeds 0.5
# (the target itself is included, since its self-correlation is 1).
data[data['diagnosis']>0.5].index

# Data Preprocessing

In [None]:
# Work on a copy so `df` itself is left untouched.
X = df.copy()

# pop() returns the target column and removes it from X in a single step,
# leaving X with the feature columns only.
y = X.pop('diagnosis')

In [None]:
# Feature columns that remain once the target has been removed.
X.columns

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable.
# NOTE(review): the split is not stratified on y — consider stratify=y if class
# proportions should match between the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1234)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only, so the test set keeps its original rows.
sm = SMOTE(random_state=1234)

X_sm, y_sm = sm.fit_resample(X_train, y_train)

# Report the shape of the data SMOTE was actually fitted on (X_train).
# Printing X.shape here compared the resampled training set against the
# full, unsplit data set.
print(f'''Shape of X before SMOTE: {X_train.shape}
Shape of X after SMOTE: {X_sm.shape}''')

print('\nBalance of positive and negative classes (%):')
y_sm.value_counts(normalize=True) * 100

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the resampled training data only, then apply the same
# fitted transform to the test set.
sc = StandardScaler()

# NOTE(review): both names are overwritten with their scaled versions, so
# re-running this cell scales data that has already been scaled. Restart the
# kernel (or re-run from the split cell) before executing it again.
# NOTE(review): SMOTE ran on the unscaled features in the earlier cell;
# consider scaling before resampling — confirm the intended order.
X_sm = sc.fit_transform(X_sm)
X_test = sc.transform(X_test)

In [None]:
# Import ML Libraries
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Seed for every estimator that draws random numbers, so that a fresh
# Restart & Run All reproduces the same scores. Same value as the one used
# for the train/test split and SMOTE.
MODEL_SEED = 1234

# [estimator, display name] pairs, evaluated in this order.
# KNeighborsClassifier and GaussianNB take no seed; SVC and LogisticRegression
# are left at their defaults.
classifiers = [
    [CatBoostClassifier(verbose=0, random_seed=MODEL_SEED), 'CatBoost Classifier'],
    [XGBClassifier(eval_metric='error', random_state=MODEL_SEED), 'XGB Classifier'],
    [RandomForestClassifier(random_state=MODEL_SEED), 'Random Forest'],
    [KNeighborsClassifier(), 'K-Nearest Neighbours'],
    [SGDClassifier(random_state=MODEL_SEED), 'SGD Classifier'],
    [SVC(), 'SVC'],
    [LGBMClassifier(random_state=MODEL_SEED), 'LGBM Classifier'],
    [GaussianNB(), 'GaussianNB'],
    [DecisionTreeClassifier(random_state=MODEL_SEED), 'Decision Tree Classifier'],
    [LogisticRegression(), 'Logistic Regression'],
    [AdaBoostClassifier(random_state=MODEL_SEED), "AdaBoostClassifier"],
]

In [None]:
# Fit every classifier on the resampled, scaled training data and report its
# metrics on the held-out test set.
for model, name in classifiers:
    model.fit(X_sm, y_sm)

    y_pred = model.predict(X_test)

    # ROC AUC is a ranking metric and is meant to be computed from continuous
    # scores. Use class-1 probabilities when the estimator exposes them,
    # otherwise the decision function; fall back to the hard labels only when
    # neither is available.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        y_score = y_pred

    print(name)
    print ('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy : ", accuracy_score(y_test, y_pred) *  100)
    print("Recall : ", recall_score(y_test, y_pred) *  100)
    print("Precision : ", precision_score(y_test, y_pred) *  100)
    print("F1 : ", f1_score(y_test, y_pred) *  100)
    # Computed from hard 0/1 predictions, so this equals the mean of
    # sensitivity and specificity; kept for continuity with earlier runs.
    print("ROC AUC : ", roc_auc_score(y_test, y_pred) *  100)
    # Threshold-independent AUC computed from the continuous scores.
    print("ROC AUC (scores) : ", roc_auc_score(y_test, y_score) *  100)
    print('\n\n')

The best algorithms for predicting breast cancer are **SGD Classifier**, **SVC**, and **Logistic Regression**:

* Accuracy :  96.49122807017544
* Recall :  91.11111111111111
* Precision :  100.0
* F1 :  95.34883720930233
* ROC AUC :  95.55555555555554