In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
# Ask pandas to treat +/-inf as missing values. The option was deprecated in pandas 2.1 and
# is not available in later major releases, where set_option raises OptionError (a KeyError
# subclass). Skip it there instead of aborting the whole import cell.
try:
    pd.set_option('use_inf_as_na', True)
except KeyError:
    pass
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score as acs
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.utils import resample
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the raw CSV and discard the "Unnamed: 32" column before previewing the first rows.
raw_csv = pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")
data = raw_csv.drop(columns=["Unnamed: 32"])
display(data.head())

In [None]:
#New data with "M"=1 and "B"=0
data1 = data.copy()


def classifier(data1):
    """Map one row's ``diagnosis`` label to "1" when it is "M" and to "0" otherwise.

    Parameters
    ----------
    data1 : mapping or pandas.Series
        A single row exposing a ``"diagnosis"`` entry.

    Returns
    -------
    str
        ``"1"`` for ``"M"``, ``"0"`` for any other value.
    """
    return "1" if data1["diagnosis"] == "M" else "0"


# Vectorized form of applying `classifier` to every row and parsing the result back to
# numbers: 1 where the label is "M", 0 everywhere else, stored as int64.
data1["diagnosis"] = data1["diagnosis"].eq("M").astype("int64")
# Store infinities as NaN so they count as missing values.
data1 = data1.replace([np.inf, -np.inf], np.nan)

# __Data Preprocessing__ 

In [None]:
# Column names and (rows, columns) shape, both taken from the encoded frame `data1`.
print(data1.columns,data1.shape)
# Further inspections that can be enabled when needed:
# print(data1.info())
# print(data1.describe().T)
# print(data1.nunique())

In [None]:
sns.set(style="whitegrid")
# Class balance of the raw labels, first as numbers and then as a bar chart.
print(data['diagnosis'].value_counts())
plt.figure(figsize = (10,6))
# Pass the column as `x=`: a positional first argument collides with `data=` on seaborn
# releases where `data` is the first parameter of countplot.
sns.countplot(x='diagnosis', data=data, palette='gist_heat')
plt.show()

In [None]:
# Distribution of every column of `data1` before the log transform, one subplot per column.
columns = list(data1.columns)
fig, ax1 = plt.subplots(8,4, figsize=(30,60))

# `ax1.flat` walks the 8x4 grid row by row. zip stops at the shorter sequence, so a column
# count that differs from 32 leaves axes empty (or columns unplotted) instead of raising
# an IndexError.
for ax, column in zip(ax1.flat, columns):
    sns.distplot(data1[column], ax = ax, color = 'red')
    # Label the subplot that was just drawn; plt.xlabel would only touch the current axes.
    ax.set_xlabel(column, size=20)
plt.show()

> **Note:** Most of our columns are highly right-skewed. These columns include compactness_mean, concavity_mean, concave points_mean, radius_se, perimeter_se, smoothness_se, compactness_se, concavity_se, symmetry_se, fractal_dimension_se, area_worst, compactness_worst, concavity_worst. So we need to transform them. Applying a log transform will solve the problem!

In [None]:
#Log transform
def log_transform(col):
    """Return the natural logarithm of the first element of ``col``.

    Parameters
    ----------
    col : pandas.Series or sequence
        The one-element row that ``DataFrame.apply(..., axis=1)`` passes in when a single
        column is selected with double brackets.

    Returns
    -------
    float
        ``np.log`` of the first element (``-inf`` for 0, ``nan`` for negative input).
    """
    # Take the first element by position. On a label-indexed Series, `col[0]` depends on
    # pandas' deprecated integer-key fallback.
    first = col.iloc[0] if hasattr(col, "iloc") else col[0]
    return np.log(first)


# Right-skewed columns that get the log transform.
SKEWED_COLUMNS = [
    "compactness_mean",
    "concavity_mean",
    "concave points_mean",
    "radius_se",
    "perimeter_se",
    "smoothness_se",
    "compactness_se",
    "concavity_se",
    "symmetry_se",
    "fractal_dimension_se",
    "area_worst",
    "compactness_worst",
    "concavity_worst",
]

# NOTE: this overwrites the columns of `data1` in place, so running the cell a second time
# would take the log of already-transformed values.
for column in SKEWED_COLUMNS:
    transformed = data1[[column]].apply(log_transform, axis=1)
    # A zero measurement becomes -inf under the log. Store it as NaN explicitly instead of
    # relying on the global 'use_inf_as_na' option to hide it.
    data1[column] = transformed.replace([np.inf, -np.inf], np.nan)

In [None]:
# Distribution of every column of `data1` after the log transform, one subplot per column.
columns = list(data1.columns)
fig, ax1 = plt.subplots(8,4, figsize=(30,60))

# `ax1.flat` walks the 8x4 grid row by row. zip stops at the shorter sequence, so a column
# count that differs from 32 cannot raise an IndexError.
for ax, column in zip(ax1.flat, columns):
    sns.distplot(data1[column], ax = ax, color = 'green')
plt.show()

**Correlation plot between the features:**

In [None]:
# Pairwise correlations of all columns except `id`, which is a record identifier rather
# than a measurement. The same matrix feeds the heatmap and the ranking below.
corr=data1.drop(["id"],axis=1).corr()

plt.figure(figsize=(16,8))
sns.heatmap(corr,annot=True,linewidth=1)
plt.show()

#Correlation of features with 'diagnosis' in descending order (computed once, reused for the plot)
diagnosis_corr = corr['diagnosis'].sort_values(ascending=False)
print(diagnosis_corr)

plt.figure(figsize=(16,8))
# Leave out the target's correlation with itself so only the features are plotted.
plt.plot(diagnosis_corr.drop('diagnosis'),color="cyan")
plt.title("Correlation of different features with 'Diagnosis'")
plt.xticks(rotation=90)
plt.show()

**Boxplots of the top 5 correlated features**

In [None]:
# One boxplot per feature, split by diagnosis, each shown as its own figure.
boxplot_features = [
    "concave points_worst",
    "perimeter_worst",
    "concave points_mean",
    "radius_worst",
    "perimeter_mean",
]
for feature in boxplot_features:
    sns.boxplot(data=data, x="diagnosis", y=feature)
    plt.show()

**Now we will be doing undersampling and oversampling.**

* The simplest implementation of over-sampling is to duplicate random records from the minority class, which can cause overfitting.
* In under-sampling, the simplest technique involves removing random records from the majority class, which can cause loss of information.

In [None]:
# Target number of rows per class and the seed shared by both sampling steps.
N_PER_CLASS = 300
SAMPLING_SEED = 12

data_M = data1[data1.diagnosis==1]     #Minority
data_B = data1[data1.diagnosis==0]     #Majority

# Over-sample the minority class with replacement up to N_PER_CLASS rows.
data_M_upsampled=resample(data_M,replace=True, n_samples=N_PER_CLASS, random_state=SAMPLING_SEED)
# Under-sample the majority class. The seed makes this draw reproducible, like the
# resample call above.
data_B_downsampled= data_B.sample(n=N_PER_CLASS, random_state=SAMPLING_SEED).reset_index(drop=True)

#New dataset for balanced data
Balanced_df = pd.concat([data_M_upsampled, data_B_downsampled]).reset_index(drop=True)

In [None]:
# Class counts of the balanced frame, printed and then drawn as a bar chart.
balanced_counts = Balanced_df["diagnosis"].value_counts()
print(balanced_counts)

plt.figure(figsize=(10, 6))
sns.countplot(data=Balanced_df, x='diagnosis', palette='gist_heat')
plt.show()

**Now the count for our output variable "diagnosis" has been made equal**

In [None]:
# Horizontal bar chart of the 20 columns whose correlation with the target variable
# "diagnosis" is largest in absolute value.
plt.figure(figsize=(15,15))
abs_corr_with_target = Balanced_df.corr().diagnosis.abs()
# Entry 0 of the descending ranking is skipped (presumably "diagnosis" itself, correlation 1).
top_20 = abs_corr_with_target.sort_values(ascending=False).iloc[1:21]
# Reverse the order so the largest value is drawn last.
top_20[::-1].plot(kind='barh', color='cyan')
plt.title("Top 20 highly correlated features", size=20, pad=26)
plt.xlabel("Correlation coefficient")
plt.ylabel("Features")
plt.show()

We will only be using the top 20 correlated features to train our model; this will help improve the accuracy.

In [None]:
# Correlation of every feature with the target. `id` (a record identifier) and the target
# itself are removed before ranking.
feature_corr = Balanced_df.drop(columns=["id"]).corr()["diagnosis"].drop("diagnosis")

# Rank by absolute correlation so strongly negative features are kept too, matching the
# "top 20" bar chart. The final reversal keeps the weakest-to-strongest order of the index.
selected_features = feature_corr.abs().sort_values(ascending=False).iloc[:20][::-1].index

X = Balanced_df[selected_features]
Y = Balanced_df.diagnosis

**Split data into training and testing sets**

In [None]:
# NOTE(review): X and Y are rebuilt here from the raw `data` frame (columns 2..31 as
# features, column 1 as the label that later cells score with pos_label="M"). The balanced,
# feature-selected X/Y assigned in the previous cell are therefore not what the models
# train on. Confirm this is intended.
X=data.iloc[:,2:32]
Y=data.iloc[:,1]

X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=0)

#Feature Scaling
scaler=StandardScaler()
# Learn the mean/std on the training split only ...
X_train=scaler.fit_transform(X_train)
# ... and reuse them for the test split. Re-fitting on the test data would scale it with
# different statistics and leak test-set information into preprocessing.
X_test=scaler.transform(X_test)

# Model 1=>Logistic Regression

In [None]:
# Logistic-regression model with a fixed random_state, fitted on the scaled training split.
LR_model=LogisticRegression(random_state=0)
LR_model.fit(X_train,Y_train)

In [None]:
# Predict labels for the scaled test features; Y_pred is reused by the evaluation cell.
Y_pred=LR_model.predict(X_test)

In [None]:
# Confusion matrix with the true classes on the rows and the predicted classes on the
# columns. `labels` pins the class order to match `class_label`; left to itself
# scikit-learn sorts the labels, which puts "B" (benign) before "M" (malignant).
class_label = ["malignant", "benign"]
cm=confusion_matrix(Y_test,Y_pred,labels=["M", "B"])
df_cm = pd.DataFrame(cm, index=class_label,columns=class_label)

# With average='binary' the support slot comes back as None; only the three scores are used.
precision, recall, fscore, train_support = score(Y_test, Y_pred, pos_label="M", average='binary')
# Convert to percent before rounding so no floating-point tail ends up in the output.
Logistic_Regression_accuracy=round(acs(Y_test,Y_pred)*100, 2)
print('Precision: {} \nRecall: {} \nF1-Score: {} \nAccuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore,3), Logistic_Regression_accuracy) +"% \n")

sns.heatmap(df_cm,annot=True,cmap='Pastel1',linewidths=2,fmt='d')
plt.title("Confusion Matrix",fontsize=15)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Model 2=>Support Vector Machine

In [None]:
# Support-vector classifier with an RBF kernel, fitted on the scaled training split.
svm=SVC(kernel="rbf",random_state=0)
svm.fit(X_train,Y_train)

In [None]:
# Predict labels for the scaled test features; Y_pred is reused by the evaluation cell.
Y_pred=svm.predict(X_test)

In [None]:
# Confusion matrix with the true classes on the rows and the predicted classes on the
# columns. `labels` pins the class order to match `class_label`; left to itself
# scikit-learn sorts the labels, which puts "B" (benign) before "M" (malignant).
class_label = ["malignant", "benign"]
cm=confusion_matrix(Y_test,Y_pred,labels=["M", "B"])
df_cm = pd.DataFrame(cm, index=class_label,columns=class_label)

# With average='binary' the support slot comes back as None; only the three scores are used.
precision, recall, fscore, train_support = score(Y_test, Y_pred, pos_label="M", average='binary')
# Convert to percent before rounding so no floating-point tail ends up in the output.
SVM_accuracy=round(acs(Y_test,Y_pred)*100, 2)
print('Precision: {} \nRecall: {} \nF1-Score: {} \nAccuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore,3), SVM_accuracy) +"% \n")

sns.heatmap(df_cm,annot=True,cmap='Pastel1',linewidths=2,fmt='d')
plt.title("Confusion Matrix",fontsize=15)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Model 3=>Decision Tree

In [None]:
# Single decision tree with a fixed random_state, fitted on the scaled training split.
tree=DecisionTreeClassifier(random_state=10)
tree.fit(X_train,Y_train)

In [None]:
# Predict labels for the scaled test features; Y_pred is reused by the evaluation cell.
Y_pred=tree.predict(X_test)

In [None]:
# Confusion matrix with the true classes on the rows and the predicted classes on the
# columns. `labels` pins the class order to match `class_label`; left to itself
# scikit-learn sorts the labels, which puts "B" (benign) before "M" (malignant).
class_label = ["malignant", "benign"]
cm=confusion_matrix(Y_test,Y_pred,labels=["M", "B"])
df_cm = pd.DataFrame(cm, index=class_label,columns=class_label)

# With average='binary' the support slot comes back as None; only the three scores are used.
precision, recall, fscore, train_support = score(Y_test, Y_pred, pos_label="M", average='binary')
# Convert to percent before rounding so no floating-point tail ends up in the output.
Decision_Tree_accuracy=round(acs(Y_test,Y_pred)*100, 2)
print('Precision: {} \nRecall: {} \nF1-Score: {} \nAccuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore,3), Decision_Tree_accuracy) +"% \n")

sns.heatmap(df_cm,annot=True,cmap='Pastel1',linewidths=2,fmt='d')
plt.title("Confusion Matrix",fontsize=15)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Model 4=>Random Forest Classifier

In [None]:
# Random forest of 60 trees with a fixed random_state, fitted on the scaled training split.
rfc=RandomForestClassifier(n_estimators=60,random_state=0)
rfc.fit(X_train,Y_train)

In [None]:
# Predict labels for the scaled test features; Y_pred is reused by the evaluation cell.
Y_pred=rfc.predict(X_test)

In [None]:
# Confusion matrix with the true classes on the rows and the predicted classes on the
# columns. `labels` pins the class order to match `class_label`; left to itself
# scikit-learn sorts the labels, which puts "B" (benign) before "M" (malignant).
class_label = ["malignant", "benign"]
cm=confusion_matrix(Y_test,Y_pred,labels=["M", "B"])
df_cm = pd.DataFrame(cm, index=class_label,columns=class_label)

# With average='binary' the support slot comes back as None; only the three scores are used.
precision, recall, fscore, train_support = score(Y_test, Y_pred, pos_label="M", average='binary')
# Convert to percent before rounding so no floating-point tail ends up in the output.
Random_forest_classifier_accuracy=round(acs(Y_test,Y_pred)*100, 2)
print('Precision: {} \nRecall: {} \nF1-Score: {} \nAccuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore,3), Random_forest_classifier_accuracy) +"% \n")

sns.heatmap(df_cm,annot=True,cmap='Pastel1',linewidths=2,fmt='d')
plt.title("Confusion Matrix",fontsize=15)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Model 5=>Gradient Boosting Classifier

In [None]:
# Gradient-boosting classifier with a fixed random_state, fitted on the scaled training split.
gbc=GradientBoostingClassifier(random_state=11)
gbc.fit(X_train,Y_train)

In [None]:
# Predict labels for the scaled test features; Y_pred is reused by the evaluation cell.
Y_pred=gbc.predict(X_test)

In [None]:
# Confusion matrix with the true classes on the rows and the predicted classes on the
# columns. `labels` pins the class order to match `class_label`; left to itself
# scikit-learn sorts the labels, which puts "B" (benign) before "M" (malignant).
class_label = ["malignant", "benign"]
cm=confusion_matrix(Y_test,Y_pred,labels=["M", "B"])
df_cm = pd.DataFrame(cm, index=class_label,columns=class_label)

# With average='binary' the support slot comes back as None; only the three scores are used.
precision, recall, fscore, train_support = score(Y_test, Y_pred, pos_label="M", average='binary')
# Convert to percent before rounding so no floating-point tail ends up in the output.
Gradient_Boosting_Classifier_accuracy=round(acs(Y_test,Y_pred)*100, 2)
print('Precision: {} \nRecall: {} \nF1-Score: {} \nAccuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore,3), Gradient_Boosting_Classifier_accuracy) +"% \n")

sns.heatmap(df_cm,annot=True,cmap='Pastel1',linewidths=2,fmt='d')
plt.title("Confusion Matrix",fontsize=15)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Model 6=>XGBoost Classifier

In [None]:
# XGBoost classifier using the tree booster, fitted on the scaled training split.
# NOTE(review): Y_train is taken straight from the raw label column (the evaluation cells
# score it with pos_label="M"), i.e. string labels. Newer xgboost releases reportedly
# expect integer-encoded classes; confirm this fit runs on the target xgboost version.
xgb=XGBClassifier(random_state=0,booster="gbtree")
xgb.fit(X_train,Y_train)

In [None]:
# Predict labels for the scaled test features; Y_pred is reused by the evaluation cell.
Y_pred=xgb.predict(X_test)

In [None]:
# Confusion matrix with the true classes on the rows and the predicted classes on the
# columns. `labels` pins the class order to match `class_label`; left to itself
# scikit-learn sorts the labels, which puts "B" (benign) before "M" (malignant).
class_label = ["malignant", "benign"]
cm=confusion_matrix(Y_test,Y_pred,labels=["M", "B"])
df_cm = pd.DataFrame(cm, index=class_label,columns=class_label)

# With average='binary' the support slot comes back as None; only the three scores are used.
precision, recall, fscore, train_support = score(Y_test, Y_pred, pos_label="M", average='binary')
# Convert to percent before rounding so no floating-point tail ends up in the output.
XG_boost_classifier_accuracy=round(acs(Y_test,Y_pred)*100, 2)
print('Precision: {} \nRecall: {} \nF1-Score: {} \nAccuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore,3), XG_boost_classifier_accuracy) +"% \n")

sns.heatmap(df_cm,annot=True,cmap='Pastel1',linewidths=2,fmt='d')
plt.title("Confusion Matrix",fontsize=15)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Model 7=>K-Nearest Neighbor (KNN) classification

In [None]:
# k-nearest-neighbours classifier with k=5, fitted on the scaled training split.
knn=KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train,Y_train)

In [None]:
# Predict labels for the scaled test features; Y_pred is reused by the evaluation cell.
Y_pred=knn.predict(X_test)

In [None]:
# Confusion matrix with the true classes on the rows and the predicted classes on the
# columns. `labels` pins the class order to match `class_label`; left to itself
# scikit-learn sorts the labels, which puts "B" (benign) before "M" (malignant).
class_label = ["malignant", "benign"]
cm=confusion_matrix(Y_test,Y_pred,labels=["M", "B"])
df_cm = pd.DataFrame(cm, index=class_label,columns=class_label)

# With average='binary' the support slot comes back as None; only the three scores are used.
precision, recall, fscore, train_support = score(Y_test, Y_pred, pos_label="M", average='binary')
# Convert to percent before rounding so no floating-point tail ends up in the output.
KNN_accuracy=round(acs(Y_test,Y_pred)*100, 2)
print('Precision: {} \nRecall: {} \nF1-Score: {} \nAccuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore,3), KNN_accuracy) +"% \n")

sns.heatmap(df_cm,annot=True,cmap='Pastel1',linewidths=2,fmt='d')
plt.title("Confusion Matrix",fontsize=15)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

**Method to find the best value of *n_neighbors* based on accuracy**

In [None]:
val=10   #Max value of n_neighbor

# Fit one KNN model per candidate k in 1..val and report its accuracy on the test split.
# NOTE(review): the accuracy is measured on the same test split used for the final scores,
# so picking k from this table gives an optimistic estimate. Consider a validation split.
for K_value in range(1, val + 1):
    model = KNeighborsClassifier(n_neighbors=K_value)
    model.fit(X_train,Y_train)
    # Kept in a separate name so the notebook-wide Y_pred is not overwritten by this sweep.
    sweep_pred = model.predict(X_test)
    print("Accuracy is : ", acs(Y_test,sweep_pred)*100,"% for n_neighbors: ", K_value)

# Model 8 => MLP Classifier

In [None]:
# Multi-layer perceptron with three hidden layers (150, 100 and 50 units), ReLU activation
# and the Adam solver, limited to 300 iterations; fitted on the scaled training split.
# Note: this rebinds the name `classifier`, which earlier in the notebook referred to the
# label-encoding helper function.
classifier = MLPClassifier(random_state=1,hidden_layer_sizes=(150,100,50), max_iter=300,activation = 'relu',solver='adam')
classifier.fit(X_train, Y_train)

In [None]:
# Predict labels for the scaled test features; Y_pred is reused by the evaluation cell.
Y_pred=classifier.predict(X_test)

In [None]:
# Confusion matrix with the true classes on the rows and the predicted classes on the
# columns. `labels` pins the class order to match `class_label`; left to itself
# scikit-learn sorts the labels, which puts "B" (benign) before "M" (malignant).
class_label = ["malignant", "benign"]
cm=confusion_matrix(Y_test,Y_pred,labels=["M", "B"])
df_cm = pd.DataFrame(cm, index=class_label,columns=class_label)

# With average='binary' the support slot comes back as None; only the three scores are used.
precision, recall, fscore, train_support = score(Y_test, Y_pred, pos_label="M", average='binary')
# Convert to percent before rounding so no floating-point tail ends up in the output.
MLP_classifier_accuracy=round(acs(Y_test,Y_pred)*100, 2)
print('Precision: {} \nRecall: {} \nF1-Score: {} \nAccuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore,3), MLP_classifier_accuracy) +"% \n")

sns.heatmap(df_cm,annot=True,cmap='Pastel1',linewidths=2,fmt='d')
plt.title("Confusion Matrix",fontsize=15)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

**Comparison of Accuracy**

In [None]:
# Test-set accuracy (in percent) of every model, keyed by its display name.
accuracies={"Random Forest": Random_forest_classifier_accuracy,
            "SVM": SVM_accuracy,
            "MLP Classifier": MLP_classifier_accuracy,
            "Gradient Boosting": Gradient_Boosting_Classifier_accuracy,
            "XG Boost": XG_boost_classifier_accuracy,
            "KNN": KNN_accuracy,
            "Logistic regression": Logistic_Regression_accuracy,
            "Decision Tree": Decision_Tree_accuracy}

#Plot accuracy for different models
model_names = list(accuracies.keys())
model_scores = list(accuracies.values())

plt.figure(figsize=(14,6))
plt.bar(model_names,model_scores,label="Accuracy")
plt.xlabel("Classifier Used")
plt.ylabel("Accuracy (%)")
# Keep the 90-100 window used so far, but extend it downwards when a model scores below
# 90 so that its bar stays visible.
plt.ylim(min(90, min(model_scores) - 1),100)
plt.legend()
plt.tight_layout()
plt.show()

# Analysis
__After using 8 different algorithms, we got the following accuracies:__
1. Logistic Regression - **95.61%**
2. Support Vector Machine - **98.25%**
3. Decision Tree - **93.86%**
4. Random Forest Classifier - **98.25%**
5. Gradient Boosting Classifier - **96.49%**
6. XGBoost Classifier - **96.49%**
7. K-nearest neighbor classification - **96.49%**
8. MLP Classifier - **97.37%**

This shows that **SVM** and **Random Forest Classifier** achieved the highest accuracy on this test split; both are widely used for classification problems.

> **NOTE:** XGBoost is also a very powerful algorithm when it comes to classification. We got just 96.49% accuracy using XGBoost here, where the training data (X_train) was scaled at the beginning using **StandardScaler**. Tree-based models do not need feature scaling, and scaling should not change their predictions as long as the training and test sets are transformed with the same fitted scaler.
> To obtain a better accuracy with XGBoost (almost 99%), train the model without scaling the training data.

**If you found this notebook useful, please do upvote!**

**If you have any suggestions or doubts, feel free to comment below!**

**Thank you!**