# Breast Cancer Predictions

#### If you like my work, it would be great if you could upvote this notebook!
#### If not, leaving a comment on what I need to work on and improve would be really helpful!

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")

## Loading up the data

In [None]:
# Read the dataset CSV into a DataFrame and preview its first rows.
data_path = "../input/breast-cancer-wisconsin-data/data.csv"
df = pd.read_csv(data_path)
df.head()

In [None]:
# Remove the two columns that are not used as model inputs.
# errors="ignore" keeps this cell safe to re-run: `df` is rebound to the
# result, so on a second execution the columns are already gone and a plain
# drop would raise KeyError.
df = df.drop(columns=['Unnamed: 32', 'id'], errors="ignore")
df.head()

In [None]:
# Descriptive summary statistics for the columns of df
df.describe()

In [None]:
# Count the missing entries in every column
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Having a look at the correlation matrix
#
# `diagnosis` still holds the string labels at this point, so the frame is
# restricted to numeric columns explicitly: DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0 (older versions dropped them silently).

fig, ax = plt.subplots(figsize=(15, 15))
corr_matrix = df.select_dtypes(include="number").corr()
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(corr_matrix, annot=True, fmt='.1g', cmap="viridis", cbar=False, ax=ax);

#### `Malignant (M)`: has the potential to be dangerous 
#### `Benign (B)`: not dangerous to health  

In [None]:
# Number of samples per diagnosis class.
# The column is passed through `x=`/`data=` keywords: seaborn >= 0.12 treats
# the first positional argument as `data`, not `x`, so the positional form
# no longer means "count the values of this column".
fig, ax = plt.subplots(figsize=(7, 6))
sns.countplot(x="diagnosis", data=df, palette=["firebrick", "seagreen"], ax=ax);

In [None]:
# Making the dataset all numerical
# Malignant: 1, Benign: 0
#
# The mapping is guarded so the cell is idempotent: once the column is
# numeric, mapping it again would find no 'M'/'B' keys and turn every label
# into NaN.

if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
df.head()

## Splitting the data into training and test datasets
Here, we are trying to predict whether the diagnosis is Malignant (M) or Benign (B) using the given data. Hence, `diagnosis` will be the y label and the rest of the data will be X, the input data.

In [None]:
# Feature matrix: every column except the target
X = df.drop(columns="diagnosis")

In [None]:
# Preview the first rows of the feature matrix
X.head()

In [None]:
# Target vector: the diagnosis column
y = df.loc[:, "diagnosis"]
y.head()

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical between runs.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of rows in the training and test splits
len(X_train), len(X_test)

## **Training the model**

## RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples, random feature subsets).
# Fixing the seed, with the same value used for the train/test split, makes
# the reported accuracy reproducible across kernel restarts.
model = RandomForestClassifier(random_state=42)

In [None]:
# Train the random forest on the training split
model.fit(X_train, y_train)

In [None]:
# Predicted labels for the test split (displayed only; the result is not stored)
model.predict(X_test)

In [None]:
# Score of the fitted forest on the test split; kept for the final model comparison
RandomForestClassifierScore = model.score(X_test,y_test)

In [None]:
# Report the random forest's test score as a percentage
rf_accuracy_pct = RandomForestClassifierScore * 100
print("Accuracy obtained by Random Forest Classifier model:", rf_accuracy_pct)

## KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# NOTE(review): the first positional argument of KNeighborsClassifier is
# n_neighbors, so 42 here is the neighbour count, not a random seed.
# Confirm that k=42 is intended.
clf1 = KNeighborsClassifier(n_neighbors=42)

In [None]:
# Fit the k-nearest-neighbours classifier on the training split
clf1.fit(X_train,y_train)

In [None]:
# Predicted labels for the test split (displayed only; the result is not stored)
clf1.predict(X_test)

In [None]:
# Test-split score of the KNN model, stored for the final comparison and
# reported as a percentage
KNeighborsClassifierScore = clf1.score(X_test, y_test)
knn_accuracy_pct = KNeighborsClassifierScore * 100
print("Accuracy obtained by K Neighbors Classifier model:", knn_accuracy_pct)

## DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# scikit-learn permutes the candidate features at every split, so an unseeded
# tree can differ between runs. Fixing the seed, with the same value used for
# the train/test split, makes the reported accuracy reproducible.
tree = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the decision tree on the training split
tree.fit(X_train,y_train)

In [None]:
# Predicted labels for the test split (displayed only; the result is not stored)
tree.predict(X_test)

In [None]:
# Test-split score of the decision tree, stored for the final comparison and
# reported as a percentage
DecisionTreeClassifierScore = tree.score(X_test, y_test)
tree_accuracy_pct = DecisionTreeClassifierScore * 100
print("Accuracy obtained by Decision Tree Classifier model:", tree_accuracy_pct)

## CatBoostClassifier

In [None]:
from catboost import CatBoostClassifier
# CatBoost model capped at 10 boosting iterations
cat = CatBoostClassifier(iterations=10)
# The trailing semicolon suppresses the notebook's display of fit()'s return value
cat.fit(X_train, y_train);

In [None]:
# Test-split score of the CatBoost model, stored for the final comparison and
# reported as a percentage
CatBoostClassifierScore = cat.score(X_test, y_test)
catboost_accuracy_pct = CatBoostClassifierScore * 100
print("Accuracy obtained by CatBoost Classifier model:", catboost_accuracy_pct)

## AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# `base_estimator` was renamed to `estimator` in scikit-learn 1.2 and removed
# in 1.4, where passing it raises TypeError. None was its default, so leaving
# the argument out selects the same default weak learner on every version.
# random_state is fixed, with the same value used for the train/test split,
# so the fitted ensemble and its score are reproducible.
adb = AdaBoostClassifier(random_state=42)
adb.fit(X_train, y_train)

In [None]:
# Score the AdaBoost model itself (`adb`). The previous version called
# `cat.score(...)`, so the AdaBoost figure was a copy of the CatBoost accuracy.
AdaBoostClassifierScore = adb.score(X_test, y_test)
print("Accuracy obtained by AdaBoost Classifier model:", AdaBoostClassifierScore * 100)

## Comparing performance of the models

In [None]:
# Bar chart of the test-split accuracy of every model trained above.
# The lists get their own names instead of `x`/`y`: rebinding `y` would
# overwrite the target Series, and re-running the train/test split cell
# afterwards would then fail.
model_names = ["Decision Tree Classifier", "K Neighbors Classifier", "Random Forest Classifier", "AdaBoost Classifier", "CatBoost Classifier"]
model_scores = [DecisionTreeClassifierScore, KNeighborsClassifierScore, RandomForestClassifierScore, AdaBoostClassifierScore, CatBoostClassifierScore]

fig, ax = plt.subplots(figsize=(8, 6))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.barplot(x=model_names, y=model_scores, palette="crest", ax=ax);
ax.set_ylabel("Model Accuracy")
plt.xticks(rotation=40);