# Breast Cancer (Diagnostic) Data Set


**Task : To predict whether the cancer is benign or malignant**

What Are the Symptoms of Breast Cancer?

New lump in the breast or underarm (armpit).

Thickening or swelling of part of the breast.

Irritation or dimpling of breast skin.


Redness or flaky skin in the nipple area or the breast.

Pulling in of the nipple or pain in the nipple area.

Nipple discharge other than breast milk, including blood.


# IMPORTING THE LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import warnings
import os 
warnings.filterwarnings("ignore")
import datetime


# LOADING THE DATASET

In [None]:
data=pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')


In [None]:
data.head()      #displaying the head of dataset they gives the 1st to 5 rows of the data

In [None]:
data.describe()      #description of dataset 

In [None]:
data.info()

In [None]:
data.shape       #569 rows and 33 columns

In [None]:
data.columns     #displaying the columns of dataset

In [None]:
data.value_counts

In [None]:
data.dtypes

In [None]:
data.isnull().sum()

**So we have to drop the Unnamed: 32 coulumn which contains NaN values**

In [None]:
data.drop('Unnamed: 32', axis = 1, inplace = True)


In [None]:
data

# VISUALIZING THE DATA

In [None]:
data.corr()

In [None]:
plt.figure(figsize=(18,9))
sns.heatmap(data.corr(),annot = True, cmap ="Accent_r")





In [None]:
sns.barplot(x="id", y="diagnosis",data=data[160:190])
plt.title("Id vs Diagnosis",fontsize=15)
plt.xlabel("Id")
plt.ylabel("Diagonis")
plt.show()
plt.style.use("ggplot")


In [None]:
sns.barplot(x="radius_mean", y="texture_mean", data=data[170:180])
plt.title("Radius Mean vs Texture Mean",fontsize=15)
plt.xlabel("Radius Mean")
plt.ylabel("Texture Mean")
plt.show()
plt.style.use("ggplot")


In [None]:
 
mean_col = ['diagnosis','radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean']

sns.pairplot(data[mean_col],hue = 'diagnosis', palette='Accent')


In [None]:
sns.violinplot(x="smoothness_mean",y="perimeter_mean",data=data)

In [None]:
plt.figure(figsize=(14,7))
sns.lineplot(x = "concavity_mean",y = "concave points_mean",data = data[0:400], color='green')
plt.title("Concavity Mean vs Concave Mean")
plt.xlabel("Concavity Mean")
plt.ylabel("Concave Points")
plt.show()



In [None]:
worst_col = ['diagnosis','radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst']

sns.pairplot(data[worst_col],hue = 'diagnosis', palette="CMRmap")

# TRAINING AND TESTING DATA

In [None]:
# Getting Features

x = data.drop(columns = 'diagnosis')

# Getting Predicting Value
y = data['diagnosis']


In [None]:

#train_test_splitting of the dataset
from sklearn.model_selection import train_test_split 
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=0)


In [None]:
print(len(x_train))


In [None]:
print(len(x_test))

In [None]:
print(len(y_train))

In [None]:
print(len(y_test))

# MODELS

# 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()
reg.fit(x_train,y_train)                         


In [None]:
y_pred=reg.predict(x_test)
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,r2_score
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print("Training Score: ",reg.score(x_train,y_train)*100)




In [None]:
data = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
data






In [None]:
print(accuracy_score(y_test,y_pred)*100)

**So we get a accuracy score of 58.7 % using logistic regression**

In [None]:
from sklearn.model_selection import GridSearchCV
param = {
         'penalty':['l1','l2'],
         'C':[0.001, 0.01, 0.1, 1, 10, 20,100, 1000]
}
lr= LogisticRegression(penalty='l1')
cv=GridSearchCV(reg,param,cv=5,n_jobs=-1)
cv.fit(x_train,y_train)
cv.predict(x_test)


In [None]:
print("Best CV score", cv.best_score_*100)

# 2. DECISION TREE CLASSIFIER

In [None]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(max_depth=6, random_state=123)

dtree.fit(x_train,y_train)

#y_pred = dtree.predict(x_test)


In [None]:
y_pred=dtree.predict(x_test)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,mean_squared_error
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print("Training Score: ",dtree.score(x_train,y_train)*100)



In [None]:
print(accuracy_score(y_test,y_pred)*100)

**So we get a accuracy score of 94.73 % using Decision Tree Classifier**

# 3. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier()
rfc.fit(x_train,y_train)



In [None]:
y_pred=rfc.predict(x_test)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,mean_squared_error
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print("Training Score: ",rfc.score(x_train,y_train)*100)


In [None]:
print(accuracy_score(y_test,y_pred)*100)

**So we get a accuracy score of 96.49 % using Random Forest Classifier**

# 4. KNeighborsClassifier



In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=7)

knn.fit(x_train,y_train)


In [None]:
y_pred=knn.predict(x_test)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,mean_squared_error,r2_score
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print("Training Score: ",knn.score(x_train,y_train)*100)
print(knn.score(x_test,y_test))


In [None]:
print(accuracy_score(y_test,y_pred)*100)


**So we get a accuracy score of 70.17 % using KNeighborsClassifier**

# 5. SVC

In [None]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(x_train, y_train)


In [None]:
y_pred=svc.predict(x_test)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,mean_squared_error,r2_score
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print("Training Score: ",svc.score(x_train,y_train)*100)
print(svc.score(x_test,y_test))


In [None]:
print("Training Score: ",svc.score(x_train,y_train)*100)

**So we get a accuracy score of 63.7 % using SVC**

# 6. AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
adb = AdaBoostClassifier(base_estimator = None)
adb.fit(x_train,y_train)







In [None]:
y_pred=adb.predict(x_test)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,mean_squared_error,r2_score
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print("Training Score: ",adb.score(x_train,y_train)*100)

In [None]:
print(accuracy_score(y_test,y_pred)*100)

**So we get a accuracy score of 98.24 % using AdaBoostClassifier**

#  7. Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc=GradientBoostingClassifier()
gbc.fit(x_train,y_train)


In [None]:
y_pred=gbc.predict(x_test)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,mean_squared_error,r2_score
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print("Training Score: ",gbc.score(x_train,y_train)*100)
print(gbc.score(x_test,y_test))


In [None]:
print(accuracy_score(y_test,y_pred)*100)

**So we get a accuracy score of 95.61 % using GradientBoostingClassifier**

# 8. XGBClassifier

In [None]:
from xgboost import XGBClassifier

xgb =XGBClassifier(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

xgb.fit(x_train, y_train)


In [None]:
y_pred=xgb.predict(x_test)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,mean_squared_error,r2_score
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print("Training Score: ",xgb.score(x_train,y_train)*100)
print(xgb.score(x_test,y_test))


In [None]:
print("Training Score: ",xgb.score(x_train,y_train)*100)

In [None]:
data = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
data

**So we get a accuracy score of 97.80 % using  XGBClassifier**

# 9. Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train,y_train)

In [None]:
y_pred=gnb.predict(x_test)
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,mean_squared_error,r2_score
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print("Training Score: ",gnb.score(x_train,y_train)*100)
print(gnb.score(x_test,y_test))


**So we get a accuracy score of 63.29 % using Naive Bayes**

In [None]:
data = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
data

**So now we conclude the accuracy of different models:**

**1. AdaBoost Classifier = 98.24 %**

**2. XGB Classifier= 97.84 %**

**3. Random Forest Classifier =96.57 %**

**4. Gradient Boosting Classifier= 95.66%**

**5. Decision Tree Classifier= 94.78 %**

**6. K Neighbours Classifier= 70.18 %**

**7. SVC = 63.80 %**

**8. Naiye Bayes= 63.30 %**

**9. Logistic Regression = 58.82%**


**Ada Boost Classifier got the highest accuracy**

# If you liked this notebook, please UPVOTE it.