# Breast Cancer Prediction

In this project, we predict whether a breast tumor is malignant (M) or benign (B) based on features of its cell nuclei.

<b>Importing Libraries</b>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Load the dataset from the Kaggle input directory into a DataFrame.
bc = pd.read_csv(filepath_or_buffer='/kaggle/input/breast-cancer-wisconsin-data/data.csv')

In [None]:
# Preview the first five rows.
bc.head(n=5)

<font size="4"><b>Exploratory Data Analysis</b></font>

In [None]:
# (rows, columns) of the loaded frame
bc.shape

In [None]:
# Column names, non-null counts and dtypes
bc.info()

In [None]:
# Summary statistics for the numeric columns
bc.describe()

In [None]:
# Count missing values per column; the raw boolean mask is too large to
# read and gets truncated in the notebook output anyway.
bc.isnull().sum()

In [None]:
# Visualise the missing-value mask: one row per sample, one column per field.
sns.heatmap(data=bc.isnull(), cmap='Pastel1', cbar=False, yticklabels=False)

From the heatmap above, we can see that every value in the column 'Unnamed: 32' is missing, so we remove that column.

In [None]:
# Remove the column identified above as entirely missing. Reassigning
# (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
bc = bc.drop(columns='Unnamed: 32', errors='ignore')

In [None]:
# Re-check the dimensions after dropping the column
bc.shape

Column is removed!

In [None]:
# Bar chart of how many samples fall in each diagnosis class.
sns.set_style(style="darkgrid")
sns.countplot(data=bc, x='diagnosis')

In [None]:
# Number of samples per diagnosis class
bc['diagnosis'].value_counts()

Since diagnosis is a categorical variable, we have to encode it as numbers before modelling.

In [None]:
#encoding categorical data values
# 1 = M, 0 = B  (LabelEncoder assigns codes in sorted label order)
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
# Assign by column name rather than bc.iloc[:, 1] = ...: positional setitem
# writes into the existing object-dtype column on recent pandas, leaving the
# target as dtype object, which scikit-learn classifiers may reject as an
# unknown label type. Replacing the column yields a proper integer dtype.
bc['diagnosis'] = labelencoder.fit_transform(bc['diagnosis'])

bc['diagnosis']

In [None]:
# Pairwise scatter plots of the diagnosis column plus the next four columns,
# coloured by diagnosis.
sns.pairplot(data=bc.iloc[:, 1:6], hue='diagnosis')

In [None]:
# Pearson correlation matrix for columns 1 through 11.
bc.iloc[:, 1:12].corr(method='pearson')

In [None]:
# Annotated heatmap of the same correlation matrix on a larger canvas.
plt.figure(figsize=(14, 10))
sns.heatmap(data=bc.iloc[:, 1:12].corr(), annot=True)

<font size="4"><b>Splitting and Scaling the Data</b></font>

In [None]:
# Features: every column after the diagnosis column. The previous slice
# 2:31 stopped one column short and silently dropped the last feature
# (after removing 'Unnamed: 32' the frame's columns are indexed 0..31).
X = bc.iloc[:, 2:].values
# Target: the encoded diagnosis column.
Y = bc.iloc[:, 1].values

In [None]:
# Hold out 30% of the samples for testing; fixed seed for reproducibility.
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, Y, random_state=101, test_size=0.30)
X_train, X_test, y_train, y_test = split_parts

In [None]:
#Scaling the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn mean/std from the training set only ...
X_train = scaler.fit_transform(X_train)
# ... and apply those same statistics to the test set. Calling
# fit_transform here would re-fit the scaler on the test data, so the two
# sets would be scaled inconsistently and test information would leak
# into preprocessing.
X_test = scaler.transform(X_test)

<font size="4"><b>Models and Accuracy</b></font>

In [None]:
def models(X_train,y_train):
    """Fit three classifiers and print each one's training accuracy.

    Parameters
    ----------
    X_train : feature matrix used for fitting and for the reported score.
    y_train : target labels aligned with ``X_train``.

    Returns
    -------
    tuple
        ``(log, decision_tree, random_forest)`` -- the fitted logistic
        regression, decision tree and random forest, in that order.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier

    # fit() returns the estimator itself, so construction and fitting chain.
    log = LogisticRegression(random_state=0).fit(X_train, y_train)
    decision_tree = DecisionTreeClassifier(
        criterion='entropy', random_state=0
    ).fit(X_train, y_train)
    random_forest = RandomForestClassifier(
        n_estimators=10, criterion='entropy', random_state=3
    ).fit(X_train, y_train)

    fitted = (log, decision_tree, random_forest)
    labels = (
        '[0]Logistic Regression Training Acc:',
        '[1]Decision Tree Training Acc:',
        '[2]Random Forest Training Acc:',
    )
    # Accuracy measured on the same data the models were trained on.
    for label, clf in zip(labels, fitted):
        print(label, clf.score(X_train, y_train))

    return fitted


In [None]:
# Train all three classifiers; `model` is the tuple (log, tree, forest).
model = models(X_train=X_train, y_train=y_train)

In [None]:
# Acc on Testing Data
from sklearn.metrics import confusion_matrix

for i, clf in enumerate(model):
    print('Model ', i)
    cm = confusion_matrix(y_test, clf.predict(X_test))

    # confusion_matrix puts actual classes on rows and predicted classes on
    # columns, in sorted label order. With 0 = B (negative) and 1 = M
    # (positive), the flattened 2x2 matrix reads TN, FP, FN, TP.
    # (The earlier cell names had TP/TN and FP/FN swapped; the accuracy
    # value is unaffected because the sums are the same.)
    tn, fp, fn, tp = cm.ravel()

    print(cm)
    print('Testing Acc = ', (tp + tn)/(tp + tn + fn + fp))
    print()

<font size="4"><b>Predictions</b></font>

In [None]:
# Logistic Regression: predicted labels, a blank line, then the true labels.
pred = model[0].predict(X_test)
print(pred, y_test, sep='\n\n')

In [None]:
# Decision Tree: predicted labels, a blank line, then the true labels.
pred = model[1].predict(X_test)
print(pred, y_test, sep='\n\n')

In [None]:
# Random Forest
# models() returns (log, decision_tree, random_forest), so the forest is
# at index 2; index 1 would repeat the decision tree's predictions.
pred = model[2].predict(X_test)
print(pred)
print()
print(y_test)