In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

**Reading the file**

In [None]:
# Load the CSV into a DataFrame, using the file's first column as the row index.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/breast-cancer-wisconsin-data/data.csv',
    index_col=0,
)

In [None]:
# Preview the first 10 rows of the loaded frame.
data.head(10)

#### Attribute Information:

(1) ID number

(2) Diagnosis (M = malignant, B = benign)

(3-32) Ten real-valued features are computed for each cell nucleus:

    a) radius (mean of distances from center to points on the perimeter)
    b) texture (standard deviation of gray-scale values)
    c) perimeter
    d) area
    e) smoothness (local variation in radius lengths)
    f) compactness (perimeter^2 / area - 1.0)
    g) concavity (severity of concave portions of the contour)
    h) concave points (number of concave portions of the contour)
    i) symmetry
    j) fractal dimension ("coastline approximation" - 1)

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Summary statistics for the frame's columns.
data.describe()

<h3><b>Handling the missing data in the dataset</b></h3>

In [None]:
# Number of missing (NaN) entries in each column.
data.isna().sum()

In [None]:
# Remove columns that contain no data at all.
# The original dropped whatever column happened to be last, so re-running the
# cell silently deleted a real feature each time. Dropping only all-NaN columns
# is idempotent and cannot remove populated features.
# NOTE(review): presumably the trailing column is the entirely-empty one reported
# by the isna() check above — confirm against that output.
data = data.dropna(axis='columns', how='all')

In [None]:
# Print the dtype of every remaining column.
print(data.dtypes)

In [None]:
# Count how many rows carry each diagnosis label.
data['diagnosis'].value_counts()

In [None]:
# Horizontal bar chart of the number of benign vs. malignant samples.
plt.rcdefaults()  # reset matplotlib rc settings to the library defaults

# Class sizes, counted from the 'diagnosis' column ('B' / 'M').
benign = int((data['diagnosis'] == 'B').sum())
malignant = int((data['diagnosis'] == 'M').sum())

# Descriptive local names: the original stored the bar labels in `y`, the same
# name the notebook later uses for the target, and opened a stray empty figure
# with plt.figure() before plt.subplots().
diagnosis_labels = ('Benign', 'Malignant')
counts = (benign, malignant)
bar_positions = np.arange(len(diagnosis_labels))

fig, ax = plt.subplots()
ax.barh(bar_positions, counts, align='center')
ax.set_xticks(np.arange(0, 401, 50))
ax.set_yticks(bar_positions)
ax.set_yticklabels(diagnosis_labels)
ax.invert_yaxis()  # labels read top-to-bottom
ax.set_xlabel('Count')
ax.set_title('Diagnosis')

# Write each count just to the right of the end of its bar.
for position, count in zip(bar_positions, counts):
    ax.text(count + 10, position, str(count), color='black', va='center', fontweight='normal')
plt.show()

In [None]:
# Positional split: the first column is taken as the target,
# every remaining column as a feature.
columns = data.columns
target_name, feature_names = columns[0], columns[1:]
y = data[target_name]
X = data[feature_names]

In [None]:
# Features are every column except the label; the target is the 'diagnosis' label.
# `columns=` replaces the positional axis argument (`drop('diagnosis', 1)`), which
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
X = data.drop(columns='diagnosis')
y = data['diagnosis']
print("Shape of X and Y:",X.shape,y.shape)

<h3><b>Splitting the data into training and testing dataset</b></h3>

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 pins the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Report the resulting shapes of both partitions.
print(
    'shape of X and y respectively (train) :',
    X_train.shape,
    y_train.shape,
)
print(
    'shape of X and y respectively (test) :',
    X_test.shape,
    y_test.shape,
)

<h2><b>Logistic Regression Model</b></h2>

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Fit the classifier on the training split (fit() returns the estimator itself).
model = LogisticRegression(max_iter=4550).fit(X_train, y_train)

# Mean accuracy on the data the model was trained on.
train_accuracy = model.score(X_train, y_train)
print("Score for training data: ", train_accuracy)

# Mean accuracy on the held-out split; kept under both names for later use.
score = score_log_reg = model.score(X_test, y_test)
print("Score for testing data: ", score)

# Per-class precision / recall / F1 for the held-out predictions.
y_pred = model.predict(X_test)
print("", "Classification Report:", classification_report(y_test, y_pred), sep="\n")

<h4><b>Confusion Matrix</b></h4>

In [None]:
from sklearn.metrics import confusion_matrix

# Fix the class order explicitly so rows/columns always mean the same thing,
# and label the axes with readable names instead of bare 0/1 indices.
# The original set `class_names = y_pred`, which used the whole prediction
# vector as tick labels; those ticks were then overwritten by seaborn.
class_codes = ['B', 'M']
class_names = ['Benign', 'Malignant']
confusion_mat = confusion_matrix(y_test, y_pred, labels=class_codes)

fig, ax = plt.subplots()
sns.heatmap(
    pd.DataFrame(confusion_mat, index=class_names, columns=class_names),
    annot=True,
    cmap='Reds',
    fmt='g',
    ax=ax,
)
ax.set_title('Confusion matrix')
ax.set_ylabel('Actual Value')
ax.set_xlabel('Predicted Value')
# Lay out after the title and axis labels exist so they are accounted for.
fig.tight_layout()
plt.show()

<h4><b>ROC Curve</b></h4>

In [None]:
from sklearn.metrics import roc_auc_score,roc_curve

# Predicted probability of the malignant class. The column is looked up by its
# label instead of assuming it sits at index 1.
malignant_column = list(model.classes_).index('M')
y_probabilities = model.predict_proba(X_test)[:, malignant_column]

# Binary ground truth (1 = malignant, 0 = benign), used for BOTH the curve and
# the AUC so the two are computed from the same labels. A boolean comparison
# avoids the downcasting FutureWarning raised by chained Series.replace calls.
y_test_binary = (y_test == 'M').astype(int)
false_positive_rate, true_positive_rate, threshold = roc_curve(y_test_binary, y_probabilities)
auc = roc_auc_score(y_test_binary, y_probabilities)

plt.figure(figsize = (10,6))
plt.title('ROC Curve')  # was misspelled 'OCR Curve'
plt.plot(false_positive_rate, true_positive_rate, linewidth = 4, color = "green")
# Reference lines, with explicit x coordinates:
plt.plot([0, 1], [0, 1], ls='--', linewidth=4)  # diagonal from (0, 0) to (1, 1)
plt.plot([0, 0], [1, 0], c='.5')                # left edge (x = 0)
plt.plot([0, 1], [1, 1], c='.5')                # top edge (y = 1)
plt.text(0.2, 0.6, 'AUC: {:.2f}'.format(auc), size = 16)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()