# Using the Wisconsin breast cancer diagnostic dataset

# Load the libraries and data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read the CSV, discard the 'Unnamed: 32' column (presumably an empty trailing
# column in the export -- confirm against the file), and encode the diagnosis
# label numerically: benign 'B' -> 0, malignant 'M' -> 1.
df = (
    pd.read_csv("../input/breast-cancer-wisconsin-data/data.csv")
    .drop(columns=['Unnamed: 32'])
    .assign(diagnosis=lambda frame: frame['diagnosis'].map({'B': 0, 'M': 1}))
)

# Quick look at the first rows to confirm the load and the label encoding.
print(df.head())

# Exploratory Data Analysis (EDA)

In [None]:
# Missing-value map: plot the boolean null mask of df so that any missing
# cell would stand out against the rest of the grid.
null_mask = df.isnull()
plt.figure(figsize=(10, 8))
heatmap = sns.heatmap(
    null_mask,
    vmin=0,
    vmax=1,
    cmap='RdPu',
    cbar=True,
    yticklabels=False,
)
plt.show()

**The heatmap is a single uniform colour, which shows that there are no null values in the data.**

In [None]:
# Count plot: number of records in each diagnosis class.
plt.figure(figsize=(10, 8))
sns.set_style('whitegrid')
# Grab the axes only after the style is set so they are created under it.
ax = plt.gca()
sns.countplot(x='diagnosis', data=df, palette='husl', ax=ax)
plt.show()

In [None]:
# Histogram of radius_mean over all records, split into 25 bins.
plt.figure(figsize=(10, 8))
ax = plt.gca()
ax.hist(df['radius_mean'], bins=25)
plt.show()

In [None]:
# Box plot comparing the distribution of radius_mean between the two
# diagnosis classes.
plt.figure(figsize=(10, 8))
ax = plt.gca()
sns.boxplot(data=df, x='diagnosis', y='radius_mean', ax=ax)
plt.show()

**The boxplot shows that tumours diagnosed as malignant (1) tend to have a larger mean radius than those diagnosed as benign (0).**

In [None]:
# Average radius_mean within each diagnosis class (left as the last
# expression so the notebook displays the resulting Series).
df['radius_mean'].groupby(df['diagnosis']).mean()

# Building the model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Step 1: separate the features from the target.
# The target must not appear among the predictors. The `id` column is also
# excluded: this CSV is expected to carry a record identifier under that name
# (confirm with df.columns), and an identifier is not a measurement, so it
# should not be fitted as a feature. Filtering by name keeps this cell working
# whether or not `id` is still present in df.
non_feature_cols = ['diagnosis', 'id']
X = df[[col for col in df.columns if col not in non_feature_cols]]
y = df.diagnosis

# Step 2: hold out 30% of the rows for testing; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 70)

# Step 3: fit a logistic regression on the training rows and predict the
# held-out rows. The features are not scaled, so the default iteration cap
# (100) may stop the solver before it has converged; allow more iterations.
logreg = LogisticRegression(max_iter = 10000)
logreg.fit(X_train, y_train)
pred = logreg.predict(X_test)

# Evaluate the model

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class precision, recall and F1 on the held-out rows.
report = classification_report(y_test, pred)
print(report)

# Accuracy of the fitted model on the held-out rows.
score = logreg.score(X_test, y_test)
print(score)

# Left as the last expression so the notebook displays the matrix.
confusion_matrix(y_test, pred)

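**The confusion matrix (in scikit-learn, rows are the actual classes and columns are the predicted classes) shows that, of the 94 cases predicted not to have cancer, 2 were misclassified (false negatives, FN - they actually had cancer),
and of the 77 cases predicted to have cancer, 6 were misclassified (false positives, FP - they did not have cancer)**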