In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found in it.
for root, _, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix,classification_report,precision_score
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Load the Wisconsin Breast Cancer data from the Kaggle input directory
# and display the resulting frame.
df = pd.read_csv(filepath_or_buffer='../input/breast-cancer-wisconsin-data/data.csv')
df

In [None]:
# Summarise the imported frame: DataFrame.info() prints each column's dtype
# and non-null count, which shows the datatypes present in the data.
df.info()

In [None]:
# Count the missing (NaN) values in every column; a non-zero entry marks a
# column that needs cleaning before modelling.
df.isna().sum()

In [None]:
# Remove the 'Unnamed: 32' column (presumably the column flagged by the
# missing-value check above; confirm against its output) and show the result.
# `df` is rebound to the reduced frame instead of being mutated in place.
df = df.drop(columns=['Unnamed: 32'])
df

In [None]:
# The ID column is not needed for the analysis, so leave it out of the frame.
df = df.drop(columns=['id'])

In [None]:
# Encode the diagnosis labels as numbers: 'M' becomes 1 and 'B' becomes 0.
diagnosis_codes = {'M': 1, 'B': 0}
df['diagnosis'] = df['diagnosis'].map(diagnosis_codes)

In [None]:
# Counting Malignant and Benign cases.
# value_counts() orders its result by frequency, so unpacking it positionally
# would swap the two numbers whenever malignant cases outnumber benign ones.
# Look each count up by its encoded label instead (1 = malignant, 0 = benign);
# a class that is absent from the data is reported as 0.
diagnosis_counts = df['diagnosis'].value_counts()
malignant = diagnosis_counts.get(1, 0)
benign = diagnosis_counts.get(0, 0)
print('Number of Malignant cases are', malignant)
print('Number of Benign cases are', benign)

In [None]:
# Bar chart showing how many cases fall into each diagnosis class.
ax = sns.countplot(data=df, x='diagnosis')

In [None]:
# Pairplot of the "*_mean" features, coloured by diagnosis.
cols = ['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
        'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
        'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean']
# sns.pairplot creates its own figure, so a preceding plt.figure(figsize=...)
# only leaves an empty figure in the output without resizing the grid.
# Size the grid through `height` (inches per facet) instead:
# 10 plotted features x 2 in gives the intended ~20 x 20 in figure.
sns.pairplot(data=df[cols], hue='diagnosis', palette='RdBu', height=2)

In [None]:
# Pairwise (Pearson) correlation between all columns. When two or more independent
# variables are highly correlated they duplicate information, so only one of them is kept.
corr = df.corr(method='pearson')
corr

In [None]:
# Heat map of the correlation matrix, to get a better overview of all values.
# The upper triangle (diagonal included) only mirrors the lower one, so hide it.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24,
# and the triangle indices must be assigned True for the mask to hide anything.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, mask=mask, cmap="YlGnBu", center=0, square=True, linewidths=.5,
            cbar_kws={"orientation": "vertical"}, annot=True, ax=ax)
plt.tight_layout()

Comparing the mean columns with the worst columns in the heat map above, we can see that their correlation values are very similar. So, we can remove the worst columns from the dataset for further analysis.

In [None]:
# Remove every "*_worst" column and display the reduced frame.
worst_columns = [
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst',
    'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst',
]
df = df.drop(columns=worst_columns)
df

Since the SE (standard error) columns describe the error in measurement, they can be considered less important for the analysis, so they can also be dropped.

In [None]:
# Remove every "*_se" (standard error) column from the frame.
se_columns = [
    'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
    'compactness_se', 'concavity_se', 'concave points_se',
    'symmetry_se', 'fractal_dimension_se',
]
df = df.drop(columns=se_columns)

In [None]:
# Correlation heat map of the columns that remain after the drops above.
corr1 = df.corr()
# Hide the upper triangle (diagonal included), which only mirrors the lower one.
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24,
# and the triangle indices must be assigned True for the mask to hide anything.
mask = np.zeros_like(corr1, dtype=bool)
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr1, mask=mask, cmap="YlGnBu", center=0, square=True, linewidths=.5,
            cbar_kws={"orientation": "vertical"}, annot=True, ax=ax)
plt.tight_layout()

It can be observed that radius_mean, perimeter_mean, area_mean, concavity_mean and concave points_mean have correlation values of ~0.76. So we can keep one of these fields (radius_mean) and drop the others, as that should not noticeably affect the model.

In [None]:
# Remove the four mean features listed below; radius_mean stays in the frame.
correlated_columns = ['perimeter_mean', 'area_mean', 'concavity_mean', 'concave points_mean']
df = df.drop(columns=correlated_columns)

In [None]:
# train_test_split returns its pieces as (X_train, X_test, y_train, y_test),
# so the names must be unpacked in that order; otherwise train_y would hold
# test features and test_x would hold training labels.
# The 'diagnosis' target is also removed from the feature frame so the label
# is not part of the inputs.
train_x, test_x, train_y, test_y = train_test_split(
    df.drop(columns='diagnosis'), df['diagnosis'], test_size=0.3, random_state=5)

In [None]:
# Names of the feature columns used as model inputs.
prediction_var = [
    'radius_mean',
    'texture_mean',
    'smoothness_mean',
    'compactness_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
]

In [None]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split reproducible. Then report the shape of each part.
train, test = train_test_split(df, random_state=5, test_size=0.3)
for message, part in (("train shape is ", train), ("test shape is ", test)):
    print(message, part.shape)

In [None]:
# Split each part into model inputs (the prediction_var columns) and the
# 'diagnosis' target, then report the shape of all four pieces.
train_x, train_y = train[prediction_var], train['diagnosis']
test_x, test_y = test[prediction_var], test['diagnosis']
for message, part in (
    ("shape of train_x is ", train_x),
    ("shape of train_y is ", train_y),
    ("shape of test_x is ", test_x),
    ("shape of test_y is ", test_y),
):
    print(message, part.shape)

In [None]:
# Fit a logistic regression on the training features and score it on the
# held-out test set.
model = LogisticRegression()
model.fit(train_x, train_y)
pv = model.predict(test_x)
# accuracy_score's signature is (y_true, y_pred). Accuracy is symmetric, so
# the value is unchanged, but the true labels go first to match the
# classification_report / confusion_matrix calls later in the notebook.
print("accuracy score of the model is ", accuracy_score(test_y, pv))

In [None]:
# Refit the model on the training data, predict the test set, and show
# predictions 1 through 5.
model.fit(train_x, train_y)
predict = model.predict(test_x)
predict[1:6]

In [None]:
# Threshold each prediction at 0.5: values below it become 0, everything else 1.
prediction_nominal = list(map(lambda score: 0 if score < 0.5 else 1, predict))
prediction_nominal[1:6]

In [None]:
# Per-class precision / recall / F1 for the thresholded predictions.
print(classification_report(test_y, prediction_nominal, digits=3))

# Confusion matrix with rows = true class and columns = predicted class,
# so row 0 is (TN, FP) and row 1 is (FN, TP).
cfm = confusion_matrix(test_y, prediction_nominal)
(true_negative, false_positive), (false_negative, true_positive) = cfm

print("Confusion Matrix: \n",cfm,'\n')
for label, count in (
    ("true negative", true_negative),
    ("False Positive", false_positive),
    ("false Negative", false_negative),
    ("True Positive", true_positive),
):
    print(label, count)

# Share of test cases on the matrix diagonal, as a percentage rounded to 2 places.
correct_count = true_negative + true_positive
print("correct prediction", round(correct_count/len(prediction_nominal) *100, 2),'%')