# Logistic regression analysis

Data obtained from data.world:
https://data.world/uci/breast-cancer-wisconsin-prognostic/workspace/file?filename=breast-cancer-wisconsin.names.txt

In [32]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from scipy.special import expit

In [2]:
# Load data and add column names
data = pd.read_csv('../raw_data/breast-cancer-wisconsin.data.csv', header=None)
data.columns = ['ID', 'clump_thickness', 'uniformity_cell_size', 
                'uniformity_cell_shape', 'marginal_adhesion',
                'single_epithelial_cell_size', 'bare_nuclei',
                'bland_chromatin', 'normal_nucleoli', 'mitosis',
                'class']


In [3]:
# Wrangle data
data['is_malign'] = np.where(data['class'] == 4, 1, 0)
# Zero is benign, One is malign
data.head(2)

Unnamed: 0,ID,clump_thickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitosis,class,is_malign
0,1000025,5,1,1,1,2,1,3,1,1,2,0
1,1002945,5,4,4,5,7,10,3,2,1,2,0


In [50]:
data.tail(2)

Unnamed: 0,ID,clump_thickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitosis,class,is_malign
697,897471,4,8,6,4,3,4,10,6,1,4,1
698,897471,4,8,8,5,4,5,10,4,1,4,1


In [4]:
# Split data
# We exclude the ID, bare_nuclei, and class columns
features = ['clump_thickness', 'uniformity_cell_size', 
            'uniformity_cell_shape', 'marginal_adhesion',
            'single_epithelial_cell_size',
            'bland_chromatin', 'normal_nucleoli', 'mitosis']
x = data[features]
y = data['is_malign']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

In [5]:
# Train model
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [51]:
# Print confusion matrix
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
conf_matrix

array([[108,   4],
       [  6,  57]])

In [16]:
conf_matrix = pd.DataFrame(conf_matrix)
conf_matrix.columns = ["Predicted: negatives", "Predicted: positives"]
conf_matrix.index = ["Reality: negatives", "Reality: positives"]
conf_matrix

Unnamed: 0,Predicted: negatives,Predicted: positives
Reality: negatives,108,4
Reality: positives,6,57


We have 108 True Negatives and 57 True Positives

We have 6 False negatives and 4 false positives

Thus, the model seems to be not that bad.
Now, let's check its accuracy:

In [20]:
print(f'The accuracy of the model is: 
      {round(metrics.accuracy_score(y_test, y_pred), 2)}')

The accuracy of the model is: 0.94


In [23]:
features

['clump_thickness',
 'uniformity_cell_size',
 'uniformity_cell_shape',
 'marginal_adhesion',
 'single_epithelial_cell_size',
 'bland_chromatin',
 'normal_nucleoli',
 'mitosis']

In [22]:
model.coef_

array([[ 0.62287416, -0.06207276,  0.50089592,  0.4363401 ,  0.3176869 ,
         0.53684089,  0.1350644 ,  0.53256665]])

In [48]:
expit(model.coef_)

array([[0.65087195, 0.48448679, 0.62266985, 0.60738661, 0.57876043,
        0.63107722, 0.53371486, 0.63008154]])

- One unit change in clump_thickness increases the odds of the tissue being malign by about 65%

- One unit change in uniformity_cell_size increases the odds of the tissue being malign by about 48%

- One unit change in uniformity_cell_shape increases the odds of the tissue being malign by about 62%

- One unit change in marginal_adhesion increases the odds of the tissue being malign by about 61%

...etc

