# Logistic regression analysis

The data were obtained from data.world (the link below opens the dataset's accompanying `.names.txt` description file):
https://data.world/uci/breast-cancer-wisconsin-prognostic/workspace/file?filename=breast-cancer-wisconsin.names.txt

In [16]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [14]:
# Read the raw CSV. header=None makes pandas treat the first line as data
# rather than as column labels, so the labels are attached explicitly below.
data = pd.read_csv(
    '../raw_data/breast-cancer-wisconsin.data.csv',
    header=None,
)

# One label per column, in file order. Assigning to .columns raises if the
# number of labels does not match the number of columns that were read.
data.columns = [
    'ID',
    'clump_thickness',
    'uniformity_cell_size',
    'uniformity_cell_shape',
    'marginal_adhesion',
    'single_epithelial_cell_size',
    'bare_nuclei',
    'bland_chromatin',
    'normal_nucleoli',
    'mitosis',
    'class',
]


In [15]:
# Build the binary target column used for modelling.
# Rows whose `class` value equals 4 are flagged 1 (malign); every other row
# is flagged 0 (benign).
data['is_malign'] = (data['class'] == 4).astype(int)

# Show the first two rows so the new column can be checked by eye.
data.head(2)

Unnamed: 0,ID,clump_thickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitosis,class,is_malign
0,1000025,5,1,1,1,2,1,3,1,1,2,0
1,1002945,5,4,4,5,7,10,3,2,1,2,0


In [19]:
# Build the predictor matrix and target vector, then split them.
# The ID, bare_nuclei, and class columns are deliberately left out of the
# predictors.
# NOTE(review): the reason for dropping bare_nuclei is not stated in this
# notebook -- confirm before reusing the feature list.
features = [
    'clump_thickness',
    'uniformity_cell_size',
    'uniformity_cell_shape',
    'marginal_adhesion',
    'single_epithelial_cell_size',
    'bland_chromatin',
    'normal_nucleoli',
    'mitosis',
]
x = data.loc[:, features]
y = data.loc[:, 'is_malign']

# Hold out 25% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [20]:
# Fit a logistic-regression classifier (library default settings) on the
# training split. fit() returns the estimator itself, so construction and
# fitting can be chained.
model = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_pred = model.predict(X_test)