In [5]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# 1. Explore the Data

## 1.1

In [19]:
# Load the dataset from the CSV file next to the notebook; pandas infers the
# bz2 compression from the file extension.
data = pd.read_csv('./wdbc.csv.bz2')

In [20]:
# Preview the first rows to check the column layout and the raw diagnosis labels.
data.head()

Unnamed: 0,id,diagnosis,radius.mean,texture.mean,perimeter.mean,area.mean,smoothness.mean,compactness.mean,concavity.mean,concpoints.mean,...,radius.worst,texture.worst,perimeter.worst,area.worst,smoothness.worst,compactness.worst,concavity.worst,concpoints.worst,symmetry.worst,fracdim.worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [21]:
# Class balance of the target: number of rows per diagnosis label.
data['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [22]:
# Encode the target numerically: 'M' -> 1, 'B' -> 0 (presumably malignant /
# benign -- confirm against the dataset description).
# The mapping also passes through values that are already 0/1, so re-running
# this cell is a no-op; comparing already-encoded integers to 'M' would instead
# silently turn every label into 0. Any unexpected label becomes NaN, which
# makes astype(int) fail loudly rather than being counted as class 0.
data['diagnosis'] = data['diagnosis'].map({'M': 1, 'B': 0, 1: 1, 0: 0}).astype(int)

In [23]:
# Re-inspect the first rows to confirm the diagnosis column is now numeric.
data.head()

Unnamed: 0,id,diagnosis,radius.mean,texture.mean,perimeter.mean,area.mean,smoothness.mean,compactness.mean,concavity.mean,concpoints.mean,...,radius.worst,texture.worst,perimeter.worst,area.worst,smoothness.worst,compactness.worst,concavity.worst,concpoints.worst,symmetry.worst,fracdim.worst
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## 1.2

In [24]:
# Empty placeholder table: one (initially empty) column per summary statistic.
summary = pd.DataFrame({
    column: []
    for column in ('variable', 'correlation', 'mean', 'min', 'max', 'std', 'missings')
})

In [25]:
# Show the freshly created summary frame (column headers only, no rows yet).
summary

Unnamed: 0,variable,correlation,mean,min,max,std,missings


In [26]:
# Seed the table with one row per column of the dataset; the statistic
# columns stay empty (NaN) until they are filled in.
summary['variable'] = data.columns

In [27]:
# Check that the variable names are in place; the statistics are still NaN here.
summary

Unnamed: 0,variable,correlation,mean,min,max,std,missings
0,id,,,,,,
1,diagnosis,,,,,,
2,radius.mean,,,,,,
3,texture.mean,,,,,,
4,perimeter.mean,,,,,,
5,area.mean,,,,,,
6,smoothness.mean,,,,,,
7,compactness.mean,,,,,,
8,concavity.mean,,,,,,
9,concpoints.mean,,,,,,


In [28]:
# Build the per-variable summary in one vectorised pass instead of patching a
# placeholder frame row by row. Every statistic below is a Series indexed by
# column name, so pandas aligns them; reindexing on data.columns pins the row
# order to the dataset's column order.
summary_stats = pd.DataFrame({
    # Pearson correlation of each column with the encoded diagnosis target.
    'correlation': data.corrwith(data['diagnosis'], method='pearson'),
    'mean': data.mean(),
    'min': data.min(),
    'max': data.max(),
    # ddof=0 (population std) reproduces what np.std returned in the loop version.
    'std': data.std(ddof=0),
    # Kept as integer counts rather than being coerced to float.
    'missings': data.isnull().sum(),
})

# Turn the column-name index into the leading 'variable' column.
summary = (
    summary_stats
    .reindex(data.columns)
    .rename_axis('variable')
    .reset_index()
)



In [29]:
# Display the completed per-variable summary table.
summary

Unnamed: 0,variable,correlation,mean,min,max,std,missings
0,id,0.039769,30371830.0,8670.0,911320500.0,124910700.0,0.0
1,diagnosis,1.0,0.3725835,0.0,1.0,0.4834925,0.0
2,radius.mean,0.730029,14.12729,6.981,28.11,3.520951,0.0
3,texture.mean,0.415185,19.28965,9.71,39.28,4.297255,0.0
4,perimeter.mean,0.742636,91.96903,43.79,188.5,24.27762,0.0
5,area.mean,0.708984,654.8891,143.5,2501.0,351.6048,0.0
6,smoothness.mean,0.35856,0.09636028,0.05263,0.1634,0.01405176,0.0
7,compactness.mean,0.596534,0.104341,0.01938,0.3454,0.05276633,0.0
8,concavity.mean,0.69636,0.08879932,0.0,0.4268,0.07964973,0.0
9,concpoints.mean,0.776614,0.04891915,0.0,0.2012,0.03876873,0.0


# 2. Which Model is Best?

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## 2.1

In [40]:
# Feature matrix (two predictors, as a plain NumPy array) and target vector.
X = data.loc[:, ['concpoints.mean', 'fracdim.mean']].to_numpy()
y = data['diagnosis']

In [46]:
# Sanity-check the feature matrix dimensions: (rows, features).
X.shape

(569, 2)

In [41]:
# Hold out 20% of the rows for validation. Pinning random_state makes the split,
# and every metric computed from it, reproducible across kernel restarts;
# without it each run shuffles differently and the recorded outputs go stale.
Xtrain, Xvalid, ytrain, yvalid = train_test_split(X, y, test_size=0.2, random_state=42)

## 2.2

In [42]:
# Logistic regression on the training split. In scikit-learn C is the inverse
# regularisation strength, so C=1e9 makes the penalty negligible.
m = LogisticRegression(C=1e9, solver='lbfgs')
# fit() returns the estimator itself; re-binding keeps the cell free of output.
m = m.fit(Xtrain, ytrain)

## 2.3

In [47]:
# Confirm the size of the training portion of the split: (rows, features).
Xtrain.shape

(455, 2)

In [43]:
# In-sample predictions: the model is scored on the same rows it was fitted on,
# not on the held-out validation split.
yhat = m.predict(Xtrain)

In [44]:
# Length of the full target vector (all rows, before the train/validation split).
y.shape

(569,)

In [45]:
# yhat holds one prediction per training row, so it must be compared with
# ytrain rather than with the full y.
yhat.shape

(455,)

In [48]:
# Confusion matrix on the training data: rows are the true labels (ytrain),
# columns are the model's predictions. Naming the column axis replaces the
# anonymous "col_0" header so the table can be read without the code.
pd.crosstab(ytrain, yhat, colnames=['predicted'])

col_0,0,1
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0,273,14
1,18,150


## 2.4