In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# 1. Explore the Data

## 1.1

In [2]:
data = pd.read_csv('./wdbc.csv.bz2')

In [3]:
data.head()

Unnamed: 0,id,diagnosis,radius.mean,texture.mean,perimeter.mean,area.mean,smoothness.mean,compactness.mean,concavity.mean,concpoints.mean,...,radius.worst,texture.worst,perimeter.worst,area.worst,smoothness.worst,compactness.worst,concavity.worst,concpoints.worst,symmetry.worst,fracdim.worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
data.diagnosis.value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [5]:
data.diagnosis = (data.diagnosis == 'M').astype(int)

In [6]:
data.head()

Unnamed: 0,id,diagnosis,radius.mean,texture.mean,perimeter.mean,area.mean,smoothness.mean,compactness.mean,concavity.mean,concpoints.mean,...,radius.worst,texture.worst,perimeter.worst,area.worst,smoothness.worst,compactness.worst,concavity.worst,concpoints.worst,symmetry.worst,fracdim.worst
0,842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## 1.2

In [7]:
summary = pd.DataFrame({'variable' : [], 'correlation' : [], 'mean' : [], 'min' : [], 'max' : [], 'std' : [], 'missings' : []})

In [8]:
summary

Unnamed: 0,variable,correlation,mean,min,max,std,missings


In [9]:
summary.variable = data.columns

In [10]:
summary

Unnamed: 0,variable,correlation,mean,min,max,std,missings
0,id,,,,,,
1,diagnosis,,,,,,
2,radius.mean,,,,,,
3,texture.mean,,,,,,
4,perimeter.mean,,,,,,
5,area.mean,,,,,,
6,smoothness.mean,,,,,,
7,compactness.mean,,,,,,
8,concavity.mean,,,,,,
9,concpoints.mean,,,,,,


In [11]:
for i in summary.variable:
    index = summary.index[summary['variable'] == i]
    summary.loc[index] = [i, data['diagnosis'].corr(data[i], method='pearson'), np.mean(data[i]), np.min(data[i]), np.max(data[i]), np.std(data[i]), data[i].isnull().sum()]



In [12]:
summary

Unnamed: 0,variable,correlation,mean,min,max,std,missings
0,id,0.039769,30371830.0,8670.0,911320500.0,124910700.0,0.0
1,diagnosis,1.0,0.3725835,0.0,1.0,0.4834925,0.0
2,radius.mean,0.730029,14.12729,6.981,28.11,3.520951,0.0
3,texture.mean,0.415185,19.28965,9.71,39.28,4.297255,0.0
4,perimeter.mean,0.742636,91.96903,43.79,188.5,24.27762,0.0
5,area.mean,0.708984,654.8891,143.5,2501.0,351.6048,0.0
6,smoothness.mean,0.35856,0.09636028,0.05263,0.1634,0.01405176,0.0
7,compactness.mean,0.596534,0.104341,0.01938,0.3454,0.05276633,0.0
8,concavity.mean,0.69636,0.08879932,0.0,0.4268,0.07964973,0.0
9,concpoints.mean,0.776614,0.04891915,0.0,0.2012,0.03876873,0.0


## 2. Which Model is Best?

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## 2.1

In [14]:
X = data[['concpoints.mean', 'fracdim.mean']].values
y = data.diagnosis

In [15]:
X.shape

(569, 2)

In [16]:
Xtrain, Xvalid, ytrain, yvalid = train_test_split(X, y, test_size = 0.2)

## 2.2

In [17]:
m = LogisticRegression(solver='lbfgs', C=1e9).fit(Xtrain, ytrain)

## 2.3

In [18]:
Xtrain.shape

(455, 2)

In [19]:
yhat = m.predict(Xtrain)

In [20]:
rmse = np.sqrt(np.mean((ytrain - yhat)**2))
rmse

0.28516430582855445

In [21]:
y.shape

(569,)

In [22]:
yhat.shape

(455,)

In [23]:
pd.crosstab(ytrain, yhat)

col_0,0,1
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0,275,16
1,21,143


## 2.4

In [24]:
a = (150 + 273) / 455
a

0.9296703296703297

the accuracy is ~93%

In [25]:
p = 150 / (150 + 14)

In [26]:
r = 150 / (150 + 18)

In [27]:
f = 2 / (1 / p + 1 / r)
f

0.9036144578313254

the f score is 0.903

## 2.5

In [28]:
yhat = m.predict(Xvalid)

In [29]:
rmse = np.sqrt(np.mean((yvalid - yhat)**2))
rmse

0.28097574347450816

In [30]:
yhat.shape

(114,)

In [31]:
pd.crosstab(yvalid, yhat)

col_0,0,1
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0,62,4
1,5,43


## 2.6

In [32]:
a = (35 + 65) / 114
a

0.8771929824561403

the accuracy is 87.7%

In [33]:
r = 35 / (35 + 9)

In [34]:
p = 35 / (35 + 5)

In [35]:
f = 2 / (1 / p + 1 / r)
f

0.8333333333333334

the f score is 0.833

## 2.7

The model performs better on training data, evidenced by higher accuracy and a lower RMSE. However, there is not a drastic difference between the RMSE's, indicating that the model it not overfitting.

## 2.8

### All variables

In [36]:
X0 = data.copy().drop(['diagnosis', 'id'], axis=1)

In [37]:
Xtrain, Xvalid, ytrain, yvalid = train_test_split(X0, y, test_size = 0.2)

In [38]:
m = LogisticRegression(solver='lbfgs', C=1e9).fit(Xtrain, ytrain)



In [39]:
m.coef_

array([[-1.90278825e+00, -6.35119250e-01,  1.50824754e-01,
        -1.91266335e-03,  7.66041628e-02,  3.48569635e-01,
         4.84612756e-01,  2.08980307e-01,  1.11974085e-01,
         2.22327983e-02, -8.34278985e-02, -9.91243263e-01,
        -4.75438838e-01,  1.56888191e-01,  7.25293177e-03,
         6.30303767e-02,  9.30468953e-02,  2.69229236e-02,
         2.50263254e-02,  5.97562400e-03, -1.99608246e+00,
         6.28547938e-01,  1.85299707e-01,  2.33506534e-02,
         1.37904354e-01,  1.05578295e+00,  1.29959951e+00,
         3.98142463e-01,  3.40203605e-01,  1.04386003e-01]])

In [40]:
yhat = m.predict(Xtrain)

In [41]:
np.unique(yhat)

array([0, 1])

In [42]:
rmse = np.sqrt(np.mean((ytrain - yhat)**2))
rmse

0.22966770070528583

In [43]:
yhat.shape

(455,)

In [44]:
pd.crosstab(ytrain, yhat)

col_0,0,1
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0,273,10
1,14,158


In [45]:
a = (159 + 278) / 455
a

0.9604395604395605

the accuracy of this model is 96%

In [46]:
r = 159 / (159 + 13)

In [47]:
p = 159 / (159 + 5)

In [48]:
F = 2 / (1 / p + 1 / r)
F

0.9464285714285714

The f score of this model is 0.946

In [49]:
yhat = m.predict(Xvalid)

In [50]:
rmse = np.sqrt(np.mean((yvalid - yhat)**2))
rmse

0.20942695414584775

In [51]:
pd.crosstab(yvalid, yhat)

col_0,0,1
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0,73,1
1,4,36


In [52]:
yhat.shape

(114,)

In [66]:
a = (36 + 73) / 114
a

0.956140350877193

the accuracy of this model is 95.6%

In [54]:
r = 38 / (38 + 2)

In [55]:
p = 38 / (38 + 2)

In [56]:
f = 2 / (1 / p + 1 / r)
f

0.9500000000000001

the f score of this model is 0.95

### Radius, Texture, Area, Smoothness

In [57]:
X1 = data[['radius.mean', 'texture.mean', 'area.mean', 'smoothness.mean']]

In [58]:
Xtrain, Xvalid, ytrain, yvalid = train_test_split(X1, y, test_size = 0.2)

In [59]:
m = LogisticRegression(solver='lbfgs', C=1e9).fit(Xtrain, ytrain)



In [60]:
yhat = m.predict(Xtrain)

In [61]:
rmse = np.sqrt(np.mean((ytrain - yhat)**2))
rmse

0.2693092507276208

In [62]:
yhat.shape

(455,)

In [63]:
pd.crosstab(ytrain, yhat)

col_0,0,1
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0,269,15
1,18,153


In [65]:
a = (153 + 269) / 455
a

0.9274725274725275

The accuracy of this model is 92.7%

In [67]:
r = 153 / (153 + 18)

In [68]:
p = 153 / (153 + 15)

In [69]:
F = 2 / (1 / p + 1 / r)
F

0.9026548672566371

The F score of this model is 90.3%

In [70]:
yhat = m.predict(Xvalid)
rmse = np.sqrt(np.mean((yvalid - yhat)**2))
rmse

0.31063037209869776

In [72]:
pd.crosstab(yvalid, yhat)

col_0,0,1
diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1
0,69,4
1,7,34


In [73]:
a = (34 + 69) / 114
a

0.9035087719298246

The accuracy of this model is 90.4%

In [74]:
r = 34 / (34 + 7)

In [75]:
p = 34 / (34 + 4)

In [76]:
F = 2 / (1 / p + 1 / r)
F

0.860759493670886

The F score of this model is 0.86

## 2.9

It seems that the model using all the variables performs the best, in terms of RMSE and accuracy.