In [35]:
# Reserve the first cell for importing all dependencies (Python modules) that you will need.

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [36]:
# keep all functions in one cell, call them when you need them. 
def read_file(path):
    """Read a header-less CSV into a DataFrame.

    Parameters
    ----------
    path
        Location of the CSV; handed unchanged to ``pd.read_csv``
        (the notebook passes a URL string).

    Returns
    -------
    pandas.DataFrame
        The parsed file. ``header=None`` is used, so the first row is kept
        as data and the columns receive integer labels.
    """
    frame = pd.read_csv(path, header=None)
    return frame

In [37]:
# Load the breast-cancer data set (the file has no header row) with the read_file helper.
path = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
df = read_file(path=path)

In [38]:
# Preview the first five raw rows; columns still carry integer labels at this point.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

In [39]:
# Data dictionary for df, from the dataset description linked above (fields are numbered from 1):

1. ID number 
2. Diagnosis (M = malignant, B = benign) 

3-32. Thirty real-valued features, derived as described below.

Ten real-valued features are computed for each cell nucleus: 

a. radius (mean of distances from center to points on the perimeter)  
b. texture (standard deviation of gray-scale values)   
c. perimeter   
d. area   
e. smoothness (local variation in radius lengths)   
f. compactness (perimeter^2 / area - 1.0)   
g. concavity (severity of concave portions of the contour)  
h. concave points (number of concave portions of the contour)  
i. symmetry  
j. fractal dimension ("coastline approximation" - 1)  

The mean, standard error, and "worst" or largest (mean of the three
largest values) of these features were computed for each image,
resulting in 30 features.  For instance, field 3 is Mean Radius, field
13 is Radius SE, field 23 is Worst Radius.

In [40]:
# Base names of the ten nucleus measurements, in data-dictionary order (a-j).
cols = [
    'radius',
    'texture',
    'perimeter',
    'area',
    'smoothness',
    'compactness',
    'concavity',
    'concave_points',
    'symmetry',
    'fractal_dimension',
]

In [41]:
# Build the 32 column labels: id, diagnosis, then the mean / standard-error / worst
# variant of every base measurement, each group in the order given by `cols`.
means = ['mean_' + name for name in cols]
std_err = ['std_err_' + name for name in cols]
worst = ['worst_' + name for name in cols]
col_names = ['id', 'diagnosis'] + means + std_err + worst

In [42]:
# Replace the integer column labels with the descriptive names in col_names.
# NOTE: run this before any extra column is added to df; col_names holds exactly
# 32 labels, so assigning it to a wider frame raises a length-mismatch error.
df.columns = col_names

In [43]:
# Numeric flag for the diagnosis: 1 where it is 'M' (malignant), 0 for anything else.
df['malignant'] = df['diagnosis'].eq('M').astype('int64')

In [44]:
# Check the renamed columns and the new `malignant` flag on the first five rows.
df.head(n=5)

Unnamed: 0,id,diagnosis,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,malignant
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


In [45]:
# Features: columns 2-11 of df, i.e. the ten mean_* measurements.
# .copy() gives data_df its own data, so later cells that overwrite or add columns
# are not writing into a slice of df (which raises SettingWithCopyWarning and leaves
# it unclear whether df itself is modified).
data_df = df.iloc[:, 2:12].copy()

# Outcome (response/label): the raw diagnosis strings, 'M' or 'B'.
target_df = df['diagnosis']

In [46]:
# First five rows of the feature frame.
data_df.head(n=5)

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


In [47]:
# First five diagnosis labels.
target_df.head(n=5)

0    M
1    M
2    M
3    M
4    M
Name: diagnosis, dtype: object

In [55]:
# Standardize every feature column with StandardScaler (imported in the first cell).
scaler = StandardScaler().fit(data_df)
scaled = scaler.transform(data_df)
features = data_df.columns.values.tolist()

# Rebuild data_df from the scaled array in one step instead of copying values back
# column-by-column in a Python loop. The new frame owns its data, so nothing is
# written into a slice of df, and the original row index is preserved.
data_df = pd.DataFrame(scaled, columns=features, index=data_df.index)

# Constant column so the model below estimates an intercept term.
data_df['intercept'] = 1.0

In [56]:
# Convert the feature frame and the label series to NumPy arrays for the model.
# `.values` is used because `as_matrix()` was deprecated in pandas 0.23 and removed
# in pandas 1.0; `.values` is available on every pandas version.
x = data_df.values
y = target_df.values

In [57]:
# Specify a multinomial logit of the diagnosis labels (y) on the design matrix (x),
# then estimate it with .fit(); the results object is kept in `fitted`.
model = sm.MNLogit(endog=y, exog=x)
fitted = model.fit()

Optimization terminated successfully.
         Current function value: 0.128410
         Iterations 10


In [58]:
### print model summary ###
# Left as the cell's last expression so the notebook renders the summary table.
fitted.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,569.0
Model:,MNLogit,Df Residuals:,558.0
Method:,MLE,Df Model:,10.0
Date:,"Tue, 11 Oct 2016",Pseudo R-squ.:,0.8055
Time:,15:48:09,Log-Likelihood:,-73.065
converged:,True,LL-Null:,-375.72
,,LLR p-value:,1.2820000000000001e-123

y=M,coef,std err,z,P>|z|,[95.0% Conf. Int.]
x1,-7.2155,13.083,-0.551,0.581,-32.859 18.428
x2,1.6533,0.277,5.961,0.0,1.110 2.197
x3,-1.7361,12.264,-0.142,0.887,-25.773 22.301
x4,13.9925,5.886,2.377,0.017,2.457 25.528
x5,1.074,0.449,2.392,0.017,0.194 1.954
x6,-0.0772,1.073,-0.072,0.943,-2.181 2.027
x7,0.6745,0.647,1.043,0.297,-0.593 1.942
x8,2.5906,1.106,2.342,0.019,0.423 4.758
x9,0.4459,0.291,1.531,0.126,-0.125 1.017
x10,-0.4821,0.604,-0.799,0.424,-1.665 0.701


In [29]:
### print the aic and bic metrics ###
# AIC of the fitted model; the BIC is displayed by the next cell.
fitted.aic

166.46818872994203

In [30]:
# BIC of the fitted model.
fitted.bic

209.90699307120533

In [32]:
# Marginal effects of the fitted model `fitted`, with the statsmodels defaults
# written out: averaged over all observations (at='overall'), reported as dy/dx.
mdl_margeff = fitted.get_margeff(at='overall', method='dydx')

In [34]:
# Call form of print: valid on Python 2 and Python 3 alike, whereas the bare
# `print x` statement is a SyntaxError on Python 3.
print(mdl_margeff.summary())

       MNLogit Marginal Effects      
Dep. Variable:                      y
Method:                          dydx
At:                           overall
       y=B      dy/dx    std err          z      P>|z|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1             0.1129      0.129      0.878      0.380        -0.139     0.365
x2            -0.0145      0.002     -7.763      0.000        -0.018    -0.011
x3             0.0017      0.019      0.091      0.927        -0.036     0.039
x4            -0.0018      0.000     -5.427      0.000        -0.002    -0.001
x5            -2.8506      1.141     -2.499      0.012        -5.086    -0.615
x6            -0.0932      0.728     -0.128      0.898        -1.520     1.334
x7            -0.2837      0.294     -0.966      0.334        -0.859     0.292
x8            -2.6869      1.016     -2.644      0.008        -4.679    -0.695
x9            -0.5792      0.392     -1.476      0.140    