## Import the necessary modules

In [None]:
%matplotlib inline
import pandas as pd
import numpy
import scipy
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import seaborn as sns

## Retrieve necessary data and load into Pandas DataFrame

In [None]:
# Location of the raw breast-cancer-wisconsin data file on the UCI archive.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

# Column labels to attach to the parsed file, in file order.
names = [
    'code',
    'cl_thick',
    'c_size',
    'c_shape',
    'marg_adh',
    's_ep_c_size',
    'b_nuclei',
    'b_chromatin',
    'n_nucleoli',
    'mitoses',
    'diag',
]

data = pd.read_csv(url, names=names)

# NumPy view of the frame exactly as loaded (all eleven columns).
array = data.values

Take an initial look at the data

In [None]:
# Preview the first rows of the loaded frame.
data.head()

Since the "code" column is just an arbitrary identifier, let's remove it

In [None]:
# Remove the identifier column by rebinding `data` instead of mutating in place.
# errors='ignore' keeps this cell re-runnable: once 'code' is gone, a second
# execution is a no-op rather than a KeyError.
data = data.drop('code', axis=1, errors='ignore')
data.head()

...and take a look at the shape of the data (699 rows, 10 columns)

In [None]:
# (rows, columns) of the frame.
data.shape

Now, let's take a deeper look at the data

In [None]:
pd.set_option('display.width', 100)
# Use the fully qualified option key. The bare 'precision' pattern also matches
# 'styler.format.precision' on pandas versions that define it, which makes
# set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.precision', 3)
# Summary statistics; describe() reports numeric columns only by default.
description = data.describe()
print(description)

Did you notice something? The b_nuclei column was excluded. Let's dig a bit to see why.

In [None]:
# Per-column dtypes of the frame.
data.dtypes

It looks like Pandas has identified the b_nuclei column as an 'object' instead of as a number - here's why:

In [None]:
# Distinct values found in b_nuclei. Series.unique() returns the same array as
# pd.unique(series.ravel()) without going through the deprecated Series.ravel().
data.b_nuclei.unique()

The '?'s in that column made it treat the whole column as an object. In order to use this column we will have to convert the
non-numeric values to numeric values and convert this column type to a numeric one. 

In [None]:
# Disabled alternative: in-place Series.replace of '?' with '-1'.
# The next cell performs this substitution instead.
#data.b_nuclei.replace(['?'],['-1'],inplace=True)

In [None]:
# Swap the '?' placeholder for the string '-1', a sentinel that can be parsed as an integer;
# every other entry passes through untouched.
data['b_nuclei'] = data['b_nuclei'].map(lambda value: '-1' if value == '?' else value)
# Cast the whole column to an integer dtype.
data['b_nuclei'] = data['b_nuclei'].astype('int')

In [None]:
# Average b_nuclei over the rows that do NOT hold the -1 sentinel, rounded to a whole number.
mean = int(round(data.loc[data.b_nuclei != -1, 'b_nuclei'].mean()))
print('The mean of the column is: ' + str(mean))
# Overwrite every sentinel entry with that rounded mean; all other values stay as they are.
data['b_nuclei'] = data['b_nuclei'].replace(-1, mean)

If we take another look at all of the unique values in the b_nuclei column, we can see that '?' is gone and the data type is an
int64.

In [None]:
# Distinct values found in b_nuclei. Series.unique() returns the same array as
# pd.unique(series.ravel()) without going through the deprecated Series.ravel().
data.b_nuclei.unique()

When we describe the data again, we will see that the b_nuclei column is included in the results

In [None]:
pd.set_option('display.width', 100)
# Use the fully qualified option key. The bare 'precision' pattern also matches
# 'styler.format.precision' on pandas versions that define it, which makes
# set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.precision', 3)
# Summary statistics; describe() reports numeric columns only by default.
description = data.describe()
print(description)

The next step is to look at the correlation between the columns and the diagnosis to identify which ones should be used to train
our model

In [None]:
# Pearson correlation coefficient for every pair of columns, shown as the cell output.
data.corr('pearson')

This is easier to understand using a graph - in this case a heatmap. Using the scale on the right, we can see that colors that are darker red are closer to 1.0 (meaning a perfect correlation between the row and the column), colors that are yellower are closer to 0.0 (no correlation), and colors that are darker green are closer to -1.0 (meaning a negative correlation - as one goes up the other goes down).

In [None]:
# Pairwise Pearson correlations between all columns.
correlations = data.corr(method='pearson')

# Draw the matrix as a heatmap with the color scale pinned to the full [-1, 1] range.
sns.heatmap(
    correlations,
    vmin=-1.0,
    vmax=1.0,
    cmap='RdYlGn_r',
    linewidths=2.5,
)

It looks like c_size (0.818), c_shape (0.819), and b_nuclei (0.815) have the strongest correlation with the diagnosis, while marg_adh (0.697) and mitoses (0.423) have the weakest (numbers taken from the printed correlation table above).

Since this is a small dataset and all of the correlations are "pretty good", we will keep all of the columns for now and look at the accuracy of our model.

The next step is to look at the accuracy of our model.

In [None]:
# Row counts: [rows where diag == 2, rows with any other diag value].
[len(data[data.diag == 2]), len(data[data.diag != 2])]

In [None]:
# One histogram per column of the frame.
data.hist()
plt.show()

In [None]:
# One density curve per column, each on its own subplot in a 5x2 grid with independent x-axes.
data.plot.density(subplots=True, layout=(5, 2), sharex=False)
plt.show()

In [None]:
# Disabled scratch code, kept for reference: calculate the correlations for each
# listed coefficient and collect the transposed matrices in a dict keyed by method.
# coefficients = ['pearson'] #, 'kendall', 'spearman']

# csv_corr = {}

# for coefficient in coefficients:
#     csv_corr[coefficient] = data \
#         .corr(method=coefficient) \
#         .transpose()
        
# print(csv_corr)

In [None]:
# Pearson correlation matrix, transposed, shown as the cell output.
# (Other candidate coefficients noted by the author: 'kendall', 'spearman'.)
data.corr(method='pearson').T

In [None]:
array = data.values
# Split features and target by column NAME rather than by position.
# With 'code' removed the frame has ten columns, so the previous positional
# slices (array[:,0:8] / array[:,8]) left 'mitoses' out of the features and
# used it as the target instead of 'diag'.
X = data.drop('diag', axis=1).values
Y = data['diag'].values
# Show one feature row as a sanity check.
X[1]
# # feature extraction
# test = SelectKBest(score_func=chi2, k=4)
# fit = test.fit(X, Y)
# # summarize scores
# numpy.set_printoptions(precision=3)
# print(fit.scores_)
# features = fit.transform(X)
# # summarize selected features
# #print(features[0:5,:])