# Build a Multi-Layer Perceptron and train it to classify masses as benign or malignant based on its features

In [2]:
import pandas as pd

masses_df = pd.read_csv('mammographic_masses.data.txt')
masses_df.head()

Unnamed: 0,5,67,3,5.1,3.1,1
0,4,43,1,1,?,1
1,5,58,4,5,3,1
2,4,28,1,1,3,0
3,5,74,1,5,?,1
4,4,65,1,?,3,0


### Convert missing data "?" into a NaN and add column names

In [3]:
masses_df = pd.read_csv('mammographic_masses.data.txt', na_values = ['?'], names = ["BI-RADS", "age", "shape", "margin", "density", "severity"] )
masses_df.head()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1


In [4]:
print(masses_df.describe())
masses_df.shape  # Shape of data

          BI-RADS         age       shape      margin     density    severity
count  959.000000  956.000000  930.000000  913.000000  885.000000  961.000000
mean     4.348279   55.487448    2.721505    2.796276    2.910734    0.463059
std      1.783031   14.480131    1.242792    1.566546    0.380444    0.498893
min      0.000000   18.000000    1.000000    1.000000    1.000000    0.000000
25%      4.000000   45.000000    2.000000    1.000000    3.000000    0.000000
50%      4.000000   57.000000    3.000000    3.000000    3.000000    0.000000
75%      5.000000   66.000000    4.000000    4.000000    3.000000    1.000000
max     55.000000   96.000000    4.000000    5.000000    4.000000    1.000000


(961, 6)

### Check to see if dropping the data is an option. 
    Does the missing data appear to be randomly distributed? 
    Would it introduce bias? 
    Check to see if there are any correlations with data that has missing fields. 
        If there are, then may need to fill that data in. 


In [21]:
masses_df.loc[(masses_df["BI-RADS"].isnull()) |
             (masses_df["age"].isnull()) |
             (masses_df["shape"].isnull()) | 
             (masses_df["margin"].isnull()) |
             (masses_df["density"].isnull())].head(60)


Unnamed: 0,BI-RADS,age,shape,margin,density,severity
1,4.0,43.0,1.0,1.0,,1
4,5.0,74.0,1.0,5.0,,1
5,4.0,65.0,1.0,,3.0,0
6,4.0,70.0,,,3.0,0
7,5.0,42.0,1.0,,3.0,0
9,5.0,60.0,,5.0,1.0,1
12,4.0,64.0,1.0,,3.0,0
19,4.0,40.0,1.0,,,0
20,,66.0,,,1.0,1
22,4.0,43.0,1.0,,,0


### Make copy of the data, then drop missing values for now


In [5]:
mm_df = masses_df.copy()
mm_df

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
0,5.0,67.0,3.0,5.0,3.0,1
1,4.0,43.0,1.0,1.0,,1
2,5.0,58.0,4.0,5.0,3.0,1
3,4.0,28.0,1.0,1.0,3.0,0
4,5.0,74.0,1.0,5.0,,1
...,...,...,...,...,...,...
956,4.0,47.0,2.0,1.0,3.0,0
957,4.0,56.0,4.0,5.0,3.0,1
958,4.0,64.0,4.0,5.0,3.0,0
959,5.0,66.0,4.0,5.0,3.0,1


In [6]:
mm_df.dropna(inplace = True)
mm_df.describe()

Unnamed: 0,BI-RADS,age,shape,margin,density,severity
count,830.0,830.0,830.0,830.0,830.0,830.0
mean,4.393976,55.781928,2.781928,2.813253,2.915663,0.485542
std,1.888371,14.671782,1.242361,1.567175,0.350936,0.500092
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,46.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,55.0,96.0,4.0,5.0,4.0,1.0


### Before passing to scikitlearn to do modeling, convert Pandas dataframes into NumPy arrays, so that it can be used by scikitlearn
    Need to create an array that extracts only the feature data that we will work with, namely 
        (age,shape,margin, density)
    Need to create an array that contains the classes or severity
    Need to create an array of the feature name labels
    

In [7]:
features = mm_df[["age","shape","margin","density"]].values
classes = mm_df["severity"].values
feature_names = ["age","shape","margin","density"]
features

array([[67.,  3.,  5.,  3.],
       [58.,  4.,  5.,  3.],
       [28.,  1.,  1.,  3.],
       ...,
       [64.,  4.,  5.,  3.],
       [66.,  4.,  5.,  3.],
       [62.,  3.,  3.,  3.]])

## Normalize Data
Notice theres a large range in the data between the age range and the other columns. If this data was used as is, the age would have a bigger weight on the results than anything else

#### Make sure everything is centered to the mean for each column and scale down to the same range so that they have the same weight.

In [8]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler()
features_scaled = scaler.fit_transform(features)
features_scaled

array([[ 0.7650629 ,  0.17563638,  1.39618483,  0.24046607],
       [ 0.15127063,  0.98104077,  1.39618483,  0.24046607],
       [-1.89470363, -1.43517241, -1.157718  ,  0.24046607],
       ...,
       [ 0.56046548,  0.98104077,  1.39618483,  0.24046607],
       [ 0.69686376,  0.98104077,  1.39618483,  0.24046607],
       [ 0.42406719,  0.17563638,  0.11923341,  0.24046607]])

## Use TensorFlow's Keras API to Create a Neural Network 
#### It will learn from the data and create a neural network that can predict if a mass that it hasnt seen before is benign or malignant

In [9]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

def create_model():
    model = Sequential()
    # There will be 4 feature inputs going into 6-unit layer (can also try 4)
    model.add(Dense(6,input_dim = 4, kernel_initializer = "normal",activation = "relu"))
    # Additional Deep Layer doesnt help can remove
    # model.add(Dense(6,input_dim = 4, kernel_initializer = "normal",activation = "relu"))
    # Output layer wih a binary classification (benign or malignant)
    model.add(Dense(1,kernel_initializer = "normal", activation = "sigmoid"))
    # Compile model
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

    

In [10]:
from sklearn.model_selection import cross_val_score
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier


# Wrap Keras model in an estimator compatible with scikit_learn
estimator = KerasClassifier(build_fn = create_model, epochs = 100, verbose = 0)
# Use scikit_learn's cross_val_score to evaluate this model identically to the others
    # cross_val randomly separates the dataset into training and test datasets multiple times. 
    # It will train the model or neural network set multiple times and evaluate the model with the test dataset on data that it has never seen before
    # In this case we will do it 10 times and average out the results from each different split of training and testing data
cv_scores = cross_val_score(estimator,features_scaled, classes, cv = 10)
cv_scores.mean()



0.8024096369743348