In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import sklearn
import sklearn.datasets

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense

from datetime import datetime

Using TensorFlow backend.


In [2]:
class Demo_free_dataset:
    def __init__(self, file_path, demo_size):
        '''
        Splits input csv file into modeling set and demo set
        Returns X and y as dataframes in dictionary
        Saves demo set to csv
        '''
        # Read file and create initial X and y
        diagnosis_df = pd.read_csv(file_path)

        X = diagnosis_df
        y = diagnosis_df['Diagnosis'].map({'M': 1, 'B': 0})

        # Split modeling set and demp set
        X_model, demo_input, y_model, y_demo = train_test_split(X, y, test_size=demo_size)

        # Save demo set
        # demo_input.drop('Diagnosis', axis=1).to_csv('data/demo_input.csv', index=False)

        # Define modeling data set
        self.X = X_model
        self.y = y_model


In [3]:
def filter_columns(dataframe, column_set):
    '''
    Returns new dataframe based on which columns to drop
    from the original dataframe
    '''
    
    # Define which columns to drop
    if column_set == 'full':
        columns_to_drop = ['ID number', 'Diagnosis']
    elif column_set == 'engineered':
        columns_to_drop = [
            'ID number', 'Diagnosis', 
            'Area mean', 'Area SE', 'Area worst', 
            'Concave Points mean', 'Concave Points worst',
            'Perimeter mean', 'Perimeter SE', 'Perimeter worst',
            'Radius worst',
            'Texture worst'
        ]
            
    return dataframe.drop(columns_to_drop, axis=1, inplace=True)

In [4]:
class Train_test_data():
    def __init__(self, X, y, column_set):
        '''
        Fully format data for the Keras model
        '''
        
        # Split
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        
        # Filter columns for X sets, inherit columns set from above
        for dataframe in [X_train, X_test]:
            filter_columns(dataframe, column_set)
               
        # Scale X
        X_scaler = StandardScaler().fit(X_train)
        
        self.X_train_scaled = X_scaler.transform(X_train)
        self.X_test_scaled = X_scaler.transform(X_test)
        
        # Convert y to categorical
        self.y_train_categorical = to_categorical(y_train)
        self.y_test_categorical = to_categorical(y_test)

In [5]:
def train_test_model(train_test_data, column_set):
    '''
    Returns accuracy score from keras tensorflow model
    '''
    if column_set == 'full':
        input_size = 30
    elif column_set == 'engineered':
        input_size = 20

    # Create deep neural network model
    # Use 2 layers, 6 nodes each
    model = Sequential()
    model.add(Dense(units=6, activation='relu', input_dim=input_size))
    model.add(Dense(units=6, activation='relu'))
    model.add(Dense(units=2, activation='softmax'))

    # Fit model to training data
    model.compile(optimizer='adam',
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])

    model.fit(
        train_test_data.X_train_scaled,
        train_test_data.y_train_categorical,
        epochs=1000,
        shuffle=True,
        verbose=0
    )
    
    model_loss, model_accuracy = model.evaluate(
        train_test_data.X_test_scaled, 
        train_test_data.y_test_categorical, 
        verbose=2
    )
    
    return model_accuracy

In [6]:
# Set up file path and iteration numbers here
demo_free_data = Demo_free_dataset('data/diagnosis.csv', 0.02)
iterations = 50

In [7]:
# Compare accuracy with full features and with engineered features
for column_set in ['full', 'engineered']:
    
    accuracy_scores = []
    
    # Run x times to collect accuracy scores
    for i in range(0, iterations):
        
        # Progress check
        timestamp = datetime.strftime(datetime.now(), '%H:%M:%S.%f')
        print(f'Column Set: {column_set}, Iteration: {i}, Time: {timestamp}')
        
        # Randomly split train test data and format for modeling
        tt_data = Train_test_data(demo_free_data.X, demo_free_data.y, column_set)
        # Train model, test, and record accuracy
        accuracy_scores.append(train_test_model(tt_data, column_set))
    
    # Export to csv
    df = pd.DataFrame({'accuracy': accuracy_scores})
    df.to_csv(f'data/{column_set}_features_accuracy_scores.csv', index=False)

Column Set: full, Iteration: 0, Time: 14:25:57.399280


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Column Set: full, Iteration: 1, Time: 14:26:10.056730
Column Set: full, Iteration: 2, Time: 14:26:23.126822
Column Set: full, Iteration: 3, Time: 14:26:36.353796
Column Set: full, Iteration: 4, Time: 14:26:49.646195
Column Set: full, Iteration: 5, Time: 14:27:03.037585
Column Set: full, Iteration: 6, Time: 14:27:16.512598
Column Set: full, Iteration: 7, Time: 14:27:29.971334
Column Set: full, Iteration: 8, Time: 14:27:43.775677
Column Set: full, Iteration: 9, Time: 14:27:57.702344
Column Set: full, Iteration: 10, Time: 14:28:11.676444
Column Set: full, Iteration: 11, Time: 14:28:25.839550
Column Set: full, Iteration: 12, Time: 14:28:40.182256
Column Set: full, Iteration: 13, Time: 14:28:54.632237
Column Set: full, Iteration: 14, Time: 14:29:09.216152
Column Set: full, Iteration: 15, Time: 14:29:23.935422
Column Set: full, Iteration: 16, Time: 14:29:39.083214
Column Set: full, Iteration: 17, Time: 14:29:54.206168
Column Set: full, Iteration: 18, Time: 14:30:09.366870
Column Set: full, I

In [8]:
full_accuracies = pd.read_csv('data/full_features_accuracy_scores.csv')
engineered_accuracies = pd.read_csv('data/engineered_features_accuracy_scores.csv')

In [9]:
full_list = full_accuracies['accuracy'].tolist()
engineered_list = engineered_accuracies['accuracy'].tolist()

In [10]:
compare_df = pd.DataFrame({
    'full': full_list,
    'engineered': engineered_list
})

In [11]:
compare_df.describe()

Unnamed: 0,engineered,full
count,50.0,50.0
mean,0.957842,0.966475
std,0.017259,0.014795
min,0.913669,0.928058
25%,0.944245,0.956835
50%,0.956835,0.967626
75%,0.971223,0.976619
max,0.992806,0.992806
