# Using a SOM to predict cancer type

In [None]:
#First, we import the modules that we're going to use
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

np.random.seed(0)

print('Setup complete')

In [None]:
#Paste the SOM code I wrote previously

def eta(max_eta, t):
    """Return the learning rate used at iteration step ``t``.

    The rate is ``max_eta`` divided by ``t``, so it equals ``max_eta`` at
    ``t == 1`` and decays as ``t`` grows.

    @param max_eta expects a float value as the maximum learning rate
    @param t expects an int or float value; it is used as a divisor, so it
           must not be zero
    @out returns max_eta/t
    """
    decayed_rate = max_eta / t
    return decayed_rate

def sigma(max_sigma, t):
    """Return the width of the Gaussian neighbourhood at iteration step ``t``.

    @param max_sigma expects the maximum width
    @param t expects an int or float value; it is used as a divisor, so it
           must not be zero
    @out returns max_sigma/t
    """
    shrunk_width = max_sigma / t
    return shrunk_width

def Euclidean_dist(x,w_m):
    """Return the Euclidean distance between two vectors.

    @params x,w_m expect 1D arrays of the same length
    @out returns the square root of the dot product of (x - w_m) with itself
    """
    offset = x - w_m
    squared_length = np.dot(offset, offset)
    return np.sqrt(squared_length)

def h(sigma_0,t,D,d_min):
    """Gaussian neighbourhood weight of every output node.

    The weight of a node is ``exp(-(D[node] - D[d_min])**2 / (2*s**2))`` with
    ``s = sigma(sigma_0, t)``, so the BMU itself always gets weight 1.

    @param sigma_0 expects the max_sigma of the network
    @param t expects a given iteration step
    @param D expects the values of the output nodes (the SOM class passes the
           distance of each node to the current sample)
    @param d_min expects the index of the BMU
    @out returns an array shaped like D with one weight per node
    """
    # Value of the BMU repeated for every node, then the squared gap to it.
    bmu_level = np.ones(np.shape(D))*D[d_min]
    squared_gap = np.power((D-bmu_level), 2)
    # Negative inverse of twice the squared width of the Gaussian at step t.
    scale = -1/(2*sigma(sigma_0, t)**2)
    return np.exp(squared_gap*scale)
    
    
class SOM(object):
    """Minimal Self Organizing Map backed by a single weight matrix.

    Column ``j`` of ``W1`` is the weight vector of output node ``j``.
    Training presents the samples one by one, in order, and uses the 1-based
    position of the sample as the iteration step ``t``.
    """

    def __init__(self, sigma, eta, size):
        """Create a map whose weights are drawn with ``np.random.rand``.

        @param sigma expects the maximum width of the Gaussian neighbourhood
        @param eta expects the maximum learning rate
        @param size expects a two tuple with the input
               and output number of neurons
        """
        #parameters
        self.sigma = sigma
        self.eta = eta
        self.inputLayerSize = size[0]   # X1, X2, X3, ...
        self.outputLayerSize = size[1]  # Y1, Y2, Y3, ...

        # build weights of each layer, set to random values
        self.W1 = np.random.rand(self.inputLayerSize, self.outputLayerSize)

    def find_nodes_and_BMU(self, x):
        """Find the Best Matching Unit (BMU) for one sample.

        @param x expects one sample with inputLayerSize values
        @out returns (D, bmu): D is the array of Euclidean distances from x to
             every node, bmu is the int index of the closest node (the first
             one when several are tied)
        """
        sample = np.asarray(x, dtype=float).reshape(self.inputLayerSize, 1)
        # One column per node, so the norm along axis 0 yields one distance
        # per node without a Python-level loop.
        D = np.linalg.norm(sample - self.W1, axis=0)
        return D, int(np.argmin(D))

    @staticmethod
    def _pull_towards(weights, x, rate, influence):
        """Return ``weights`` with every node moved towards ``x``.

        @param weights expects the (n_inputs, n_nodes) weight matrix
        @param x expects one sample with n_inputs values
        @param rate expects the learning rate of the current step
        @param influence expects one neighbourhood weight per node
        @out returns a new (n_inputs, n_nodes) matrix
        """
        offsets = np.asarray(x, dtype=float).reshape(-1, 1) - weights
        # Element-wise product on purpose: column j (node j) moves by its own
        # offset scaled by its own neighbourhood weight. A matrix product here
        # would add up the offsets of all nodes into every column.
        return weights + rate * offsets * influence

    def update(self, x, t):
        """Correct the weights in W1 with one sample.

        @param x expects one sample with inputLayerSize values
        @param t expects the iteration step, starting at 1
        """
        D, d_min = self.find_nodes_and_BMU(x)
        influence = h(self.sigma, t, D, d_min)
        self.W1 = self._pull_towards(self.W1, x, eta(self.eta, t), influence)

    def save_weights(self):
        """Write W1 to ``weights.txt`` in the current working directory."""
        # save this in order to reproduce our cool network
        np.savetxt("weights.txt", self.W1, fmt="%s")

    def predict_output(self, x):
        """Return the index of the BMU for an input x and the trained weights."""
        _, d_min = self.find_nodes_and_BMU(x)
        return d_min

    def train(self, data):
        """Train on every row of ``data``, in order.

        @param data expects a DataFrame, an ndarray, or anything np.asarray
               turns into a 2D array with one sample per row
        """
        samples = data.to_numpy() if isinstance(data, pd.DataFrame) else np.asarray(data)
        # t starts at 1 because the decay functions divide by it.
        for step, sample in enumerate(samples, start=1):
            self.update(sample, step)

    def predict(self, data):
        """Append the BMU index of every row as an extra last column.

        @param data expects a DataFrame, an ndarray, or anything np.asarray
               turns into a 2D array with one sample per row
        @out returns a DataFrame when data is a DataFrame, an ndarray
             otherwise. The DataFrame is built from the plain matrix, so its
             columns are labelled 0..n_inputs and its index is the default
             one; the BMU index is stored as a float.
        """
        is_frame = isinstance(data, pd.DataFrame)
        samples = data.to_numpy() if is_frame else np.asarray(data)
        labels = [self.predict_output(sample) for sample in samples]
        # reshape keeps the (n, 1) column shape even when there are no rows
        out_values = np.array(labels, dtype=float).reshape(-1, 1)
        result = np.hstack((samples, out_values))
        if is_frame:
            return pd.DataFrame(data=result)
        return result
        
print('SOM setup complete')

In [None]:
# Read the CSV and discard its 'Unnamed: 32' column
data_path = '../input/breast-cancer-wisconsin-data/data.csv'

data = pd.read_csv(data_path).drop(columns='Unnamed: 32')

# Independent copies of the rows diagnosed 'B' and of the rows diagnosed 'M'
b_data = data.loc[data.diagnosis == 'B'].copy()
m_data = data.loc[data.diagnosis == 'M'].copy()

## Size of the samples

In [None]:
# Row counts of each diagnosis subset and of the whole table
print('Benign tumors sample size: ' + str(len(b_data)))
print('\nMalignant tumors sample size: ' + str(len(m_data)))

print('\nTotal data sample size: ' + str(len(data)))

## Data features

In [None]:
# Print every column name of the table, one per line
print('Data features: ')
for column_name in data.columns:
    print('- ' + column_name)

We can see that there are actually 10 unique features apart from the 'Diagnosis' condition, with three different measured metrics (mean, se, worst).

Let's see if we can use the Id column as an index:

In [None]:
len(data.id.unique())

There are as many different Ids as there are rows, so we can use this column as the new index

In [None]:
# Use the 'id' column as the index of all three tables
data, b_data, m_data = (
    frame.set_index('id') for frame in (data, b_data, m_data)
)

## Visualization of the data

First we present the first rows of both datasets.

In [None]:
b_data.head(5)

In [None]:
m_data.head(5)

Let's try to find insights by visualizing some features of the datasets.

In [None]:
def plot():
    """Open a new 13x8 figure and store it in the module-level name ``fig``."""
    global fig
    canvas_size = (13, 8)
    fig = plt.figure(figsize=canvas_size)

In [None]:
# Overlay the radius_mean distributions of both subsets on one figure.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm the installed version still provides it.
plot()
sns.distplot(b_data['radius_mean'], label='Benign')
sns.distplot(m_data['radius_mean'], label='Malignant')
plt.legend()
plt.title('Radius Histogram for Benign and Malignant Tumors')

As an example, we can see that the average radius (mean) of the malignant tumors is bigger than the radius of their benign counterparts. So we could include this feature in the input of the SOM.

In [None]:
# Overlay the texture_mean distributions of both subsets on one figure.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm the installed version still provides it.
plot()
sns.distplot(b_data['texture_mean'], label='Benign')
sns.distplot(m_data['texture_mean'], label='Malignant')
plt.legend()
plt.title('Texture Histogram for Benign and Malignant Tumors')

### Radius and Compactness data:

With a scatter plot, we can see if there's a relationship between the radius of a tumor and its compactness (mean):

In [None]:
# Scatter of radius_mean against compactness_mean, coloured by diagnosis
plot()
sns.scatterplot(data=data, x='radius_mean', y='compactness_mean', hue='diagnosis')
plt.title('Radius (mean) vs. Compactness (mean)')

In [None]:
# Same two columns over the whole table, with seaborn's regression fit drawn
plot()
sns.regplot(data=data, x='radius_mean', y='compactness_mean')
plt.title('Linear Regression of Radius (mean) vs Compactness (mean)')

Let's check if there are relationships that involve the other measures of Radius and Compactness (se and worst):

In [None]:
# radius_se (x) against radius_mean (y) for both subsets on one figure:
# the benign points are drawn without a fit (fit_reg=False), the malignant
# points with a fit of the form y ~ log(x) (logx=True).
plot()
sns.regplot(data=b_data, y='radius_mean', x='radius_se', fit_reg=False, label='Benign')
sns.regplot(data=m_data, y='radius_mean', x='radius_se', logx=True, label='Malignant')
plt.legend()
plt.title('Radius (se) vs Radius (mean)')

In [None]:
# First transformation: each new column is the product of the matching
# *_mean and *_se columns
trf_data_1 = data.copy()
for feature in ('radius', 'compactness'):
    trf_data_1[feature] = trf_data_1[feature + '_mean'] * trf_data_1[feature + '_se']
trf_data_1.head()

In [None]:
# Scatter of the two product columns of trf_data_1, coloured by diagnosis
plot()
sns.scatterplot(x='radius', y='compactness', data=trf_data_1, hue='diagnosis')
plt.title('Trf1 Radius vs Trf1 Compactness')

Let's try with another transformation. In this case we're going to normalize the data by dividing each mean feature by its maximum value:

In [None]:
# Second transformation: each *_mean column multiplied by the reciprocal of
# its own maximum, so the largest value of each new column is 1
trf_data_2 = data.copy()
for feature in ('radius', 'compactness'):
    mean_column = data[feature + '_mean']
    trf_data_2[feature] = mean_column * (1/mean_column.max())

In [None]:
# Scatter of the two rescaled columns of trf_data_2, coloured by diagnosis
plot()
sns.scatterplot(x='radius', y='compactness', data=trf_data_2, hue='diagnosis')
plt.title('Trf2 Radius vs Trf2 Compactness')

### Fractal dimension of a tumor:

In [None]:
# Overlay the fractal_dimension_mean distributions of both subsets.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm the installed version still provides it.
plot()
sns.distplot(b_data['fractal_dimension_mean'], label='Benign')
sns.distplot(m_data['fractal_dimension_mean'], label='Malignant')
plt.legend()
plt.title('Fractal Dimension (mean) Histogram')

It doesn't seem to be a good indicator of the type of tumor.

## Predicting the cancer type using a Self Organizing Map

First, we're going to make a train-test split of the data:

(WARNING: Changing the random seed RS may require you to manually re-map the SOM results to the Benign-Malignant classification)

In [None]:
from sklearn.model_selection import train_test_split

# RS is reused below as the seed of the split, of the SOM weights and of the
# random forest.
RS = 0 #General random state
target = 'diagnosis'
# Every column except the target is a candidate feature
all_features = list(data.columns)
all_features.remove(target)

# No test_size is given, so scikit-learn's default split proportion applies
X_train, X_test, y_train, y_test = train_test_split(data[all_features], data[target], random_state=RS)

### Success metrics

We're going to use the same validation metrics for every attempt so that they can be compared quantitatively. In this case those metrics are the Mean Absolute Error (MAE), the Mean Squared Error (MSE) and the Accuracy Score.

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score

### Pipeline

In [None]:
def run_SOM(sigma,eta,size,train_data,test_data):
    """Build and train a fresh SOM, then label ``test_data`` with it.

    The global NumPy seed is reset to ``RS`` before the SOM is built, so the
    random initial weights are the same on every call.

    @param sigma expects the maximum neighbourhood width passed to SOM
    @param eta expects the maximum learning rate passed to SOM
    @param size expects the (inputs, outputs) tuple passed to SOM
    @param train_data expects an ndarray or DataFrame of training samples
    @param test_data expects a DataFrame of samples to label
    @out returns a DataFrame with the columns and the index of test_data plus
         a last column named 'Predictions' holding the BMU index of each row
    """
    np.random.seed(RS)
    som = SOM(sigma,eta,size)
    som.train(train_data)
    preds = som.predict(test_data)

    # predict() labels its columns by position, with the BMU index last:
    # give the feature columns their names back and name the extra one.
    column_names = test_data.columns.to_list() + ['Predictions']
    preds = preds.rename(axis=1, mapper=dict(enumerate(column_names)))
    # predict() also returns a default index; the rows are still in the order
    # of test_data, so its index can be restored to keep the sample ids.
    preds.index = test_data.index

    return preds

def format_data(p,B,M):
    """Translate one diagnosis label into the value chosen for it.

    Used to format the validation data so that the model can be scored.

    @param p expects the label to translate
    @param B expects the value returned when p is 'B'
    @param M expects the value returned when p is 'M'
    @out returns B, M, or None when p is neither 'B' nor 'M'
    """
    return B if p == 'B' else (M if p == 'M' else None)

def score_model(val_data, predictions, model_name):
    """Print three scores of ``predictions`` against ``val_data``.

    The scores are, in order, mean_absolute_error, mean_squared_error
    (printed under the heading 'MSD') and accuracy_score. Each one is printed
    as a heading line followed by the value.

    @param val_data expects the true values
    @param predictions expects the predicted values
    @param model_name expects the name shown in the headings
    """
    reports = (
        ('MAE Score for %s: ', mean_absolute_error),
        ('\nMSD Score for %s: ', mean_squared_error),
        ('\nAccuracy Score for %s: ', accuracy_score),
    )
    for heading, metric in reports:
        value = metric(val_data, predictions)
        print(heading % model_name)
        print(value)

### Attempt 1:

We can try to use the radius and the compactness of a tumor to guess its diagnosis.

In [None]:
# Attempt 1.1 inputs: the mean, se and worst columns of radius and compactness
SOM_fts_1_1 = ['radius_mean', 'compactness_mean', 'radius_se', 'compactness_se',
             'radius_worst', 'compactness_worst']

X_train_1_1, X_test_1_1 = X_train[SOM_fts_1_1], X_test[SOM_fts_1_1]

In [None]:
# 6 inputs (one per selected feature) and 2 output nodes
size_1_1 = (6,2)

# sigma=3, eta=1
preds_1_1 = run_SOM(3,1,size_1_1,X_train_1_1,X_test_1_1)
preds_1_1.head()

In [None]:
y_test.head()

So far so good, now we need to map the Bs and Ms in the y_test series to a 1 or a 0.

In [None]:
# args=(1,0) are format_data's (B, M): 'B' becomes 1 and 'M' becomes 0
mapped_test = y_test.apply(func=format_data,args=(1,0))

In [None]:
mapped_test.head()

We're now ready to estimate the accuracy of the model with the success metrics.

In [None]:
score_model(mapped_test, preds_1_1.Predictions, 'SOM 1.1')

This looks pretty good!

In [None]:
#Let's see how well a Random Forest works in comparison
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators=100, random_state=RS)
# Training labels are encoded with (B, M) = (1, 0).
# NOTE(review): mapped_test must still hold the same encoding when this cell runs.
RF.fit(X_train_1_1, y_train.apply(func=format_data, args=(1,0)))
RF_preds = RF.predict(X_test_1_1)
score_model(mapped_test, RF_preds, 'Random Forest')

Although a random forest did better.

Now we can try to use the normalized data:

In [None]:
SOM_fts_1_2 = ['radius', 'compactness']

X_train_1_2, X_test_1_2 = X_train.copy(), X_test.copy()

# Add a rescaled copy of each *_mean column to both splits.
# NOTE(review): the maximum comes from `data`, i.e. from the train and the
# test rows together.
for split_in, split_out in ((X_train, X_train_1_2), (X_test, X_test_1_2)):
    for feature in SOM_fts_1_2:
        mean_column = feature + '_mean'
        split_out[feature] = split_in[mean_column] * (1/data[mean_column].max())

# Show the original and the rescaled columns side by side
for split_out in (X_train_1_2, X_test_1_2):
    print(split_out[['radius_mean','radius','compactness_mean','compactness']].head())

In [None]:
#Let's select the features we want (only the two rescaled columns are kept)
X_train_1_2, X_test_1_2 = X_train_1_2[SOM_fts_1_2], X_test_1_2[SOM_fts_1_2]
#Now, we define the new model we want to test: 2 inputs and 2 output nodes
size_1_2 = (2,2)

# sigma=3, eta=1
preds_1_2 = run_SOM(3,1,size_1_2,X_train_1_2,X_test_1_2)
preds_1_2.head()

In [None]:
# args=(0,1) are format_data's (B, M): 'B' becomes 0 and 'M' becomes 1
mapped_test = y_test.apply(func=format_data, args=(0,1))
score_model(mapped_test, preds_1_2.Predictions, 'SOM 1.2')

It did slightly better than SOM 1.1.

### Attempt 2:

In this attempt we're going to use every feature present in the data.

In [None]:
# SOM_fts_2_1 is another name for the all_features list itself, not a copy
SOM_fts_2_1 = all_features
X_train_2_1, X_test_2_1 = X_train, X_test
# One input per feature and 2 output nodes
size_2_1 = (len(SOM_fts_2_1),2)

# sigma=3, eta=1
preds_2_1 = run_SOM(3,1,size_2_1,X_train_2_1,X_test_2_1)
preds_2_1.head()

In [None]:
# args=(0,1) are format_data's (B, M): 'B' becomes 0 and 'M' becomes 1
mapped_test = y_test.apply(func=format_data, args=(0,1))
score_model(mapped_test, preds_2_1.Predictions, 'SOM 2.1')

This has been the best version so far. The accuracy of a SOM seems to increase with the amount of features we input.

We saw that the fractal dimension (mean) feature didn't show a correlation with the tumor diagnosis. Let's see what happens if we remove it:

In [None]:
# Work on a copy so that all_features itself keeps every column
SOM_fts_2_2 = all_features.copy()
SOM_fts_2_2.remove('fractal_dimension_mean')

X_train_2_2, X_test_2_2 = X_train[SOM_fts_2_2], X_test[SOM_fts_2_2]
# One input per remaining feature and 2 output nodes
size_2_2 = (len(SOM_fts_2_2),2)

# sigma=3, eta=1
preds_2_2 = run_SOM(3,1,size_2_2,X_train_2_2,X_test_2_2)
preds_2_2.head()

In [None]:
# args=(0,1) are format_data's (B, M): 'B' becomes 0 and 'M' becomes 1
mapped_test = y_test.apply(func=format_data, args=(0,1))
score_model(mapped_test, preds_2_2.Predictions, 'SOM 2.2')

Same score as SOM 2.1.

### Attempt 3
In this case, we'll try to find the optimal parameters for the SOM. Assuming there's a global optimum for the accuracy of a SOM based on the success metrics, first we're going to find the best value for the **sigma** parameter, and then for the **eta** parameter.

We'll be using SOM 2.1 as the model.

In [None]:
X_train_3_1, X_test_3_1 = X_train.copy(), X_test.copy()
size_3_1 = size_2_1
mapped_test = y_test.apply(func=format_data, args=(0,1))
# Score the model for sigma = 1..5 while eta stays at 1
for sigma_candidate in range(1, 6):
    print('Sigma = %d' % sigma_candidate)
    preds = run_SOM(sigma_candidate, 1, size_3_1, X_train_3_1, X_test_3_1)
    score_model(mapped_test, preds.Predictions, 'SOM 3.1')
    print('\n')

The sigma value doesn't seem to affect the results. Let's try with the **eta** parameter:

In [None]:
X_train_3_2, X_test_3_2 = X_train.copy(), X_test.copy()
size_3_2 = size_2_1
mapped_test = y_test.apply(func=format_data, args=(0,1))
# Score the model for several values of eta while sigma stays at 3
for eta_candidate in (0.1, 0.2, 0.5, 1, 2):
    print('Eta = %.1f' % eta_candidate)
    preds = run_SOM(3, eta_candidate, size_3_2, X_train_3_2, X_test_3_2)
    score_model(mapped_test, preds.Predictions, 'SOM 3.2')
    print('\n')

It seems to converge to a score that's very close to the one obtained for SOM 2.1, but this difference could be caused by the small size of the sample. We will assume we already had near-optimal values for the parameters.

Finally, we'll do a cross-validation of the SOM 2.1 to get an average score, given the small size of the sample.

In [None]:
from sklearn.model_selection import GroupKFold

def cross_val_SOM(sigma, eta, size, X, y, n_splits=5):
    """Print the mean, max and min fold accuracy of a SOM.

    Every row gets a random group id in [0, n_splits) and the folds are
    produced by GroupKFold from those ids. A new SOM is built and trained
    through run_SOM for each fold.

    @param sigma,eta,size are passed to run_SOM unchanged
    @param X expects a DataFrame with the features
    @param y expects a Series with the 'B'/'M' labels of the rows of X
    @param n_splits expects the number of folds
    """
    np.random.seed(RS)
    splitter = GroupKFold(n_splits=n_splits)
    fold_ids = np.random.randint(low=0, high=n_splits, size=(X.shape[0]), dtype='int64')

    # Labels are encoded as 'B' -> 1 and 'M' -> 0
    benign_code, malignant_code = 1, 0
    fold_scores = []

    for fit_rows, eval_rows in splitter.split(X, y, fold_ids):
        # Build, train and apply the SOM on this fold
        fold_preds = run_SOM(sigma, eta, size, X.iloc[fit_rows], X.iloc[eval_rows])
        fold_truth = y.iloc[eval_rows].apply(
            func=format_data, args=(benign_code, malignant_code))

        fold_accuracy = accuracy_score(fold_truth, fold_preds.Predictions)
        # An accuracy at or below 0.5 is read as the two node labels being
        # swapped with respect to the encoding, so its complement is kept.
        if fold_accuracy <= 0.5:
            fold_accuracy = 1 - fold_accuracy
        fold_scores.append(fold_accuracy)

    # A pandas Series provides mean/max/min directly
    summary = pd.Series(data=fold_scores)

    print('Mean score: ' + str(summary.mean()))
    print('Max score: ' + str(summary.max()))
    print('Min score: ' + str(summary.min()))

In [None]:
# 4-fold cross-validation over the whole table with sigma=2, eta=1.
# NOTE(review): SOM 2.1 was run with sigma=3 — confirm that sigma=2 is intended here.
cross_val_SOM(2, 1, size_2_1, X=data[all_features], y=data[target], n_splits=4)

In [None]:
# Same cross-validation with eta=2 instead of 1 (sigma stays at 2)
cross_val_SOM(2, 2, size_2_1, X=data[all_features], y=data[target], n_splits=4)

SOM 2.1 seems to have a better mean score than the alternative.