In [81]:
# This section begins the work of preparing the intermediate data for prediction.
# Here we are dealing with mixed-type data, i.e. text and floating-point values.
# The data can be downloaded from https://www.kaggle.com/datasets/ahmad10raza/breast-cancer-data/code

# The aim of this task is to predict the patient status (dead/alive) using the various parameters/features
# described in the data.

In [82]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

In [83]:
# Read the raw breast-cancer table from the comma-separated file into a DataFrame.
df = pd.read_csv("breast_cancer_data.csv")

In [84]:
# Display the loaded DataFrame (pandas shows a truncated view of the first and last rows).
df

Unnamed: 0.1,Unnamed: 0,Patient_ID,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Date_of_Surgery,Date_of_Last_Visit,Patient_Status
0,0,TCGA-D8-A1XD,36.0,FEMALE,0.080353,0.42638,0.54715,0.273680,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,15-Jan-17,19-Jun-17,Alive
1,1,TCGA-EW-A1OX,43.0,FEMALE,-0.420320,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,26-Apr-17,09-Nov-18,Dead
2,2,TCGA-A8-A079,69.0,FEMALE,0.213980,1.31140,-0.32747,-0.234260,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,08-Sep-17,09-Jun-18,Alive
3,3,TCGA-D8-A1XR,56.0,FEMALE,0.345090,-0.21147,-0.19304,0.124270,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,25-Jan-17,12-Jul-17,Alive
4,4,TCGA-BH-A0BF,56.0,FEMALE,0.221550,1.90680,0.52045,-0.311990,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,06-May-17,27-Jun-19,Dead
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,336,,,,,,,,,,,,,,,,
337,337,,,,,,,,,,,,,,,,
338,338,,,,,,,,,,,,,,,,
339,339,,,,,,,,,,,,,,,,


In [85]:
# You can see different types of columns, including floating-point values (Protein1, Protein2, ..., Age)
# and categorical values (Tumour_Stage, ER status, ...),
# as well as NaN rows at the end of the file.

# The categorical values need to be converted to integers to build the model.
# The NaN rows need to be removed as well.

# Let's begin the work.

In [86]:
# Discard every row that contains at least one missing (NaN) value.
df = df.dropna(axis=0, how="any")

In [87]:
# Dates are not used for now, so "Date_of_Surgery" and "Date_of_Last_Visit"
# are removed together with the "Patient_ID" identifier column.
unused_columns = ["Patient_ID", "Date_of_Last_Visit", "Date_of_Surgery"]
df = df.drop(columns=unused_columns)

In [88]:
# Drop the leftover index column(s) that pandas labels "Unnamed: ..." when a
# CSV was saved together with its index. Selecting by name instead of by
# position keeps this cell idempotent: running it a second time no longer
# removes the next real column ("Age").
index_like_columns = [col for col in df.columns if str(col).startswith("Unnamed:")]
df = df.drop(columns=index_like_columns)

In [89]:
# Display the frame after dropping the NaN rows and the unused columns.
df

Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Patient_Status
0,36.0,FEMALE,0.080353,0.42638,0.54715,0.273680,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,Alive
1,43.0,FEMALE,-0.420320,0.57807,0.61447,-0.031505,II,Mucinous Carcinoma,Positive,Positive,Negative,Lumpectomy,Dead
2,69.0,FEMALE,0.213980,1.31140,-0.32747,-0.234260,III,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,Alive
3,56.0,FEMALE,0.345090,-0.21147,-0.19304,0.124270,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Modified Radical Mastectomy,Alive
4,56.0,FEMALE,0.221550,1.90680,0.52045,-0.311990,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Other,Dead
...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,36.0,FEMALE,0.231800,0.61804,-0.55779,-0.517350,III,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Simple Mastectomy,Dead
330,44.0,MALE,0.732720,1.11170,-0.26952,-0.354920,II,Infiltrating Lobular Carcinoma,Positive,Positive,Negative,Other,Dead
331,61.0,FEMALE,-0.719470,2.54850,-0.15024,0.339680,II,Infiltrating Ductal Carcinoma,Positive,Positive,Negative,Lumpectomy,Dead
332,79.0,FEMALE,0.479400,2.05590,-0.53136,-0.188480,I,Infiltrating Ductal Carcinoma,Positive,Positive,Positive,Lumpectomy,Dead


In [90]:
# Show the distinct labels found in each categorical column.
categorical_columns = ['Gender', 'Tumour_Stage', 'Histology', 'ER status',
                       'PR status', 'HER2 status', 'Surgery_type', 'Patient_Status']
for column_name in categorical_columns:
    print(column_name, df[column_name].unique())

Gender ['FEMALE' 'MALE']
Tumour_Stage ['III' 'II' 'I']
Histology ['Infiltrating Ductal Carcinoma' 'Mucinous Carcinoma'
 'Infiltrating Lobular Carcinoma']
ER status ['Positive']
PR status ['Positive']
HER2 status ['Negative' 'Positive']
Surgery_type ['Modified Radical Mastectomy' 'Lumpectomy' 'Other' 'Simple Mastectomy']
Patient_Status ['Alive' 'Dead']


In [91]:
# Encode the categorical columns as integers using explicit lookup tables.
# Series.map turns any label that is missing from its table into NaN, so every
# column is validated before anything is written back: a label without a code
# (or a second run of this cell on already-encoded data) raises a ValueError
# instead of silently filling the column with NaN.
category_codes = {
    'Gender': {'FEMALE': 1, 'MALE': 0},
    'Histology': {'Infiltrating Ductal Carcinoma': 1, 'Mucinous Carcinoma': 2,
                  'Infiltrating Lobular Carcinoma': 3},
    'Tumour_Stage': {'III': 3, 'II': 2, 'I': 1},
    'ER status': {'Positive': 1},
    'PR status': {'Positive': 1},
    'HER2 status': {'Negative': 0, 'Positive': 1},
    'Surgery_type': {'Modified Radical Mastectomy': 1, 'Lumpectomy': 2, 'Other': 3,
                     'Simple Mastectomy': 4},
    'Patient_Status': {'Alive': 1, 'Dead': 0},
}

# First pass: compute and validate all encodings without touching df, so a
# failure leaves the frame exactly as it was.
encoded_columns = {}
for column, codes in category_codes.items():
    encoded = df[column].map(codes)
    unmapped = df.loc[encoded.isna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {column!r} has values with no integer code: {list(unmapped)}"
        )
    encoded_columns[column] = encoded

# Second pass: every column validated, now overwrite the text labels.
for column, encoded in encoded_columns.items():
    df[column] = encoded


In [93]:
# Persist the encoded table (without the index) so later examples can reuse it.
df.to_csv("preprocessed.csv", index=False)


In [94]:
# Read the encoded table back from disk.
df = pd.read_csv("preprocessed.csv")

In [95]:
# Display the reloaded frame; every column is now numeric.
df

Unnamed: 0,Age,Gender,Protein1,Protein2,Protein3,Protein4,Tumour_Stage,Histology,ER status,PR status,HER2 status,Surgery_type,Patient_Status
0,36.0,1,0.080353,0.42638,0.54715,0.273680,3,1,1,1,0,1,1
1,43.0,1,-0.420320,0.57807,0.61447,-0.031505,2,2,1,1,0,2,0
2,69.0,1,0.213980,1.31140,-0.32747,-0.234260,3,1,1,1,0,3,1
3,56.0,1,0.345090,-0.21147,-0.19304,0.124270,2,1,1,1,0,1,1
4,56.0,1,0.221550,1.90680,0.52045,-0.311990,2,1,1,1,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
312,36.0,1,0.231800,0.61804,-0.55779,-0.517350,3,1,1,1,1,4,0
313,44.0,0,0.732720,1.11170,-0.26952,-0.354920,2,3,1,1,0,3,0
314,61.0,1,-0.719470,2.54850,-0.15024,0.339680,2,1,1,1,0,2,0
315,79.0,1,0.479400,2.05590,-0.53136,-0.188480,1,1,1,1,1,2,0


In [96]:
# Start of the modelling part.
# Collect the column names (the header) as a plain Python list.
header_names = list(df.columns)

In [97]:
# Feature matrix: the 12 predictor columns, in the order they appear in the table.
# The last entry was previously '13', which is not a column of the frame and
# makes the selection fail with a KeyError; the remaining predictor column is
# 'Surgery_type'.
feature_columns = [
    'Age',
    'Gender',
    'Protein1',
    'Protein2',
    'Protein3',
    'Protein4',
    'Tumour_Stage',
    'Histology',
    'ER status',
    'PR status',
    'HER2 status',
    'Surgery_type',
]
X = df[feature_columns].values

# Target: Patient_Status (encoded earlier as 1 = Alive, 0 = Dead). The double
# brackets keep it two-dimensional, i.e. one column per row of X.
y = df[['Patient_Status']].values

In [102]:
# Build the network: two hidden ReLU layers of 13 units each on top of the
# 12 input features, followed by a single sigmoid output unit.
model = Sequential([
    Dense(13, input_dim=12, activation='relu'),
    Dense(13, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [103]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [104]:
# Check the feature-matrix dimensions as (rows, columns).
X.shape

(317, 12)

In [105]:
# Train on the full feature matrix for 100 epochs with mini-batches of 16 rows.
model.fit(x=X, y=y, batch_size=16, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7f05802f5610>

In [106]:
# Score the fitted model; index 1 of the result is the accuracy metric that
# was requested in compile(). NOTE: X and y are the same arrays the model was
# fitted on, so this number is the accuracy on the training data.
accuracy = model.evaluate(X, y)
accuracy_percent = accuracy[1] * 100
print('Accuracy: %.2f' % accuracy_percent)

Accuracy: 80.44


In [None]:
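# Great, we managed to get about 80% accuracy, which is not bad for a small dataset like this one.
# Note that this accuracy was measured on the same data the model was trained on, so a held-out
# test set (e.g. via train_test_split) is still needed to estimate how well the model generalizes.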