In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras
from keras.models import Sequential
from keras.layers import Dense

In [20]:
data = pd.read_csv('lungcancer.csv')
data.sample(5)

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
134,M,71,2,2,2,1,2,1,2,2,2,2,1,2,2,YES
68,F,76,2,2,2,2,1,2,2,1,1,1,2,2,2,YES
188,M,65,2,2,2,2,2,1,1,1,1,1,1,1,1,YES
179,F,72,2,2,2,2,1,2,1,2,1,2,2,2,1,YES
123,F,66,2,2,2,1,2,2,2,2,2,2,2,1,1,YES


In [21]:
data.shape

(309, 16)

In [22]:
data.isna().sum()

GENDER                   0
AGE                      0
SMOKING                  0
YELLOW_FINGERS           0
ANXIETY                  0
PEER_PRESSURE            0
CHRONIC DISEASE          0
FATIGUE                  0
ALLERGY                  0
WHEEZING                 0
ALCOHOL CONSUMING        0
COUGHING                 0
SHORTNESS OF BREATH      0
SWALLOWING DIFFICULTY    0
CHEST PAIN               0
LUNG_CANCER              0
dtype: int64

In [26]:
data = data.replace(to_replace='YES', value=1)
data = data.replace(to_replace='NO', value=0)
target = data['LUNG_CANCER']
target

0      1
1      1
2      0
3      0
4      0
      ..
304    1
305    1
306    1
307    1
308    1
Name: LUNG_CANCER, Length: 309, dtype: int64

In [27]:
gender = pd.get_dummies(data['GENDER'])
gender.head()

Unnamed: 0,F,M
0,0,1
1,0,1
2,1,0
3,0,1
4,1,0


In [28]:
features = data.drop(['GENDER','LUNG_CANCER'], axis=1)
features.sample(5)

Unnamed: 0,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN
43,52,2,1,1,2,1,2,2,2,2,1,2,1,2
239,75,1,2,2,2,1,1,2,2,1,2,1,2,1
129,63,1,1,1,1,2,2,1,1,1,1,2,1,1
187,55,2,1,1,1,1,2,1,1,1,1,1,1,1
134,71,2,2,2,1,2,1,2,2,2,2,1,2,2


In [29]:
sScaler = StandardScaler()
features = pd.DataFrame(sScaler.fit_transform(features), columns=features.columns, index=features.index)
features.sample(5)

Unnamed: 0,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN
57,1.259833,-1.135292,-1.150351,-0.996769,-1.003241,0.990338,-1.435063,0.892475,-1.12048,0.892475,0.852207,0.748736,1.063501,0.892475
273,-0.448107,-1.135292,0.8693,1.003241,0.996769,0.990338,0.696833,0.892475,0.892475,-1.12048,0.852207,0.748736,1.063501,-1.12048
272,0.161871,0.88083,-1.150351,-0.996769,-1.003241,-1.009756,-1.435063,0.892475,0.892475,0.892475,0.852207,-1.335584,-0.940291,0.892475
232,-0.08212,-1.135292,0.8693,-0.996769,0.996769,0.990338,0.696833,0.892475,0.892475,0.892475,0.852207,-1.335584,1.063501,0.892475
278,1.747816,0.88083,0.8693,1.003241,0.996769,0.990338,0.696833,-1.12048,-1.12048,-1.12048,-1.173424,0.748736,1.063501,-1.12048


In [30]:
processed_features = pd.concat([features, gender], axis=1, sort=False)
processed_features.sample(5)

Unnamed: 0,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,F,M
213,0.161871,0.88083,0.8693,1.003241,0.996769,0.990338,-1.435063,-1.12048,-1.12048,0.892475,-1.173424,-1.335584,1.063501,0.892475,0,1
68,1.625821,0.88083,0.8693,1.003241,0.996769,-1.009756,0.696833,0.892475,-1.12048,-1.12048,-1.173424,0.748736,1.063501,0.892475,1,0
17,-0.692099,0.88083,0.8693,1.003241,0.996769,0.990338,-1.435063,-1.12048,-1.12048,0.892475,-1.173424,-1.335584,1.063501,0.892475,0,1
288,-0.204116,-1.135292,0.8693,1.003241,0.996769,-1.009756,-1.435063,0.892475,0.892475,-1.12048,0.852207,-1.335584,1.063501,-1.12048,1,0
258,0.893846,0.88083,-1.150351,1.003241,-1.003241,-1.009756,0.696833,0.892475,-1.12048,0.892475,0.852207,0.748736,-0.940291,0.892475,0,1


In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(processed_features, target, test_size=0.30, random_state=42)
# one hot encode outputs
y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(216, 16)
(93, 16)
(216, 2)
(93, 2)


In [32]:
model = Sequential()
model.add(Dense(16, activation='relu', input_dim=16))
model.add(Dense(8, activation='relu'))
# model.add(Dense(8, activation='relu'))
model.add(Dense(2, activation='softmax'))

In [33]:
# Compile the model
model.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [34]:
# build the model
model.fit(X_train, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f807715f0d0>

In [35]:
pred_train= model.predict(X_train)
scores = model.evaluate(X_train, y_train, verbose=0)
print('Accuracy on training data: {}% \n Error on training data: {}'.format(scores[1], 1 - scores[1]))   

pred_test= model.predict(X_test)
scores2 = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy on test data: {}% \n Error on test data: {}'.format(scores2[1], 1 - scores2[1])) 

Accuracy on training data: 0.9768518805503845% 
 Error on training data: 0.02314811944961548
Accuracy on test data: 0.9569892287254333% 
 Error on test data: 0.04301077127456665
