# ANN Assignment - Lung Cancer

In [None]:
# load libraries
import pandas as pd
import numpy as np
import tensorflow as tf

In [None]:
# load the survey data; the path is relative to the current working directory
df = pd.read_csv('./survey lung cancer.csv')
# preview the first few rows
df.head()

In [None]:
# number of rows and columns, returned as a (rows, columns) tuple
df.shape

In [None]:
# data type of each column - shows which columns are non-numeric
df.dtypes

In [None]:
# summary of the data: column names, non-null counts, data types and memory usage
df.info()

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# summary statistics (count, unique, top, freq) of the object-dtype columns
df.describe(include=['O'])

In [None]:
# count of missing (null/NaN) values in each column
df.isna().sum()

In [None]:
# shape of the sub-frame of duplicate rows; its first element is the number of
# rows that repeat an earlier row
df[df.duplicated()].shape

In [None]:
# boolean Series: True for every row that is a repeat of an earlier row
df.duplicated()

In [None]:
# remove duplicate rows from the DataFrame, keeping the first occurrence of each;
# the original index labels are kept, so the index may contain gaps afterwards
df = df.drop_duplicates()

In [None]:
# list the column labels
df.columns

In [None]:
# class distribution of the target column
df['LUNG_CANCER'].value_counts()

In [None]:
# sklearn's preprocessing module bundles helpers for transforming raw data
# before it is fed to a machine learning model
from sklearn import preprocessing

# LabelEncoder maps each distinct value of a column to an integer code.
# The same encoder object is refit for each column in turn, so once the loop
# finishes it only remembers the classes of 'LUNG_CANCER'.
label_encoder = preprocessing.LabelEncoder()
for column in ('GENDER', 'LUNG_CANCER'):
    df[column] = label_encoder.fit_transform(df[column])

# sanity check: value counts of both encoded columns
df['GENDER'].value_counts(), df['LUNG_CANCER'].value_counts()

In [None]:
# separate the target vector from the feature matrix
y = df['LUNG_CANCER']
X = df.drop(columns=['LUNG_CANCER'])
X.shape, y.shape

### As shown above, the dataset has 238 rows labelled "YES" and only 38 rows labelled "NO". A model trained on such imbalanced data is likely to predict the minority "NO" class poorly, so we upsample the dataset using SMOTE.

In [None]:
from imblearn.over_sampling import SMOTE

# Define the SMOTE oversampler; the fixed random_state makes the synthetic
# samples reproducible between runs
smote = SMOTE(random_state=42)

In [None]:
# Upsample the data: synthesise extra minority-class rows to balance the classes.
# NOTE(review): this runs on the full dataset, before any train/validation split,
# so validation rows used later may be synthetic or derived from training rows -
# consider splitting first and resampling only the training part.
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
# class counts after oversampling
y_resampled.value_counts()

In [None]:
# shapes of the resampled feature matrix and target vector
X_resampled.shape, y_resampled.shape

In [None]:
from keras.models import Sequential
from keras.layers import Dense

from keras.initializers import GlorotUniform

In [None]:
# fix random seed for reproducibility (seeds both the NumPy and TensorFlow RNGs)
seed = 42
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# create model: a small fully connected binary classifier with two ReLU hidden
# layers (18 and 12 units) and a single sigmoid output unit
# derive the input width from the data instead of hard-coding the column count
n_features = X_resampled.shape[1]
model = Sequential()
model.add(Dense(18, input_dim=n_features, activation='relu', kernel_initializer=GlorotUniform(seed=seed)))
model.add(Dense(12, activation='relu', kernel_initializer=GlorotUniform(seed=seed)))
# the sigmoid output is the predicted probability of the class encoded as 1
model.add(Dense(1, activation='sigmoid', kernel_initializer=GlorotUniform(seed=seed)))

In [None]:
# compile the model: binary cross-entropy loss for the single sigmoid output,
# Adam optimizer, and accuracy tracked as the reported metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# fit the model
# Keras' `validation_split` holds out the *last* fraction of the rows without
# shuffling, and SMOTE appends its synthetic minority-class rows after the
# original data. Using it here would validate almost only on synthetic rows of
# one class while training on a still-imbalanced slice. Instead, make an
# explicit shuffled split that keeps the class ratio equal in both parts.
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.30,
    stratify=y_resampled,
    random_state=seed,
)

# the split above already shuffled the rows; shuffle=False keeps the batch
# order fixed so that runs stay reproducible
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=70,
    batch_size=22,
    shuffle=False,
)

In [None]:
# evaluate the model
# NOTE(review): this scores the model on the same resampled data it was fitted
# on, so the figure is not an estimate of performance on unseen data.
scores = model.evaluate(X_resampled, y_resampled)
# evaluate() returns [loss, accuracy] in the order given to compile(); label the
# metric explicitly because `model.metrics_names` differs between Keras versions
loss, accuracy = scores
print(f"accuracy: {accuracy * 100:.2f}%")

In [None]:
# Visualize training history

# list the metric names recorded per epoch in the History object returned by fit()
history.history.keys()

In [None]:
# summarize history for accuracy and loss: one figure per metric, each showing
# the training curve and the validation curve across epochs
import matplotlib.pyplot as plt

for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history[f'val_{metric}'])
    plt.title(f'model {metric}')
    plt.ylabel(metric)
    plt.xlabel('epoch')
    # the second curve comes from the validation data, not from a test set
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()