Lung Cancer?

Problem Statement

About the Dataset

Aims and Objectives

In [None]:
# Essential Library

In [None]:
import os 
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
#Sklearn -libraries 
from sklearn.model_selection import train_test_split
#To balance datasets 
from sklearn.utils import class_weight
#Open CV
import cv2
#Tensorflow and keras Library
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.applications import VGG16
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Root folder that holds one sub-directory per diagnostic category.
_dataset_root = r'./NIDCH-Dhaka-Lungs-Cancer-Datasets/Dataset'

# Category names double as the sub-directory names on disk, so the
# "Bengin" spelling is kept exactly as it appears in the paths.
img_classes = ["Bengin cases", "Malignant cases", "Normal cases"]

# One directory per category, in the same order as img_classes.
path_list = [f"{_dataset_root}/{class_dir}" for class_dir in img_classes]
bengin_path, malignant_path, normal_path = path_list

In [None]:
# Collect every image path together with its category label.
# os.listdir() returns entries in arbitrary order, so the names are sorted
# to keep the DataFrame row order (and therefore any seeded split of it)
# reproducible across machines and runs.
img_path = []
class_labels = []
for class_name, class_dir in zip(img_classes, path_list):
    for file_name in sorted(os.listdir(class_dir)):
        img_path.append(os.path.join(class_dir, file_name))
        class_labels.append(class_name)

# One row per image: where it lives and which category it belongs to.
df = pd.DataFrame({"img_path": img_path,
                   "label": class_labels})

df.head()


In [None]:
# Summary of the path/label table (both columns hold strings).
df.describe()

In [None]:
# Show up to three randomly chosen images from every category.
n_preview = 3
for category, group in df.groupby("label"):
    fig, ax = plt.subplots(1, n_preview, figsize = (7,7))
    ax = ax.ravel()
    # Hide every axis up front so unused slots stay blank when a category
    # holds fewer than n_preview images.
    for axis in ax:
        axis.axis("off")
    # sample() raises if asked for more rows than the group contains.
    picks = group.sample(min(n_preview, len(group)))
    for i, (_, r) in enumerate(picks.iterrows()):
        # OpenCV decodes to BGR channel order while matplotlib expects RGB.
        img = cv2.cvtColor(cv2.imread(r.img_path), cv2.COLOR_BGR2RGB)
        ax[i].imshow(img)
        ax[i].set_title(r.label)
    plt.show()

#### Exploratory Data Analysis 

In [None]:
# Number of images per label.
countData = df["label"].value_counts().reset_index()
# The column names produced by value_counts().reset_index() differ between
# pandas versions ("label"/"count" in pandas >= 2.0, "index"/"label" before),
# so they are set explicitly: first column = category, second = its count.
countData.columns = ["label", "count"]

# The counts are already aggregated, so they are drawn directly as bars.
fig = px.bar(data_frame = countData, x = "label", y = "count", width=600,
    height=400,
    title="Count of Labels by Category")
fig.show()

#### Analyzing the Image properties 

In [None]:
# Per-image statistics gathered in one pass over the dataset.
sizes = []                 # file size on disk, in bytes
resolutions = []           # (height, width) of the decoded image
color_distributions = []   # 256-bin intensity histogram, all channels pooled

# A dedicated loop name is used so the img_path list built earlier in the
# notebook is not overwritten by a single path string.
for path in df["img_path"]:
    img = cv2.imread(path)
    if img is None:
        # cv2.imread returns None for files it cannot decode; passing that
        # on to cvtColor would fail with an opaque OpenCV error.
        print(f"Skipping unreadable image: {path}")
        continue
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    sizes.append(os.path.getsize(path))
    # ndarray shape is (rows, cols, channels), i.e. (height, width, ...).
    resolutions.append(img.shape[:2])
    # Count how often each value 0..255 occurs across every pixel/channel.
    color_distributions.append(np.bincount(img.flatten(), minlength=256))

sizes = np.array(sizes)
resolutions = np.array(resolutions)
color_distributions = np.array(color_distributions)

#### Distribution of Image File Sizes

In [None]:
# Image file sizes in MB (1 MB = 1,000,000 bytes).
# The previous divisor 1_00_000 is 100,000, which made every value ten
# times too large for the "MB" axis label.
BYTES_PER_MB = 1_000_000
# Only the on-disk size is needed, so the images are not decoded here.
sizes_MB = [os.path.getsize(path) / BYTES_PER_MB for path in df["img_path"]]

fig = px.histogram(x=sizes_MB, nbins = 50, title = "Distribution size of Image", width=600,
    height=400)
fig.update_layout(xaxis_title = "File Size (MB)",
                  yaxis_title = "Number of Images",
                  showlegend = False,
                  bargap = 0.1,
                  bargroupgap = 0.1)
fig.update_traces(marker = dict(color="green"))
fig.show()

#### Distribution of Image Resolutions

In [None]:
# `resolutions` rows come from img.shape[:2], i.e. (height, width):
# column 1 is the width (x axis) and column 0 the height (y axis).
fig = px.scatter(x = resolutions[:,1],
                 y = resolutions[:,0],
                 title = "Distribution of Image Resolution", height= 400, width= 600)
fig.update_layout(
    xaxis_title = "Width (Pixel)",
    yaxis_title = "Height (Pixel)",
    showlegend = False,
    hovermode = "closest"
)
fig.update_traces(marker = dict(color="red"))
fig.show()

### Mean Color Distribution

In [None]:
import plotly.graph_objects as go

# Average the per-image 256-bin histograms into one histogram.
mean_color_distributions = color_distributions.mean(axis = 0)

# Draw one bar per intensity value 0..255.
color_bars = go.Bar(
    x = np.arange(256),
    y = mean_color_distributions,
    name = "Mean Color Distributions",
)
fig = go.Figure()
fig.add_trace(color_bars)
fig.update_layout(
    title = "Mean Color Distribution",
    xaxis_title = "Color Values",
    yaxis_title = "Number of Pixel",
)
fig.show()

#### Train, Test & Validation Split

In [None]:
# Target share of the whole dataset for each split.
train_ratio = 0.70
test_ratio = 0.15
val_ratio = 0.15

# Both splits are stratified on the label so every subset keeps the class
# proportions of the full (imbalanced) dataset; an unstratified random split
# can leave a minority class under-represented in test or validation.
df_train, df_test_val = train_test_split(
    df,
    train_size = train_ratio,
    random_state = 42,
    stratify = df["label"],
)
# The remaining 30% is divided between test and validation according to
# their relative shares.
df_test, df_val = train_test_split(
    df_test_val,
    train_size = test_ratio/(test_ratio + val_ratio),
    random_state = 42,
    stratify = df_test_val["label"],
)

print(f"Train shape = {df_train.shape}")
print(f"Test shape = {df_test.shape}")
print(f"Validation shape = {df_val.shape}")

#### DeNoise image using median blur

In [None]:
def preprocessing_denoise(img, ksize=3):
    """Return a median-filtered copy of ``img``.

    Intended as the ``preprocessing_function`` of an ImageDataGenerator,
    which calls it with a single rank-3 image array and expects an array of
    the same shape back.

    Args:
        img: image array to denoise.
        ksize: aperture size of the median filter; OpenCV requires an odd
            value greater than 1. The previous hard-coded value of 1 left
            the image unchanged, so no denoising took place.

    Returns:
        The filtered image, same shape as ``img``.
    """
    # No BGR->RGB conversion here: Keras generators load images in RGB
    # order already, so swapping channels would hand the network BGR data
    # for every image this function touches.
    # NOTE(review): for float32 input OpenCV only supports ksize 3 or 5.
    denoise_img = cv2.medianBlur(img, ksize)
    return denoise_img

#### Data Augmentation using ImageDataGenerator


In [None]:
IMG_WIDTH = 256
IMG_HEIGHT = 256

# Keras expects target_size as (height, width).
image_size = (IMG_HEIGHT, IMG_WIDTH)
batch_size = 32

# Training pipeline: denoising, random augmentation and scaling to [0, 1].
TRAIN_DATAGEN = ImageDataGenerator(rescale = 1./255.,
                                   preprocessing_function = preprocessing_denoise,
                                   rotation_range = 15,
                                   width_shift_range = 0.1,
                                   height_shift_range = 0.1,
                                   shear_range = 0.1,
                                   zoom_range = 0.2,
                                   horizontal_flip = True,
                                   )

# Evaluation pipeline: no random augmentation, but the same deterministic
# preprocessing as training so all splits see identically prepared pixels.
TEST_DATAGEN = ImageDataGenerator(rescale = 1./255.,
                                  preprocessing_function = preprocessing_denoise)

# Arguments shared by all three generators. The keyword is `target_size`;
# the earlier misspelling `traget_size` was not honoured, so images were
# only resized correctly because 256x256 happens to be the Keras default.
flow_kwargs = dict(
    x_col = "img_path",
    y_col = "label",
    target_size = image_size,
    batch_size = batch_size,
    color_mode = 'rgb',
    class_mode = "categorical",
)

# Training images: shuffled every epoch.
train_generator = TRAIN_DATAGEN.flow_from_dataframe(
    df_train,
    shuffle = True,
    **flow_kwargs
)

# Test images: fixed order so predictions can be matched against
# test_generator.classes.
test_generator = TEST_DATAGEN.flow_from_dataframe(
    df_test,
    shuffle = False,
    **flow_kwargs
)

# Validation images: fixed order, nothing is learned from them.
val_generator = TEST_DATAGEN.flow_from_dataframe(
    df_val,
    shuffle = False,
    **flow_kwargs
)


#### Class weight 

In [None]:
# The dataset is highly imbalanced, so "balanced" class weights are computed
# from the training labels and later handed to model.fit().

# Integer class index of every training image.
train_labels = train_generator.classes

class_weights = class_weight.compute_class_weight(
    class_weight = 'balanced',
    classes = np.unique(train_labels),
    y = train_labels,
)
# Map position in the weight array -> weight.
train_class_weights = {idx: weight for idx, weight in enumerate(class_weights)}

# Category names in the generator's class_indices order.
classes = list(train_generator.class_indices)

# Report the weight assigned to each category.
for idx, weight in train_class_weights.items():
    print(f"{classes[idx]} : {weight}")

#### Model Implementation: 2D-CNN Model

In [None]:

# Custom CNN: six Conv -> MaxPool -> BatchNorm blocks followed by global
# average pooling and a small dense classifier with a 3-way softmax.

# Number of convolution filters in each of the six blocks, in order.
_block_filters = [64, 64, 128, 128, 256, 256]

_cnn_layers = []
for _position, _filters in enumerate(_block_filters):
    # Only the very first layer declares the input shape.
    _extra = {"input_shape": (256, 256, 3)} if _position == 0 else {}
    _cnn_layers.append(
        Conv2D(_filters, (3,3), activation='relu', padding='same', **_extra)
    )
    _cnn_layers.append(MaxPooling2D(pool_size=(2,2)))
    _cnn_layers.append(BatchNormalization())

# Collapse each feature map to one value per channel.
_cnn_layers.append(GlobalAveragePooling2D())

# Fully connected head.
_cnn_layers.append(Dense(256, activation='relu'))
_cnn_layers.append(Dropout(0.25))
_cnn_layers.append(Dense(3, activation='softmax'))  # one output per class

model_2D = Sequential(_cnn_layers)

model_2D.summary()


In [None]:
# Compile the custom CNN for 3-class, one-hot classification.
model_2D.compile(
    loss = "categorical_crossentropy",
    optimizer = Adam(learning_rate = 0.0003),
    metrics = ["accuracy"],
)
epochs = 50
# batch_size is deliberately not passed to fit(): the generators already
# emit batches, and Keras documents that batch_size must not be specified
# when the input is a generator / Sequence.
history = model_2D.fit(train_generator,
                   steps_per_epoch = len(train_generator),
                   validation_data = val_generator,
                   validation_steps = len(val_generator),
                   # compensate for the class imbalance
                   class_weight = train_class_weights,
                   callbacks=[
                               # stop after 5 epochs without val_loss
                               # improvement and roll back to the best weights
                               EarlyStopping(monitor = "val_loss",
                                             patience = 5,
                                             restore_best_weights = True),
                               # scale the learning rate by 0.2 after 4
                               # stagnant epochs
                               ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, mode='min')
                              ],
                   epochs = epochs)

##### Training & Validation Accuracy and Loss

In [None]:
def history_plot(epochs, history):
    """Plot training vs. validation accuracy and loss, one figure each.

    Args:
        epochs: number of epochs that was requested for training. Kept for
            backward compatibility with existing calls; the x axis is
            derived from the recorded history instead, because
            EarlyStopping can end training before ``epochs`` is reached and
            the x values would then outnumber the recorded metrics.
        history: object with a ``history`` dict holding the "accuracy",
            "val_accuracy", "loss" and "val_loss" series (as returned by
            ``model.fit``).
    """
    recorded = history.history
    # One x value per epoch that actually ran.
    epoch_axis = np.arange(1, len(recorded["loss"]) + 1)

    for key, label in (("accuracy", "Accuracy"), ("loss", "Loss")):
        fig = make_subplots()
        fig.add_trace(go.Scatter(x = epoch_axis, y = recorded[key], name = f"Training {label}"))
        fig.add_trace(go.Scatter(x = epoch_axis, y = recorded[f"val_{key}"], name = f"Validation {label}"))
        fig.update_layout(title = f"Training and Validation {label}", xaxis_title = "Epoch", yaxis_title = label)
        fig.show()

history_plot(epochs, history)

In [None]:
def evaluate_model(model, test_generator):
    """Print the loss and accuracy of ``model`` on ``test_generator``.

    ``model.evaluate`` is run silently (``verbose=0``); the first entry of
    its result is reported as the test loss and the second as the test
    accuracy.
    """
    metrics = model.evaluate(test_generator, verbose = 0)
    for label, position in (("Loss", 0), ("Accuracy", 1)):
        print(f"Test {label} = {metrics[position]}")
# Print test-set loss and accuracy for the custom CNN.
evaluate_model(model_2D, test_generator)

#### Transfer learning for improving accuracy (fine-tuning VGG16 and InceptionV3)

In [None]:
# InceptionV3 backbone with ImageNet weights and without its classifier top.
# NOTE(review): the ImageNet InceptionV3 weights expect inputs scaled to
# [-1, 1] (inception_v3.preprocess_input) while the generators feed [0, 1];
# consider aligning the two.
base_model = InceptionV3(input_shape = (256,256,3), include_top = False, weights = "imagenet")
# Freeze every backbone layer so only the new head is trained at first.
for layer in base_model.layers:
    layer.trainable = False

# Custom classification head on top of the frozen backbone.
model_IV3 = Sequential()
model_IV3.add(Input(shape = (256,256, 3)))
model_IV3.add(base_model)
model_IV3.add(GlobalAveragePooling2D())
model_IV3.add(Dense(1024, activation = "relu"))
model_IV3.add(Dropout(0.4))
model_IV3.add(Dense(3, activation = "softmax"))

# Compile and train the model.
epochs = 50
model_IV3.compile(optimizer = Adam(0.0002),
             loss = "categorical_crossentropy",
             metrics = ["accuracy"])
# batch_size is not passed to fit(): the generators already emit batches and
# Keras documents that batch_size must not be specified for generator /
# Sequence input (the former batch_size=64 also disagreed with the
# generators' own batch size).
history = model_IV3.fit(train_generator,
                   steps_per_epoch = len(train_generator),
                   validation_data = val_generator,
                   validation_steps = len(val_generator),
                   class_weight = train_class_weights,
                   callbacks=[
                               # stop after 5 epochs without val_loss
                               # improvement and roll back to the best weights
                               EarlyStopping(monitor = "val_loss",
                                             patience = 5,
                                             restore_best_weights = True),
                               ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, mode='min')
                              ],
                   epochs = epochs)



In [None]:
# Plot the training/validation accuracy and loss curves stored in `history`.
history_plot(epochs, history)

In [None]:
# Print test-set loss and accuracy for the InceptionV3-based model.
evaluate_model(model_IV3, test_generator)

### Fine-tune InceptionV3 Model

In [None]:
# Unfreeze the InceptionV3 backbone for fine-tuning.
base_model.trainable = True
# Keep the BatchNormalization layers frozen: updating their statistics and
# scale/offset on a small dataset tends to destroy what the pretrained
# backbone has learned (see the Keras transfer-learning guide).
for layer in base_model.layers:
    if isinstance(layer, BatchNormalization):
        layer.trainable = False

# Re-compile so the changed trainable flags take effect, using a lower
# learning rate than for the head-only training.
model_IV3.compile(optimizer = Adam(0.0001),
              loss = "categorical_crossentropy",
              metrics = ["accuracy"])

# Train the model.
epochs = 50
# batch_size is not passed to fit(): the generators already emit batches and
# Keras documents that batch_size must not be specified for generator /
# Sequence input.
history = model_IV3.fit(train_generator,
                   steps_per_epoch = len(train_generator),
                   validation_data = val_generator,
                   validation_steps = len(val_generator),
                   class_weight = train_class_weights,
                   callbacks=[
                               # stop after 5 epochs without val_loss
                               # improvement and roll back to the best weights
                               EarlyStopping(monitor = "val_loss",
                                             patience = 5,
                                             restore_best_weights = True),
                               ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, mode='min')
                              ],
                   epochs = epochs)

In [None]:
# Plot the training/validation accuracy and loss curves stored in `history`.
history_plot(epochs, history)

In [None]:
# Print test-set loss and accuracy for model_IV3 (this cell follows the fine-tuning cell).
evaluate_model(model_IV3, test_generator)

### Fine-tune VGG16 Model

In [None]:
# VGG16 backbone with ImageNet weights and without its classifier top.
base_model_vgg16 = VGG16(input_shape = (256,256, 3), include_top = False, weights = "imagenet")

# Freeze every VGG16 layer so only the new head is trained at first.
for layer in base_model_vgg16.layers:
    layer.trainable = False

# Custom classification head. The backbone added here must be
# base_model_vgg16: the previous code added `base_model`, i.e. the
# InceptionV3 backbone, so the "VGG16" model never contained VGG16 at all.
model_VGG16 = Sequential()
model_VGG16.add(Input(shape = (256, 256, 3)))
model_VGG16.add(base_model_vgg16)
model_VGG16.add(GlobalAveragePooling2D())
model_VGG16.add(Dense(1024, activation = "relu"))
model_VGG16.add(Dropout(0.4))
model_VGG16.add(Dense(3, activation = "softmax"))

# Compile the model.
model_VGG16.compile(optimizer = Adam(0.0002),
             loss = "categorical_crossentropy",
             metrics = ["accuracy"])

# Train the model.
epochs = 50
# batch_size is not passed to fit(): the generators already emit batches and
# Keras documents that batch_size must not be specified for generator /
# Sequence input.
history = model_VGG16.fit(train_generator,
                   steps_per_epoch = len(train_generator),
                   validation_data = val_generator,
                   validation_steps = len(val_generator),
                   class_weight = train_class_weights,
                   callbacks=[
                               # stop after 5 epochs without val_loss
                               # improvement and roll back to the best weights
                               EarlyStopping(monitor = "val_loss",
                                             patience = 5,
                                             restore_best_weights = True),
                               ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, mode='min')
                              ],
                   epochs = epochs)

In [None]:
# Plot the training/validation accuracy and loss curves stored in `history`.
history_plot(epochs, history)

In [None]:
# Print test-set loss and accuracy for model_VGG16.
evaluate_model(model_VGG16 ,test_generator)

In [None]:
# Fine-tune only the last VGG16 convolution block: every layer from
# 'block5_conv1' onward becomes trainable, all earlier layers stay frozen.
# NOTE(review): this only influences model_VGG16 if base_model_vgg16 is the
# backbone that was added to it -- confirm in the cell that builds the model.
base_model_vgg16.trainable = True
set_trainable = False
for layer in base_model_vgg16.layers:
    if layer.name == 'block5_conv1':
        set_trainable = True
    layer.trainable = set_trainable

# Re-compile so the changed trainable flags take effect, using a lower
# learning rate than for the head-only training.
model_VGG16.compile(optimizer = Adam(0.0001),
              loss = "categorical_crossentropy",
              metrics = ["accuracy"])

# Train the model.
epochs = 50
# batch_size is not passed to fit(): the generators already emit batches and
# Keras documents that batch_size must not be specified for generator /
# Sequence input.
history = model_VGG16.fit(train_generator,
                   steps_per_epoch = len(train_generator),
                   validation_data = val_generator,
                   validation_steps = len(val_generator),
                   class_weight = train_class_weights,
                   callbacks=[
                               # stop after 5 epochs without val_loss
                               # improvement and roll back to the best weights
                               EarlyStopping(monitor = "val_loss",
                                             patience = 5,
                                             restore_best_weights = True),
                               ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, mode='min')
                              ],
                   epochs = epochs)

In [None]:
# Plot the training/validation accuracy and loss curves stored in `history`.
history_plot(epochs,history)

In [None]:
# Print test-set loss and accuracy for model_VGG16 (this cell follows the fine-tuning cell).
evaluate_model(model_VGG16, test_generator)

In [None]:
# Side-by-side comparison of the three trained models on the test set.
name_models = ["CNN (Custom)", "InceptionV3", "VGG16"]
models = [model_2D, model_IV3, model_VGG16]

# evaluate() returns the loss first and the accuracy second.
scores = [candidate.evaluate(test_generator, verbose = 0) for candidate in models]
loss = [score[0] for score in scores]
accuracy = [score[1] for score in scores]

# One row per model.
df_loss_acc = pd.DataFrame(
    {
        "Name_Models" : name_models,
        "Loss" : loss,
        "Accuracy" : accuracy,
    }
)

# Grouped bars: accuracy and loss next to each other for every model.
fig = px.bar(
    df_loss_acc,
    x = "Name_Models",
    y = ["Accuracy", "Loss"],
    barmode = "group",
    text_auto = ".3f",
)
fig.show()