# Lung Cancer Image Classification

## About the Dataset

### Lung Cancer Image Dataset: A Comprehensive Collection

Explore the intricacies of lung cancer with our curated dataset, consisting of high-resolution CT scan images. This dataset is designed to aid researchers, clinicians, and machine learning and deep learning enthusiasts in studying the diverse manifestations of lung cancer.

### Key Features

#### CT Scan Images:
Our dataset comprises CT scan images, providing detailed insights into lung cancer morphology. Each image is a visual representation of the complex nature of lung tumors.

#### Split for Comprehensive Analysis:
- **Training Set (613 Images)**: A robust training set containing 613 images meticulously labeled into four distinct classes, allowing for in-depth model training and understanding.
- **Testing Set (315 Images)**: Evaluate the model's performance on a diverse range of 315 images, each belonging to one of the four well-defined lung cancer classes.
- **Validation Set (72 Images)**: A curated validation set of 72 images, essential for fine-tuning models and ensuring generalizability.

### Classes:
- **Class 1**: Adenocarcinoma
- **Class 2**: Large Cell Carcinoma
- **Class 3**: Normal
- **Class 4**: Squamous Cell Carcinoma

Source: https://www.kaggle.com/datasets/kabil007/lungcancer4types-imagedataset

In [None]:
# Import Libraries
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import seaborn as sns
import plotly.express as px
import cv2 
import warnings

import tensorflow as tf
from tensorflow.keras.regularizers import l2
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, BatchNormalization,Dense, MaxPool2D, MaxPooling2D, Flatten,GlobalMaxPooling2D, Input, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Model, Sequential, Model, load_model
from tensorflow.keras.applications import ResNet50, ResNet101, ResNet152, VGG16, VGG19, EfficientNetB0
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

warnings.filterwarnings("ignore")

In [None]:
# Shared loader parameters
input_size = (224, 224)  # Note: pre-trained models were trained on images of this size
batch_size = 32

# Options that are identical for the test, train, and validation iterators.
# NOTE(review): the generators are created with no arguments, so no rescaling or
# model-specific preprocess_input appears to be applied - confirm this is intended.
_flow_options = dict(
    batch_size=batch_size,
    target_size=input_size,
    class_mode="categorical",
)

# Test split: kept in directory order (shuffle=False)
test_data = ImageDataGenerator().flow_from_directory(
    "./archive/Data/test", shuffle=False, **_flow_options
)

# Class names in the order of the iterator's class_indices mapping
class_names = list(test_data.class_indices.keys())

# Training split: the only split that is shuffled
train_data = ImageDataGenerator().flow_from_directory(
    './archive/Data/train', shuffle=True, **_flow_options
)

# Validation split: kept in directory order (shuffle=False)
valid_data = ImageDataGenerator().flow_from_directory(
    "./archive/Data/valid", shuffle=False, **_flow_options
)

## Let's take a look at images from each of the classes

In [None]:
# Class labels, in the order of the test iterator's class_indices mapping
class_labels = [label for label in test_data.class_indices]

# Function to load up to `num_images` images from each class
def load_images_per_class(data_gen, class_labels, num_images=4):
    """Collect up to ``num_images`` sample images for every class.

    Batches are pulled from ``data_gen`` with ``next()`` until every class has
    ``num_images`` samples. If ``data_gen`` reports a length, at most that many
    batches are read, so a class with too few images can no longer cause an
    endless loop on an iterator that cycles. An exhausted finite iterator also
    ends the search instead of raising ``StopIteration``.

    Args:
        data_gen: iterator yielding ``(image_batch, one_hot_label_batch)`` pairs.
        class_labels: class names, indexed by the argmax of each label row.
        num_images: maximum number of images to keep per class.

    Returns:
        dict mapping each class label to a list of at most ``num_images``
        images (possibly fewer when the data does not contain enough).
    """
    images = {label: [] for label in class_labels}

    # One full pass is enough to see every sample; None means "length unknown".
    try:
        max_batches = len(data_gen)
    except TypeError:
        max_batches = None

    batches_seen = 0
    while any(len(samples) < num_images for samples in images.values()):
        if max_batches is not None and batches_seen >= max_batches:
            break  # whole dataset scanned; some classes are simply short
        try:
            img_batch, label_batch = next(data_gen)
        except StopIteration:
            break  # finite iterator ran out of batches
        batches_seen += 1

        for img, label in zip(img_batch, label_batch):
            class_label = class_labels[np.argmax(label)]
            if len(images[class_label]) < num_images:
                images[class_label].append(img)
    return images

# Grab four example images for every class from the test iterator
images_dict = load_images_per_class(test_data, class_labels, num_images=4)

# One grid row per class, four columns; titles read "<class> <1..4>"
num_classes = len(class_labels)
panel_titles = [f"{label} {n}" for label in class_labels for n in range(1, 5)]
fig = make_subplots(rows=num_classes, cols=4, subplot_titles=panel_titles)

# Place each image in its (class row, sample column) cell; plotly cells are 1-based
for row, label in enumerate(class_labels, start=1):
    for col, picture in enumerate(images_dict[label], start=1):
        # Cast to uint8 before handing the pixel array to go.Image
        fig.add_trace(go.Image(z=picture.astype(np.uint8)), row=row, col=col)

# Size the figure to the number of class rows and render it
fig.update_layout(
    title_text="Sample Images from Each Class",
    height=300*num_classes,
    width=1200,
)
fig.show()

In [None]:
# Plot per-epoch training/validation accuracy and loss, marking the best validation epoch on each panel
def plot_accuracy(history):
    """Draw accuracy and loss curves side by side for one training run.

    Args:
        history: object whose ``history`` attribute is a dict holding the
            per-epoch lists 'accuracy', 'val_accuracy', 'loss' and 'val_loss'
            (e.g. the value returned by ``model.fit``).
    """
    metrics = history.history
    train_acc = metrics['accuracy']
    val_acc = metrics['val_accuracy']
    train_loss = metrics['loss']
    val_loss = metrics['val_loss']
    epoch_axis = range(1, len(train_acc) + 1)

    # Epochs are reported 1-based; ties resolve to the earliest epoch.
    top_val_acc = max(val_acc)
    top_val_acc_epoch = 1 + val_acc.index(top_val_acc)
    min_val_loss = min(val_loss)
    min_val_loss_epoch = 1 + val_loss.index(min_val_loss)

    def draw_panel(slot, metric, train_curve, val_curve, mark_epoch, mark_value, mark_label):
        # One subplot: both curves plus a red marker and value at the best epoch.
        plt.subplot(1, 2, slot)
        plt.plot(epoch_axis, train_curve, label=f'Training {metric}')
        plt.plot(epoch_axis, val_curve, label=f'Validation {metric}')
        plt.scatter(mark_epoch, mark_value, color='red', label=mark_label)
        plt.text(mark_epoch, mark_value, f'{mark_value:.2f}', color='red', ha='right')
        plt.xlabel('Epochs')
        plt.ylabel(metric)
        plt.title(f'Training and Validation {metric}')
        plt.legend()

    plt.figure(figsize=(12, 6))

    draw_panel(
        1, 'Accuracy', train_acc, val_acc,
        top_val_acc_epoch, top_val_acc,
        f'Best Val Accuracy (Epoch {top_val_acc_epoch})',
    )
    draw_panel(
        2, 'Loss', train_loss, val_loss,
        min_val_loss_epoch, min_val_loss,
        f'Lowest Val Loss (Epoch {min_val_loss_epoch})',
    )

    plt.tight_layout()
    plt.show()

In [None]:
# Class names in the order of the test iterator's class_indices mapping
class_names = list(test_data.class_indices)

def plot_confusion_matrix_and_report(model, test_data, class_names, checkpoint_path):
    """Score a checkpointed model on the test data and visualise its errors.

    Loads the weights stored at ``checkpoint_path`` into ``model`` (the model
    is modified in place), evaluates it, prints a classification report and
    shows a confusion-matrix heatmap.

    Args:
        model: compiled model whose ``evaluate`` returns ``(loss, accuracy)``.
        test_data: data iterator exposing the true labels as ``.classes``.
            NOTE(review): predictions are compared with ``.classes`` by
            position, which presumably requires an unshuffled iterator - confirm.
        class_names: display names used for the report and the axis ticks.
        checkpoint_path: path of the weights file to load.

    Returns:
        Tuple ``(test_accuracy, test_loss)``.
    """
    # Use the checkpointed weights rather than whatever the model currently holds
    model.load_weights(checkpoint_path)

    test_loss, test_accuracy = model.evaluate(test_data)

    # Highest-scoring class per sample versus the iterator's true labels
    predicted = model.predict(test_data).argmax(axis=-1)
    actual = test_data.classes

    print(
        "Classification Report:\n",
        classification_report(actual, predicted, target_names=class_names),
    )

    matrix = confusion_matrix(actual, predicted)

    # Heatmap: rows are true classes, columns are predicted classes
    plt.figure(figsize=(10, 8))
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Purples',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.xticks(rotation=45)  # tilt the x tick labels by 45 degrees

    plt.show()

    return test_accuracy, test_loss

# ResNet50, 101, 152
Let's try the different ResNet models, which increase in complexity. We will use the evaluations of these models to decide which additional pre-trained models to try on our image sets. For example, if ResNet50 outperforms ResNet152, ResNet152 may be too complex for this dataset, resulting in overfitting.

In [None]:
# One learning rate shared by all three ResNet runs
learning_rate = 0.0001  # Note: we will attempt to finetune the learning rate later in the notebook

# Metric watched by both early stopping and the checkpoints
monitor = "val_loss"

# Stop after 10 epochs without improvement and roll back to the best epoch's weights
early_stop = EarlyStopping(
    restore_best_weights=True,
    patience=10,
    monitor=monitor,
)

# Checkpoints protect against a crash or an interrupted run, and also keep the
# weights of the best-performing epoch so they can be reloaded for evaluation.
_checkpoint_options = dict(
    monitor=monitor,
    save_best_only=True,
    save_weights_only=True,
)

checkpoint_resnet50 = ModelCheckpoint('resnet50_best.weights.h5', **_checkpoint_options)
checkpoint_resnet101 = ModelCheckpoint('resnet101_best.weights.h5', **_checkpoint_options)
checkpoint_resnet152 = ModelCheckpoint('resnet152_best.weights.h5', **_checkpoint_options)

## ResNet50

In [None]:
# Frozen ResNet50 backbone (no top classifier) used as a fixed feature extractor
resnet50_base = ResNet50(include_top=False, input_shape=(224, 224, 3))
resnet50_base.trainable = False

# Trainable classification head ending in a 4-way softmax
resnet50_model = Sequential([
    resnet50_base,
    BatchNormalization(),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax')
])
resnet50_model.compile(optimizer=Adam(learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

# The data iterators already yield batches, so `batch_size` is not passed to fit()
# (Keras documents that it must not be specified for generator-style inputs).
history_resnet50 = resnet50_model.fit(
    train_data,
    validation_data=valid_data,
    epochs=100,
    callbacks=[early_stop, checkpoint_resnet50],
    verbose=2
)

In [None]:
# Training curves for the ResNet50 run
plot_accuracy(history=history_resnet50)

In [None]:
# Reload the best ResNet50 checkpoint and score it on the test data
resnet50_acc, resnet50_loss = plot_confusion_matrix_and_report(
    model=resnet50_model,
    test_data=test_data,
    class_names=class_names,
    checkpoint_path='./resnet50_best.weights.h5',
)

## ResNet101

In [None]:
# Frozen ResNet101 backbone (no top classifier) used as a fixed feature extractor
resnet101_base = ResNet101(include_top=False, input_shape=(224, 224, 3))
resnet101_base.trainable = False

# Trainable classification head ending in a 4-way softmax
resnet101_model = Sequential([
    resnet101_base,
    BatchNormalization(),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax')
])
resnet101_model.compile(optimizer=Adam(learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

# The data iterators already yield batches, so `batch_size` is not passed to fit()
# (Keras documents that it must not be specified for generator-style inputs).
history_resnet101 = resnet101_model.fit(
    train_data,
    validation_data=valid_data,
    epochs=100,
    callbacks=[early_stop, checkpoint_resnet101],
    verbose=2
)

In [None]:
# Training curves for the ResNet101 run
plot_accuracy(history=history_resnet101)

In [None]:
# Reload the best ResNet101 checkpoint and score it on the test data
resnet101_acc, resnet101_loss = plot_confusion_matrix_and_report(
    model=resnet101_model,
    test_data=test_data,
    class_names=class_names,
    checkpoint_path='./resnet101_best.weights.h5',
)

## ResNet152

In [None]:
# Frozen ResNet152 backbone (no top classifier) used as a fixed feature extractor
resnet152_base = ResNet152(include_top=False, input_shape=(224, 224, 3))
resnet152_base.trainable = False

# Trainable classification head ending in a 4-way softmax
resnet152_model = Sequential([
    resnet152_base,
    BatchNormalization(),
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax')
])
resnet152_model.compile(optimizer=Adam(learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

# The data iterators already yield batches, so `batch_size` is not passed to fit()
# (Keras documents that it must not be specified for generator-style inputs).
history_resnet152 = resnet152_model.fit(
    train_data,
    validation_data=valid_data,
    epochs=100,
    callbacks=[early_stop, checkpoint_resnet152],
    verbose=2
)

In [None]:
# Training curves for the ResNet152 run
plot_accuracy(history=history_resnet152)

In [None]:
# Reload the best ResNet152 checkpoint and score it on the test data
resnet152_acc, resnet152_loss = plot_confusion_matrix_and_report(
    model=resnet152_model,
    test_data=test_data,
    class_names=class_names,
    checkpoint_path='./resnet152_best.weights.h5',
)

## Let's see which ResNet model performed the best

In [None]:
def plot_comparison(model_names, accuracies, losses):
    """Compare models on one chart: accuracy as bars, loss as a line.

    Accuracy uses the left y-axis (fixed to 0-1) and loss uses a secondary
    right y-axis scaled to 1.2x the largest loss. Every bar and every point
    is annotated with its value.

    Args:
        model_names: labels for the x-axis, one per model.
        accuracies: accuracy per model, as fractions (shown as percentages).
        losses: loss per model.
    """
    results = pd.DataFrame({
        'Model': model_names,
        'Accuracy': accuracies,
        'Loss': losses,
    })

    _, acc_axis = plt.subplots(figsize=(12, 6))

    # Accuracy bars on the primary axis (higher is better)
    sns.barplot(data=results, x='Model', y='Accuracy', ax=acc_axis, palette='viridis', alpha=0.7)
    acc_axis.set_title('Test Data Performance Comparison per Model')
    acc_axis.set_ylabel('Accuracy')
    acc_axis.set_ylim(0, 1)

    # Percentage label centred just above each bar
    for bar in acc_axis.patches:
        bar_top = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2.
        acc_axis.annotate(
            f'{bar_top * 100:.2f}%',
            (bar_centre, bar_top),
            ha='center', va='center',
            xytext=(0, 9),
            textcoords='offset points',
            color='black', fontsize=12, fontweight='bold',
        )

    # Loss line on a secondary axis sharing the same x-axis (lower is better)
    loss_axis = acc_axis.twinx()
    sns.lineplot(data=results, x='Model', y='Loss', ax=loss_axis, color='red', marker='o', linewidth=2.5)
    loss_axis.set_ylabel('Loss')
    loss_axis.set_ylim(0, max(losses) * 1.2)

    # Value label just above every point of every line on the loss axis
    for curve in loss_axis.lines:
        for point in zip(curve.get_xdata(), curve.get_ydata()):
            loss_axis.annotate(
                f'{point[1]:.2f}',
                point,
                ha='center', va='bottom',
                xytext=(0, 5),
                textcoords='offset points',
                color='red', fontsize=10, fontweight='bold',
            )

    plt.show()

# (name, test accuracy, test loss) for each ResNet variant, in display order
_resnet_results = [
    ("ResNet50", resnet50_acc, resnet50_loss),
    ("ResNet101", resnet101_acc, resnet101_loss),
    ("ResNet152", resnet152_acc, resnet152_loss),
]
resnet_models, resnet_accs, resnet_loss = (list(column) for column in zip(*_resnet_results))

plot_comparison(resnet_models, resnet_accs, resnet_loss)

## Looks like our ResNet101 Model performed the best out of the ResNet Models!
Let's try VGG16, VGG19, and EfficientNetB0. We will take the best of the six models and see if we can tune some of its hyperparameters.

In [None]:
# One learning rate shared by the VGG16, VGG19, and EfficientNetB0 runs
learning_rate = 0.0001

# Metric watched by both early stopping and the checkpoints
monitor = "val_loss"

# Stop after 10 epochs without improvement and roll back to the best epoch's weights
early_stop = EarlyStopping(
    restore_best_weights=True,
    patience=10,
    monitor=monitor,
)

# Checkpoints protect against a crash or an interrupted run, and also keep the
# weights of the best-performing epoch so they can be reloaded for evaluation.
_checkpoint_options = dict(
    monitor=monitor,
    save_best_only=True,
    save_weights_only=True,
)

checkpoint_VGG16 = ModelCheckpoint('VGG16_best.weights.h5', **_checkpoint_options)
checkpoint_VGG19 = ModelCheckpoint('VGG19_best.weights.h5', **_checkpoint_options)
checkpoint_EfficientNetB0 = ModelCheckpoint('EfficientNetB0_best.weights.h5', **_checkpoint_options)

## VGG16

In [None]:
# Frozen VGG16 backbone (no top classifier) used as a fixed feature extractor
VGG16_base = VGG16(include_top=False, input_shape=(224, 224, 3))
VGG16_base.trainable = False

# Trainable classification head ending in a 4-way softmax
VGG16_model = Sequential([
    VGG16_base,
    BatchNormalization(),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax')
])
VGG16_model.compile(optimizer=Adam(learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

# The data iterators already yield batches, so `batch_size` is not passed to fit()
# (Keras documents that it must not be specified for generator-style inputs).
history_VGG16 = VGG16_model.fit(
    train_data,
    validation_data=valid_data,
    epochs=100,
    callbacks=[early_stop, checkpoint_VGG16],
    verbose=2
)

In [None]:
# Training curves for the VGG16 run
plot_accuracy(history=history_VGG16)

In [None]:
# Reload the best VGG16 checkpoint and score it on the test data
VGG16_acc, VGG16_loss = plot_confusion_matrix_and_report(
    model=VGG16_model,
    test_data=test_data,
    class_names=class_names,
    checkpoint_path='./VGG16_best.weights.h5',
)

## VGG19

In [None]:
# Frozen VGG19 backbone (no top classifier) used as a fixed feature extractor
VGG19_base = VGG19(include_top=False, input_shape=(224, 224, 3))
VGG19_base.trainable = False

# Trainable classification head ending in a 4-way softmax
VGG19_model = Sequential([
    VGG19_base,
    BatchNormalization(),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax')
])
VGG19_model.compile(optimizer=Adam(learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

# The data iterators already yield batches, so `batch_size` is not passed to fit()
# (Keras documents that it must not be specified for generator-style inputs).
history_VGG19 = VGG19_model.fit(
    train_data,
    validation_data=valid_data,
    epochs=100,
    callbacks=[early_stop, checkpoint_VGG19],
    verbose=2
)

In [None]:
# Training curves for the VGG19 run
plot_accuracy(history=history_VGG19)

In [None]:
# Reload the best VGG19 checkpoint and score it on the test data
VGG19_acc, VGG19_loss = plot_confusion_matrix_and_report(
    model=VGG19_model,
    test_data=test_data,
    class_names=class_names,
    checkpoint_path='./VGG19_best.weights.h5',
)

## EfficientNetB0

In [None]:
# Frozen EfficientNetB0 backbone (no top classifier) used as a fixed feature extractor
EfficientNetB0_base = EfficientNetB0(include_top=False, input_shape=(224, 224, 3))
EfficientNetB0_base.trainable = False

# Trainable classification head ending in a 4-way softmax
EfficientNetB0_model = Sequential([
    EfficientNetB0_base,
    BatchNormalization(),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax')
])

EfficientNetB0_model.compile(optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

# The data iterators already yield batches, so `batch_size` is not passed to fit()
# (Keras documents that it must not be specified for generator-style inputs).
history_EfficientNetB0 = EfficientNetB0_model.fit(
    train_data,
    validation_data=valid_data,
    epochs=100,
    callbacks=[early_stop, checkpoint_EfficientNetB0],
    verbose=2
)

In [None]:
# Training curves for the EfficientNetB0 run
plot_accuracy(history=history_EfficientNetB0)

In [None]:
# Reload the best EfficientNetB0 checkpoint and score it on the test data
EfficientNetB0_acc, EfficientNetB0_loss = plot_confusion_matrix_and_report(
    model=EfficientNetB0_model,
    test_data=test_data,
    class_names=class_names,
    checkpoint_path='./EfficientNetB0_best.weights.h5',
)

## Let's evaluate our VGGs and EfficientNetB0 models!

In [None]:
# (name, test accuracy, test loss) for each additional model, in display order
_bonus_results = [
    ("VGG16", VGG16_acc, VGG16_loss),
    ("VGG19", VGG19_acc, VGG19_loss),
    ("EfficientNetB0", EfficientNetB0_acc, EfficientNetB0_loss),
]
bonus_models, bonus_accs, bonus_loss = (list(column) for column in zip(*_bonus_results))

plot_comparison(bonus_models, bonus_accs, bonus_loss)

## Initial Conclusion
### VGG16 and VGG19 perform the best on this lung cancer image data set! 87.62% accuracy is pretty good, but let's see if we can get it up to 90% while either maintaining our 0.41 loss or lowering it as well.
Let's try:
1. Data augmentation
2. Learning rate adjustment
3. Regularization

## Data augmentation for VGG16

In [None]:
# Separate checkpoint so the augmented run can be compared to the original VGG16
checkpoint_augmented_VGG16 = ModelCheckpoint(
    'augmented_VGG16_best.weights.h5',
    monitor=monitor,
    save_best_only=True,
    save_weights_only=True,
)

# Random geometric augmentation, applied to the training images only
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

train_generator = datagen.flow_from_directory(
    './archive/Data/train',
    target_size=input_size,
    batch_size=batch_size,
    class_mode='categorical'
)

valid_datagen = ImageDataGenerator()  # No augmentation for validation data
valid_generator = valid_datagen.flow_from_directory(
    './archive/Data/valid',
    shuffle=False,
    batch_size=batch_size,
    target_size=input_size,
    class_mode="categorical"
)

# Build a fresh model with the same architecture as the original VGG16 run.
# Continuing to train the already-fitted VGG16_model would mix the effect of
# augmentation with extra training epochs, so the two runs would not be comparable.
augmented_VGG16_base = VGG16(include_top=False, input_shape=(224, 224, 3))
augmented_VGG16_base.trainable = False
augmented_VGG16_model = Sequential([
    augmented_VGG16_base,
    BatchNormalization(),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax')
])
augmented_VGG16_model.compile(optimizer=Adam(learning_rate), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model; the generators already yield batches, so `batch_size` is not
# passed to fit(). The best weights are written to the augmented checkpoint file.
augmented_VGG16_history = augmented_VGG16_model.fit(
    train_generator,
    validation_data=valid_generator,
    epochs=100,
    callbacks=[early_stop, checkpoint_augmented_VGG16],
    verbose=2
)

In [None]:
# Training curves for the augmented VGG16 run
plot_accuracy(history=augmented_VGG16_history)

# Load the augmented checkpoint into the VGG16 architecture, score it on the
# test data, and plot the confusion matrix
augment_VGG16_acc, augment_VGG16_loss = plot_confusion_matrix_and_report(
    model=VGG16_model,
    test_data=test_data,
    class_names=class_names,
    checkpoint_path='./augmented_VGG16_best.weights.h5',
)

# Original versus augmented VGG16: (name, test accuracy, test loss)
_vgg_results = [
    ("VGG16", VGG16_acc, VGG16_loss),
    ("VGG16_aug", augment_VGG16_acc, augment_VGG16_loss),
]
vgg_models, vgg_accs, vgg_loss = (list(column) for column in zip(*_vgg_results))

plot_comparison(vgg_models, vgg_accs, vgg_loss)

## Adjust the learning rate

In [None]:
def model_learning_rates(lr):
    """Build and train a frozen-backbone VGG16 classifier at one learning rate.

    The best weights (lowest validation loss) are written to
    ``./VGG16_LR_<lr>_best.weights.h5``. Training uses the module-level
    ``train_generator``, ``valid_generator`` and ``early_stop`` objects.

    Args:
        lr: learning rate passed to the Adam optimizer.

    Returns:
        Tuple ``(model, history)``: the trained model and the value returned
        by ``fit``.
    """
    checkpoint_path = './VGG16_LR_' + str(lr) + '_best.weights.h5'
    model_checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, save_weights_only=True)

    # Frozen VGG16 backbone (no top classifier) used as a fixed feature extractor
    vgg16_base = VGG16(include_top=False, input_shape=(224, 224, 3))
    vgg16_base.trainable = False

    vgg16_lr_model = Sequential([
        vgg16_base,
        BatchNormalization(),
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(4, activation='softmax')
    ])

    vgg16_lr_model.compile(optimizer=Adam(learning_rate=lr), loss='categorical_crossentropy', metrics=['accuracy'])

    # The generators already yield batches, so `batch_size` is not passed to fit()
    history = vgg16_lr_model.fit(
        train_generator,
        validation_data=valid_generator,
        epochs=100,
        callbacks=[early_stop, model_checkpoint],
        verbose=2
    )
    return vgg16_lr_model, history

# Train one model per additional learning rate and collect its test metrics

learning_rates = [0.00001, 0.001, 0.01] # We already have a model of 0.0001

# Seed the results with the existing 0.0001 run.
# NOTE(review): this baseline was trained on train_data (no augmentation), while
# the sweep below trains on train_generator - confirm the comparison is intended.
lr_titles = ["VGG16_LR_0.0001"]
lr_accuracies = [VGG16_acc]
lr_losses = [VGG16_loss]

for lr in learning_rates:
    print(f"Training with learning rate: {lr}")
    model, history = model_learning_rates(lr)
    plot_accuracy(history)

    # Same file name that model_learning_rates() wrote its best weights to
    weights_file = f'./VGG16_LR_{lr}_best.weights.h5'
    acc, loss = plot_confusion_matrix_and_report(model, test_data, class_names, weights_file)

    lr_titles.append(f"VGG16_LR_{lr}")
    lr_accuracies.append(acc)
    lr_losses.append(loss)

In [None]:
# Test accuracy and loss for every learning rate tried
plot_comparison(model_names=lr_titles, accuracies=lr_accuracies, losses=lr_losses)

## Regularization
Let's try adding some L2 (ridge) regularization. Regularization can be helpful when you have a complex model with lots of features like we have with our VGG16 model. L2 helps reduce overfitting by penalizing large weights.

Let's try a couple different lambdas:

- L2 Regularization (λ = 0.001): Provides a moderate regularization effect.
- L2 Regularization (λ < 0.001): Weaker regularization, allowing more flexibility.
- L2 Regularization (λ > 0.001): Stronger regularization, reducing the model’s capacity to fit noise in the training data.

In [None]:
def model_with_lambdas(lamb):
    """Build and train a frozen-backbone VGG16 classifier with L2 regularization.

    The L2 penalty is applied to the kernel of the 256-unit dense layer. The
    best weights (lowest validation loss) are written to
    ``./VGG16_lamda_<lamb>_best.weights.h5``. Training uses the module-level
    ``train_generator``, ``valid_generator`` and ``early_stop`` objects.

    Args:
        lamb: L2 regularization factor.

    Returns:
        Tuple ``(model, history)``: the trained model and the value returned
        by ``fit``.
    """
    # File name spelling ("lamda") is kept as-is: callers rebuild the same path
    checkpoint_path = './VGG16_lamda_' + str(lamb) + '_best.weights.h5'
    model_checkpoint = ModelCheckpoint(checkpoint_path, monitor='val_loss', save_best_only=True, save_weights_only=True)

    # Frozen VGG16 backbone (no top classifier) used as a fixed feature extractor
    vgg16_base = VGG16(include_top=False, input_shape=(224, 224, 3))
    vgg16_base.trainable = False

    VGG16_reg_model = Sequential([
        vgg16_base,
        BatchNormalization(),
        Flatten(),
        Dense(256, activation='relu', kernel_regularizer=l2(lamb)),
        Dropout(0.5),
        Dense(4, activation='softmax')
    ])

    # back to our 0.0001 LR since it performed the best
    VGG16_reg_model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

    # The generators already yield batches, so `batch_size` is not passed to fit()
    history = VGG16_reg_model.fit(
        train_generator,
        validation_data=valid_generator,
        epochs=100,
        callbacks=[early_stop, model_checkpoint],
        verbose=2
    )
    return VGG16_reg_model, history

# Train one model per L2 strength and collect its test metrics
l2_lambdas = [0.0001, 0.001, 0.01]
lambda_losses = []
lambda_accuracies = []
lambda_titles = []

for lam in l2_lambdas:
    print(f"Training with l2 reg: {lam}")
    model, history = model_with_lambdas(lam)
    plot_accuracy(history)
    # Same file name that model_with_lambdas() wrote its best weights to
    acc, loss = plot_confusion_matrix_and_report(model, test_data, class_names, './VGG16_lamda_' + str(lam) + '_best.weights.h5')
    lambda_losses.append(loss)
    lambda_accuracies.append(acc)
    # Label by lambda: these runs vary the L2 factor, not the learning rate
    lambda_titles.append("VGG16_lambda_" + str(lam))


In [None]:
# Label each run by its L2 factor; the sweep varies lambda, not the learning rate
lambda_plot_labels = [f"VGG16_lambda_{l2_value}" for l2_value in l2_lambdas]
plot_comparison(lambda_plot_labels, lambda_accuracies, lambda_losses)

# Conclusion
The hyperparameter investigation led to worse performance/overfitting. Therefore...
## Out of the box VGG16 is the best model to use for this lung cancer set!