# Load Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator


In [None]:
# Every input path hangs off the same dataset root, so spell the root out once.
_DATA_ROOT = '/kaggle/input/histopathologic-cancer-detection'

train_path = f'{_DATA_ROOT}/train_labels.csv'
# NOTE(review): test_path is not read by any cell in this notebook; confirm
# that a test.csv really exists in the dataset before relying on it.
test_path = f'{_DATA_ROOT}/test.csv'
sample_path = f'{_DATA_ROOT}/sample_submission.csv'

# EDA 

## 1. Load and Check the first 5 images

In [None]:
# Preview the first five training images side by side.
# (pandas, matplotlib and cv2 come from the import cell at the top of the
# notebook; `train_path` comes from the path cell.)

# Label table with an "id" column (image file stem) and a "label" column;
# later cells reuse `labels_df`.
labels_df = pd.read_csv(train_path)

# IDs of the first five rows
first_five_ids = labels_df["id"].head(5).tolist()

# One row of five panels
fig, axes = plt.subplots(1, 5, figsize=(15, 5))

# Hide ticks/frames on every panel up front so that panels without an image
# (unreadable file, or fewer than five rows) look the same as the others.
for ax in axes:
    ax.axis("off")

for ax, image_id in zip(axes, first_five_ids):
    image_path = f"/kaggle/input/histopathologic-cancer-detection/train/{image_id}.tif"
    ax.set_title(image_id[:6])  # Shortened ID for readability

    # cv2.imread returns None (it does not raise) when the file cannot be read
    image = cv2.imread(image_path)
    if image is None:
        # Axes-relative coordinates keep the message centred in the panel
        ax.text(0.5, 0.5, "Image Not Found", ha='center', va='center',
                fontsize=12, transform=ax.transAxes)
        continue

    # OpenCV returns BGR; matplotlib expects RGB
    ax.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

plt.tight_layout()
plt.show()


In [None]:

# Folder holding the training images
image_dir = '/kaggle/input/histopathologic-cancer-detection/train/'

# IDs taken from the first five rows of the label table
first_five_ids = labels_df['id'].head(5).tolist()


def _pixel_summary(image_id):
    """Build one summary record for the image named `image_id`.

    The record holds the minimum, maximum and mean pixel value of the array
    returned by cv2.imread. When cv2.imread returns None, the record carries
    an "Error" entry instead of the statistics.
    """
    pixels = cv2.imread(f"{image_dir}{image_id}.tif")
    if pixels is None:
        return {"Image ID": image_id, "Error": "Image not found"}
    return {
        "Image ID": image_id,
        "Min Pixel Value": int(pixels.min()),
        "Max Pixel Value": int(pixels.max()),
        "Mean Pixel Value": float(pixels.mean()),
    }


# One record per image, kept in a list
pixel_stats = [_pixel_summary(image_id) for image_id in first_five_ids]

# Tabulate the records and print the table
pixel_stats_df = pd.DataFrame(pixel_stats)
print(pixel_stats_df)



- The pixel values range from 0 to 255, which means the images are not normalized. I will need to normalize them by dividing by 255 before feeding them into a CNN.
- The images have a wide range of brightness.



## 2. Understand the Label Distribution

In [None]:
import seaborn as sns

# Bar chart counting how many rows of the label table carry each label value
label_ax = sns.countplot(data=labels_df, x='label')
label_ax.set_title("Distribution of Labels (No Tumor vs Tumor)")
plt.show()


- The count of images with No Tumor (label 0) is significantly larger than the count of images with Tumor (label 1).
- This suggests that the dataset is imbalanced, with more images that don't contain tumor tissue compared to those that do. This imbalance could be a concern when training models, as the model may become biased towards predicting the majority class.

## 3. Image Shape and Size

In [None]:
# Load one sample image (the first row of the label table, by position) to
# check its dimensions
sample_image_path = f"/kaggle/input/histopathologic-cancer-detection/train/{labels_df['id'].iloc[0]}.tif"
sample_image = cv2.imread(sample_image_path)

# cv2.imread reports failure by returning None instead of raising, so stop
# here with the offending path rather than an AttributeError on `.shape`.
if sample_image is None:
    raise FileNotFoundError(f"Could not read image: {sample_image_path}")

print(f"Image shape: {sample_image.shape}")


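The image shape of (96, 96, 3) indicates that each image is 96x96 pixels with 3 color channels. Note that `cv2.imread` returns the channels in BGR order, which is why the display code above converts to RGB before plotting.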

# CNN Model Training 

## 1. Build CNN

In [None]:
# Simple CNN for binary classification of 96x96 three-channel images,
# declared as a single ordered list of layers.
model = Sequential([
    # Feature extractor: two conv + max-pool stages (32 filters, then 64)
    Conv2D(32, (3, 3), activation='relu', input_shape=(96, 96, 3)),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # Classifier head: flatten, one hidden dense layer, dropout for regularization
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),

    # Single sigmoid unit for the binary output
    Dense(1, activation='sigmoid'),
])

# Adam with default settings, binary cross-entropy loss, accuracy as the metric
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])


## 2. Train the model

In [None]:
# Train the CNN from generators that stream images off disk.
# (pandas and ImageDataGenerator come from the import cell at the top of the
# notebook; `model` comes from the "Build CNN" cell.)

# --- Configuration -----------------------------------------------------------
data_dir = "/kaggle/input/histopathologic-cancer-detection/train"  # Path to images
csv_path = "/kaggle/input/histopathologic-cancer-detection/train_labels.csv"  # Path to CSV file

IMAGE_SIZE = (96, 96)     # matches the model's input_shape
BATCH_SIZE = 32
EPOCHS = 10
VALIDATION_SPLIT = 0.2    # 80/20 train/validation split
SEED = 42                 # makes sampling and the training shuffle repeatable
SUBSET_SIZE = None        # e.g. 20000 for a faster trial run; None = all rows

# --- Label table in the form flow_from_dataframe expects ---------------------
df = pd.read_csv(csv_path)

# Ensure column names are correct
df.columns = ["id", "label"]

# class_mode="binary" needs the label column as strings
df["label"] = df["label"].astype(str)

# File names on disk carry a '.tif' extension
df["id"] = df["id"].astype(str) + ".tif"

# Optionally work on a random sample of the rows
df_subset = df if SUBSET_SIZE is None else df.sample(n=SUBSET_SIZE, random_state=SEED)

# --- Generators --------------------------------------------------------------
# Pixel values are rescaled to [0, 1]; validation_split lets the two
# generators below draw disjoint subsets of the same dataframe.
datagen = ImageDataGenerator(rescale=1.0 / 255,
                             validation_split=VALIDATION_SPLIT)

# Arguments shared by both generators, kept in one place so they cannot drift
flow_args = dict(
    dataframe=df_subset,
    directory=data_dir,      # Folder containing all images
    x_col="id",              # Column with image file names
    y_col="label",           # Column with binary labels
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode="binary",     # Binary classification (0 or 1)
)

# Training generator: shuffled, with a fixed seed so runs see the same order
train_generator = datagen.flow_from_dataframe(
    subset="training",
    shuffle=True,
    seed=SEED,
    **flow_args,
)

# Validation generator: not shuffled, so batches follow dataframe order and
# predictions on it can be lined up with `val_generator.classes` / `.filenames`
val_generator = datagen.flow_from_dataframe(
    subset="validation",
    shuffle=False,
    **flow_args,
)

# --- Train -------------------------------------------------------------------
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=EPOCHS,
)




## 3. Visualize Training History

In [None]:
# Learning curves from the fit history: accuracy on the left, loss on the right
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
curves = history.history

for ax, metric, title in zip(axes, ('accuracy', 'loss'), ('Accuracy', 'Loss')):
    ax.plot(curves[metric], label=f'Train {title}')
    ax.plot(curves[f'val_{metric}'], label=f'Val {title}')
    ax.set_title(title)
    ax.legend()
    # NOTE(review): the same 0-1 limit is applied to the loss panel, so any
    # loss value above 1 falls outside the visible area - confirm that is intended.
    ax.set_ylim(0, 1)

plt.tight_layout()
plt.show()



# Make Predictions

In [None]:
# Make predictions on the test set.
# (os, pandas and ImageDataGenerator come from the import cell at the top of
# the notebook; `model` is the network trained above.)

# NOTE(review): assumes the test images sit in a `test` folder next to
# `train` and are named `<id>.tif` like the training images - confirm.
test_dir = "/kaggle/input/histopathologic-cancer-detection/test"

# Image IDs = file stems of the .tif files, sorted so the order is stable
test_image_ids = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(test_dir)
    if file_name.endswith(".tif")
)

# Stream the images with the same preprocessing the model was trained on
# (1/255 rescale, 96x96 target size) instead of loading them all into memory.
# class_mode=None yields image batches only; shuffle=False keeps the rows of
# `predictions` aligned with `test_image_ids`.
test_files = pd.DataFrame({"id": [f"{image_id}.tif" for image_id in test_image_ids]})
test_generator = ImageDataGenerator(rescale=1.0 / 255).flow_from_dataframe(
    dataframe=test_files,
    directory=test_dir,
    x_col="id",
    y_col=None,
    target_size=(96, 96),
    batch_size=32,
    class_mode=None,
    shuffle=False,
)

# Sigmoid outputs, one row per test image
predictions = model.predict(test_generator)

# If you need binary predictions (0 or 1)
binary_predictions = (predictions > 0.5).astype(int)
