In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from PIL import Image
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
import PIL.Image
import PIL
# Remove Pillow's maximum-pixel limit (None = no limit) so very large
# images can be opened.
PIL.Image.MAX_IMAGE_PIXELS = None

# Read the train / test metadata tables.
train_metadata = pd.read_csv('train.csv')
test_metadata = pd.read_csv('test.csv')

# Keep only the rows whose "is_tma" flag is False.
train_metadata = train_metadata.loc[train_metadata['is_tma'].eq(False)]

# Hold out 10% of the remaining rows for validation (fixed seed for
# repeatable splits).
train_data, val_data = train_test_split(
    train_metadata,
    test_size=0.1,
    random_state=42,
)

# Build the image file paths from the image ids.
# NOTE(review): the train pattern ends in '_.png' while the validation pattern
# ends in '.png', and neither puts a separator after 'train_crops' -- confirm
# both match the real file layout.
train_image_paths = [f'train_crops{img_id}_.png' for img_id in train_data['image_id']]
val_image_paths = [f'train_crops{img_id}.png' for img_id in val_data['image_id']]
test_image_paths = [f'test_images/{img_id}.png' for img_id in test_metadata['image_id']]

In [26]:
# Define a function to load and preprocess images
def load_and_preprocess_image(image_path, target_size=(224, 224)):
    """Load one image file as a fixed-size RGB pixel array.

    Args:
        image_path: Path (or file object) handed to ``Image.open``.
        target_size: ``(width, height)`` passed to ``Image.resize``;
            defaults to ``(224, 224)``.

    Returns:
        numpy.ndarray of shape ``(height, width, 3)`` with the resized RGB
        pixels. Pixel values are not rescaled here.
    """
    # The context manager closes the underlying file once the pixels have
    # been copied out, instead of leaving one open handle per image.
    with Image.open(image_path) as source:
        # Force three channels: palette, grayscale or RGBA files would
        # otherwise yield arrays whose shape differs from the RGB ones and
        # from the (224, 224, 3) input the model is built with.
        resized = source.convert('RGB').resize(target_size)
        return np.array(resized)

# Eagerly load and resize every image of each split into an in-memory list
train_images = list(map(load_and_preprocess_image, train_image_paths))
val_images = list(map(load_and_preprocess_image, val_image_paths))
test_images = list(map(load_and_preprocess_image, test_image_paths))

# Encode the labels as integer class indices (not one-hot vectors); the
# model is compiled with a sparse categorical loss that consumes indices.
from sklearn.preprocessing import LabelEncoder

# Fit the mapping on the training labels only, then apply that same mapping
# to both splits.
label_encoder = LabelEncoder().fit(train_data['label'])
train_labels = label_encoder.transform(train_data['label'])
val_labels = label_encoder.transform(val_data['label'])
# label_encoder is kept around to decode predicted indices back to labels


# Augmentation settings used for random per-image transforms
train_datagen = ImageDataGenerator(
    # geometric jitter
    rotation_range=40,
    zoom_range=0.3,
    width_shift_range=0.2,
    height_shift_range=0.2,
    # flips are switched off in both directions
    horizontal_flip=False,
    vertical_flip=False,
    # how pixels outside the transformed image are filled
    fill_mode='nearest',
)



# Define a data generator function
def data_generator(images, labels, batch_size, data_augmentation=True):
    """Yield random ``(images, labels)`` batches forever.

    Args:
        images: Sequence of image paths; every sampled entry is passed to
            ``load_and_preprocess_image``.
        labels: Sequence of labels indexable in parallel with ``images``.
        batch_size: Requested number of samples per batch. When fewer than
            ``batch_size`` images exist, each batch holds all of them.
        data_augmentation: If true, ``train_datagen.random_transform`` is
            applied to every loaded image.

    Yields:
        Tuple ``(batch_images, batch_labels)`` of numpy arrays, sampled
        without replacement within a batch.

    Raises:
        ValueError: If ``images`` is empty (raised when the first batch is
            requested, because this is a generator function).
    """
    n_samples = len(images)
    if n_samples == 0:
        raise ValueError("data_generator needs at least one image")

    # Sampling without replacement cannot draw more items than exist, so cap
    # the batch at the dataset size instead of letting np.random.choice raise.
    effective_batch = min(batch_size, n_samples)

    while True:
        # Draw distinct random positions for this batch
        indices = np.random.choice(n_samples, size=effective_batch, replace=False)

        batch_images = []
        for idx in indices:
            # Load and preprocess the image
            image = load_and_preprocess_image(images[idx])

            # Apply data augmentation (if enabled)
            if data_augmentation:
                image = train_datagen.random_transform(image)

            batch_images.append(image)

        # Labels are gathered with the same indices so pairs stay aligned
        batch_labels = [labels[idx] for idx in indices]

        yield np.array(batch_images), np.array(batch_labels)

# Batch size shared by both generators
batch_size = 32

# Augmenting generator for the training split, plain one for validation
train_generator = data_generator(
    images=train_image_paths,
    labels=train_labels,
    batch_size=batch_size,
)
val_generator = data_generator(
    images=val_image_paths,
    labels=val_labels,
    batch_size=batch_size,
    data_augmentation=False,
)

# Smoke check: pull a single batch from the training generator
batch_images, batch_labels = next(train_generator)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [9]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Input, AveragePooling2D, Flatten, Dense, Dropout
from tensorflow.keras.models import Model
import tensorflow as tf

In [10]:
# ResNet50 base with ImageNet weights and without its top classifier,
# fed by a 224x224x3 input tensor.
rsntBase = ResNet50(
    include_top=False,
    weights='imagenet',
    input_tensor=Input(shape=(224, 224, 3)),
)

In [11]:
# Stack a custom classification head on top of the ResNet50 base
model = tf.keras.Sequential()
model.add(rsntBase)
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
# NOTE(review): the 6 output units are hard-coded -- confirm this matches the
# number of classes known to label_encoder.
model.add(tf.keras.layers.Dense(6, activation='softmax'))


In [12]:
# Configure training: Adam optimizer, cross-entropy over integer class
# labels, accuracy reported as the metric
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [27]:
# Convert images and labels to numpy arrays and scale pixels to [0, 1]


def _as_unit_float32(images):
    """Stack ``images`` into one float32 array with pixel values in [0, 1].

    Integer-typed pixel data is divided by 255. Data that is already
    floating point is returned unscaled, so re-running this cell does not
    divide the same array by 255 a second time.
    """
    stacked = np.asarray(images)
    if np.issubdtype(stacked.dtype, np.integer):
        # Cast first: dividing a uint8 array by 255.0 directly would promote
        # the whole dataset to float64 and double its memory footprint.
        return stacked.astype(np.float32) / 255.0
    return stacked.astype(np.float32, copy=False)


# NOTE(review): the ResNet50 base is loaded with ImageNet weights; confirm
# that plain 1/255 scaling (rather than the ResNet50 preprocess_input
# convention) is what is intended here.
train_images = _as_unit_float32(train_images)
val_images = _as_unit_float32(val_images)
test_images = _as_unit_float32(test_images)
train_labels = np.asarray(train_labels)
val_labels = np.asarray(val_labels)

# Train the model
history = model.fit(train_images, train_labels, epochs=20, validation_data=(val_images, val_labels))




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [30]:
# Run the trained model on the test images (one row of class scores each)
predictions = model.predict(test_images)

# Pick the highest-scoring class index for every test image
predicted_labels = list(np.argmax(predictions, axis=1))

# Map the class indices back to the original label names
predicted_subtypes = label_encoder.inverse_transform(predicted_labels)

# Show the decoded predictions
print(predicted_subtypes)


['HGSC']


In [31]:
# Assemble the submission table: one row per test image id with its
# predicted subtype
submission_df = pd.DataFrame(
    {
        'image_id': test_metadata['image_id'],
        'predicted_subtype': predicted_subtypes,
    }
)

# Write the table to disk without the index column
submission_df.to_csv('submission.csv', index=False)