<font size="8">Data Acquisition, Modeling and Analysis: Deep Learning - Final Project</font>\
<font size="6">Written by Alexander M. Pellegrino</font>\
<font size="6">Under Dr. Rensheng Wang</font>\
<font size="6">On March 30th, 2024</font>

In [None]:
import os
import pandas as pd
from keras import Model
from keras.applications import ResNet152V2
from keras.callbacks import ModelCheckpoint
from keras.layers import GlobalAveragePooling2D, Dense
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.resnet_v2 import preprocess_input

# Parses Directory for Cat and Dog Images and Labels Them
def create_image_dataframe(directory):
    """Build a DataFrame of file names and string class labels for a directory.

    Args:
        directory: Path of the folder whose entries are listed.

    Returns:
        A pandas DataFrame with two columns:
        'file'  - the entry name (not the full path), sorted alphabetically.
        'label' - '0' when the name starts with 'cat', otherwise '1'.

    Raises:
        OSError: Propagated from os.listdir when the directory cannot be read.
    """
    # os.listdir returns entries in arbitrary order and includes
    # sub-directories; sort for a reproducible row order and keep only
    # regular files so a nested folder is never labelled as an image.
    files = sorted(
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name))
    )

    data = {
        'file': files,
        # Anything that is not prefixed with 'cat' falls into class '1'
        'label': ['0' if file.startswith('cat') else '1' for file in files]
    }

    return pd.DataFrame(data)

In [None]:
# Index the Training Set folder into a file/label table and display it
training_data = create_image_dataframe(directory='Training_Set')
training_data

In [None]:
# Index the Validation Set folder into a file/label table and display it
validation_data = create_image_dataframe(directory='Validation_Set')
validation_data

In [None]:
# Create the Data Generators for the Training and Validation Sets.
# Each one is configured only with the ResNetV2 preprocessing function.
training_data_generator = ImageDataGenerator(preprocessing_function=preprocess_input)
validation_data_generator = ImageDataGenerator(preprocessing_function=preprocess_input)

# Stream the Training Set from disk: file names and labels come from the
# dataframe, images are resized to 224x224 and served in shuffled batches of 32
training_generator = training_data_generator.flow_from_dataframe(
    dataframe=training_data,
    directory='Training_Set/',
    x_col='file', y_col='label',
    class_mode='binary',
    target_size=(224, 224),
    batch_size=32,
    shuffle=True,
)

In [None]:
# Stream the Validation Set from disk with the same image size and batch
# size as the Training Set
validation_generator = validation_data_generator.flow_from_dataframe(
    dataframe=validation_data,
    directory='Validation_Set/',
    x_col='file', y_col='label',
    class_mode='binary',
    target_size=(224, 224),
    batch_size=32,
    shuffle=False,  # Keep a fixed order so validation batches are repeatable
)

In [None]:
# Build the ResNet152v2 classifier

# weights=None starts from random initialization;
# weights='imagenet' would start from a pretrained network instead.

# include_top=False removes the stock output layer so that a custom
# single-unit output can be attached below.
base = ResNet152V2(include_top=False, weights=None, input_shape=(224, 224, 3))

# Pool the backbone's output, then map it to one sigmoid unit
# for the two-class (label '0' vs label '1') decision
pooled_features = GlobalAveragePooling2D()(base.output)
output = Dense(units=1, activation='sigmoid')(pooled_features)

model = Model(inputs=base.input, outputs=output)

In [None]:
# Set Up Checkpoint Saving During Training
# The full model (not just weights) is written only when validation accuracy
# improves; the epoch number and score are embedded in the file name.
checkpoint = ModelCheckpoint(
    'ResNet152V2_Cat_Dog_Classifier_E{epoch:02d}-val_acc{val_accuracy:.4f}.keras',
    monitor='val_accuracy',
    save_best_only=True,
    save_weights_only=False,
    mode='max',
    verbose=0
)

# Train ResNet152v2
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# len(generator) is the number of batches INCLUDING the final partial batch.
# Using samples // batch_size instead would drop the leftover images from
# every training epoch and validation pass (and yield 0 steps when a set
# holds fewer images than one batch).
history = model.fit(
    training_generator,
    steps_per_epoch=len(training_generator),
    epochs=25,
    validation_data=validation_generator,
    validation_steps=len(validation_generator),
    callbacks=[checkpoint]
)

In [None]:
# Score the model in its current (post-training) state on the validation generator
evaluation_metrics = model.evaluate(validation_generator)

# evaluate returns the loss followed by the compiled metric (accuracy)
test_loss, test_accuracy = evaluation_metrics
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

In [None]:
# Create the Data Generator for the Final Testing Set, using the same
# ResNetV2 preprocessing function as the other generators
test_data_generator = ImageDataGenerator(preprocessing_function=preprocess_input)

# Stream the unlabelled Final Testing Set from disk
testing_generator = test_data_generator.flow_from_directory(
    directory='test1/',
    class_mode=None,  # No labels are available for these images
    target_size=(224, 224),
    batch_size=32,
    shuffle=False,  # Keep a fixed order so predictions line up with filenames
)

In [None]:
# Save Results
# One sigmoid score per test image; values near 1 lean towards label '1'
predictions = model.predict(testing_generator)

results = pd.DataFrame({
    # The numeric id is the part of the file name before the first '.'.
    # os.path.basename strips the folder using the platform's own path
    # separator, so this also works where filenames contain '\\' (Windows)
    # rather than '/'.
    'id': [int(os.path.basename(file).split('.')[0]) for file in testing_generator.filenames],
    # predict returns one column per output unit; flatten to a 1-D column
    'label': predictions.flatten()
})

results

In [None]:
# Threshold the scores at 0.5 - below becomes 0, at or above becomes 1
above_threshold = results['label'] >= 0.5
results['label'] = above_threshold.astype(int)

In [None]:
# Order the rows by image id (in place) to match the desired format
results.sort_values(by='id', inplace=True)

# Write the predictions to CSV, leaving out the DataFrame's index column
results.to_csv('Prebuilt_ResNet152v2_Predictions.csv', index=False)