In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from google.cloud import storage
from urllib.parse import quote
import re

# Mount Google Drive into this runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Authenticate the Colab user for this runtime (presumably so the
# google.cloud.storage client imported above can obtain credentials).
from google.colab import auth
auth.authenticate_user()

Mounted at /content/drive


In [None]:
# Set the gcloud CLI's core/project property to the project used by this notebook.
!gcloud config set project "ac215-decaide"

Updated property [core/project].


In [None]:
# NOTE(review): this command blocks on an interactive prompt, so "Run All" stalls here.
# It is likely redundant: auth.authenticate_user() in the first cell already
# authenticates this runtime, and gcloud's own prompt warns that signing in with a
# personal account on a VM may expose those credentials to other users of the VM.
!gcloud auth application-default login


You are running on a Google Compute Engine virtual machine.
The service credentials associated with this virtual machine
will automatically be used by Application Default
Credentials, so it is not necessary to use this command.

If you decide to proceed anyway, your user credentials may be visible
to others with access to this virtual machine. Are you sure you want
to authenticate with your personal account?

Do you want to continue (Y/n)?  Y

Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fapplicationdefaultauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=lCTjkteQU08Bn9iZesJytTv966p840&prompt=consent&token_

# Loading Data

GCP bucket link: https://console.cloud.google.com/storage/browser/ac215-decaide;tab=objects?forceOnBucketsSortingFiltering=true&authuser=1&project=ac215-decaide&supportedpurview=project&prefix=&forceOnObjectsSortingFiltering=false

Note: the cell below, which loads every image from the GCP bucket subfolder, takes about 6 minutes to run.

In [None]:
# Load the metadata CSV file.
# Prefer the copy on the mounted Drive; if it is not there, fall back to a
# 'clean_metadata.csv' in the current working directory.
metadata_file = '/content/drive/MyDrive/APCOMP 215/AC_215/data/clean_metadata.csv'
if not os.path.exists(metadata_file):
  metadata_file = 'clean_metadata.csv'
metadata_df = pd.read_csv(metadata_file)

# Initialize the GCS client and bucket
client = storage.Client()
bucket_name = 'ac215-decaide'
bucket = client.bucket(bucket_name)
folder_path = 'images/clean_data/'
# List all blobs whose name starts with the folder prefix. The result is
# consumed once by the loading loop, so re-run this cell before re-running it.
blobs = bucket.list_blobs(prefix=folder_path)

# Parallel lists: images[i] is the decoded image stored at image_paths[i]
images = []
image_paths = []

# Function to load an image from GCP bucket
def load_image_from_gcp(bucket, image_path):
  """Download one image object from a GCS bucket and resize it to 224x224.

  Args:
    bucket: Bucket object providing `.blob(name)`.
    image_path: Full object name (path) of the image inside the bucket.

  Returns:
    A float tensor of shape (224, 224, 3). `tf.image.resize` does not rescale
    intensities, so values stay on the decoded 0-255 scale.
  """
  img_blob = bucket.blob(image_path)

  # Download the image bytes and decode them
  img_bytes = img_blob.download_as_bytes()
  # expand_animations=False makes GIFs decode to their first frame, so every
  # supported format yields a 3-D (height, width, channels) tensor instead of
  # a 4-D frame stack that would break the downstream array conversion.
  img = tf.image.decode_image(img_bytes, channels=3, expand_animations=False)
  img = tf.image.resize(img, (224, 224))
  return img

# Load all images from the specified folder in the bucket
for blob in blobs:
  img_path = blob.name  # Get the full blob name (path)

  # Skip "folder" placeholder objects (names ending in '/'): they contain no
  # image data and would only produce a spurious decode error below.
  if img_path.endswith('/'):
    continue

  try:
    img = load_image_from_gcp(bucket, img_path)
    img_array = tf.keras.preprocessing.image.img_to_array(img)
  except Exception as e:
    # Best-effort load: report the failure and continue with the next object.
    print(f"Error loading {img_path}: {e}")
  else:
    # Append both together so images[i] always corresponds to image_paths[i].
    images.append(img_array)
    image_paths.append(img_path)

In [None]:
# Number of images per batch; used below both for the chunked normalization
# loop and for batching the tf.data pipeline.
batch_size = 32

# Function to process images in smaller batches
def process_images(image_batch):
    """Normalize a batch of RGB images with the ImageNet channel statistics.

    Args:
        image_batch: Sequence (or tensor) of channels-last RGB images whose
            pixel values are on the 0-255 scale.

    Returns:
        A float32 tensor with the same shape as the input, computed per
        channel as ((pixels / 255) - mean) / std.
    """
    images_tensor = tf.convert_to_tensor(image_batch, dtype=tf.float32)
    # The mean/std constants below are defined for intensities in [0, 1], so
    # bring the 0-255 pixel values onto that scale before standardizing.
    images_tensor = images_tensor / 255.0
    # Normalize using the mean and std for ImageNet
    # NOTE(review): the Keras ResNet50 ImageNet weights are documented to expect
    # tf.keras.applications.resnet50.preprocess_input instead; confirm which
    # preprocessing the model should be fine-tuned with.
    mean = tf.constant([0.485, 0.456, 0.406])
    std = tf.constant([0.229, 0.224, 0.225])
    return (images_tensor - mean) / std

# Load images in smaller batches and process.
# NOTE: `images` is rebound below from the raw list to the normalized tensor,
# so re-running this cell without re-running the loading cell would normalize
# the data a second time.
processed_images = []
for batch in range(0, len(images), batch_size):
    image_batch = images[batch:batch + batch_size]
    processed_images.append(process_images(image_batch))

# Concatenate all processed images
images = tf.concat(processed_images, axis=0)

# Clean and filter metadata_df since there are duplicates
image_paths = [path.replace('images/clean_data/', '') for path in image_paths]
metadata_df = metadata_df.drop_duplicates(subset='filename', keep='first')
metadata_df = metadata_df[metadata_df['filename'].isin(image_paths)]
print(f"Number of images in cleaned/filtered metadata_df: {metadata_df.shape[0]} \n Number of images that exist and were loaded from GCP: {len(image_paths)}")

# Align metadata rows with the loaded images. The images are in bucket-listing
# order while metadata_df is in CSV order, so without this step labels[i] would
# not necessarily belong to images[i].
metadata_df = metadata_df.set_index('filename', drop=False)
has_label = [path in metadata_df.index for path in image_paths]
if not all(has_label):
    # Drop images that have no metadata row so both sides keep the same length.
    images = tf.boolean_mask(images, has_label)
    image_paths = [path for path, keep in zip(image_paths, has_label) if keep]
# 'filename' is unique after drop_duplicates, so .loc returns exactly one row
# per path, in image_paths order.
metadata_df = metadata_df.loc[image_paths].reset_index(drop=True)
assert len(metadata_df) == len(image_paths) == int(images.shape[0]), \
    "images and metadata rows are out of sync"

# Encode labels (row i of metadata_df now describes images[i])
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(metadata_df['label'].values)
categorical_labels = tf.keras.utils.to_categorical(encoded_labels)

# Create a TensorFlow Dataset
dataset = tf.data.Dataset.from_tensor_slices((images, categorical_labels))
dataset = dataset.shuffle(buffer_size=1000).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)


Number of images in cleaned/filtered metadata_df: 5554 
 Number of images that exist and were loaded from GCP: 5554


# ResNet50

In [None]:
# Load ResNet50 pretrained on ImageNet without its classification head, then
# attach global average pooling and a softmax layer with one unit per class.
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
x = base_model.output
x = GlobalAveragePooling2D()(x)
predictions = Dense(len(label_encoder.classes_), activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=predictions)

# Freeze all layers except the last 10 -> fine-tune the last 10 layers
# (must happen before compile() so the trainable flags take effect).
for layer in base_model.layers[:-10]:
  layer.trainable = False

# Compile the model (Adam with its default learning rate; one-hot labels ->
# categorical cross-entropy).
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model.
# Keep the returned History so per-epoch loss/accuracy can be inspected or
# plotted later, and so its repr is not echoed as the cell output.
# NOTE(review): no validation data is passed, so the reported accuracy is
# training accuracy only.
history = model.fit(dataset, epochs=30)

Epoch 1/30
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 37ms/step - accuracy: 0.2089 - loss: 4.9285
Epoch 2/30
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.2998 - loss: 3.2449
Epoch 3/30
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.3441 - loss: 3.0046
Epoch 4/30
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 20ms/step - accuracy: 0.3581 - loss: 2.7546
Epoch 5/30
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.3963 - loss: 2.6222
Epoch 6/30
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.4138 - loss: 2.3170
Epoch 7/30
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.4529 - loss: 2.0873
Epoch 8/30
[1m174/174[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.4723 - loss: 2.0081
Epoch 9/30
[1m174/174[0m [32

<keras.src.callbacks.history.History at 0x7d4e7449f910>