In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from google.cloud import storage
from urllib.parse import quote
import re

from google.colab import auth, drive

# Make Google Drive visible under /content/drive (the metadata CSV is read from there).
drive.mount('/content/drive')

# Authenticate the Colab user so Google Cloud client libraries can act on their behalf.
auth.authenticate_user()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Set the active gcloud project to the one that owns the image bucket.
!gcloud config set project "ac215-decaide"

Updated property [core/project].


In [None]:
# Interactive login for Application Default Credentials (prompts for Y/n and a sign-in code).
# NOTE(review): the recorded output of this cell reports that the VM's service credentials
# are already used for Application Default Credentials and that this command is not
# necessary -- confirm whether this cell can be dropped; it also blocks an unattended "Run All".
!gcloud auth application-default login


You are running on a Google Compute Engine virtual machine.
The service credentials associated with this virtual machine
will automatically be used by Application Default
Credentials, so it is not necessary to use this command.

If you decide to proceed anyway, your user credentials may be visible
to others with access to this virtual machine. Are you sure you want
to authenticate with your personal account?

Do you want to continue (Y/n)?  Y

Go to the following link in your browser, and complete the sign-in prompts:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fsdk.cloud.google.com%2Fapplicationdefaultauthcode.html&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=1leoFVBQogjT2ssjEQfM7714xajsgV&prompt=consent&token_

# Loading Data

GCP bucket link: https://console.cloud.google.com/storage/browser/ac215-decaide;tab=objects?forceOnBucketsSortingFiltering=true&authuser=1&project=ac215-decaide&supportedpurview=project&prefix=&forceOnObjectsSortingFiltering=false

The cell that downloads all images from the `images/clean_data/` folder of the GCP bucket takes about 6 minutes to run.

In [None]:
# NOTE: disabled earlier version of the loading/preprocessing pipeline. It reads the images
# from the Google Drive folder below rather than from the GCS bucket; presumably kept for
# reference only.
# # Load the metadata
# metadata_file = '/content/drive/MyDrive/APCOMP 215/AC_215/data/clean_metadata.csv'
# metadata_df = pd.read_csv(metadata_file)

# # Load images and labels
# image_paths = []
# images = []
# labels = []

# for filename in metadata_df['filename']:
#     img_path = os.path.join('/content/drive/MyDrive/APCOMP 215/AC_215/data/clean_data', filename)
#     img = tf.keras.preprocessing.image.load_img(img_path, target_size=(224, 224))
#     img_array = tf.keras.preprocessing.image.img_to_array(img)
#     images.append(img_array)
#     image_paths.append(img_path)

# # Convert to tensor
# images = tf.convert_to_tensor(images, dtype=tf.float32)

# # Normalize using the mean and std for ImageNet
# mean = tf.constant([0.485, 0.456, 0.406])
# std = tf.constant([0.229, 0.224, 0.225])
# images = (images - mean) / std

# # Encode labels
# label_encoder = LabelEncoder()
# encoded_labels = label_encoder.fit_transform(metadata_df['label'].values)
# categorical_labels = tf.keras.utils.to_categorical(encoded_labels)

# # Create a TensorFlow Dataset
# dataset = tf.data.Dataset.from_tensor_slices((images, categorical_labels))
# dataset = dataset.shuffle(buffer_size=len(images)).batch(32).prefetch(tf.data.experimental.AUTOTUNE)


In [None]:
# NOTE: disabled exploratory cell (per-label image counts and a bar chart). Re-enabling it
# requires `matplotlib.pyplot` imported as `plt`, which the import cell at the top of this
# notebook does not do.
# df = pd.read_csv('/content/drive/MyDrive/APCOMP 215/AC_215/data/image_labels.csv')
# df.head()
# print(df.groupby(['label']).count())
# plt.bar(df['label'].unique(), df.groupby(['label']).count().image)
# plt.title('Count of Images by Year')
# plt.show()

In [None]:
# Where the inputs live: label metadata on Drive, images inside a folder of the GCS bucket
metadata_file = '/content/drive/MyDrive/APCOMP 215/AC_215/data/clean_metadata.csv'
bucket_name = 'ac215-decaide'
folder_path = 'images/clean_data/'

# Read the metadata table (one row per image filename with its label)
metadata_df = pd.read_csv(metadata_file)

# Open the bucket and get an iterator over every object whose name starts with folder_path
client = storage.Client()
bucket = client.bucket(bucket_name)
blobs = bucket.list_blobs(prefix=folder_path)

# Accumulators filled by the download loop: decoded image arrays and their blob names
images, image_paths = [], []

# Function to load an image from GCP bucket
def load_image_from_gcp(bucket, image_path):
  """Download one image object from a GCS bucket and resize it to 224x224.

  Args:
    bucket: Bucket handle exposing `blob(name)` (here a google.cloud.storage bucket).
    image_path: Full object name of the image inside the bucket.

  Returns:
    A 3-D tensor of shape (224, 224, 3) produced by `tf.image.resize`. Pixel values
    keep the scale of the decoded image; no normalization is applied here.

  Raises:
    Whatever the download or the decoder raises (e.g. missing object, bytes that are
    not a supported image format). The caller is expected to handle these.
  """
  img_blob = bucket.blob(image_path)

  # Download the image bytes and decode them
  img_bytes = img_blob.download_as_bytes()
  # expand_animations=False keeps the result 3-D for every format; by default GIFs
  # decode to a 4-D (frames, H, W, C) tensor, which would not stack with the other images.
  img = tf.image.decode_image(img_bytes, channels=3, expand_animations=False)
  img = tf.image.resize(img, (224, 224))
  return img

# Walk every blob under the folder. A blob that cannot be downloaded or decoded is
# reported and skipped, so a single bad object does not abort the whole load.
for blob in blobs:
  img_path = blob.name  # full object name, including the folder prefix

  try:
    img = load_image_from_gcp(bucket, img_path)
    img_array = tf.keras.preprocessing.image.img_to_array(img)
  except Exception as e:
    print(f"Error loading {img_path}: {e}")
  else:
    # Only record the path together with its image so both lists stay index-aligned
    images.append(img_array)
    image_paths.append(img_path)

In [None]:
# Build the training dataset from the loaded images and the metadata labels.
#
# Order matters here: the metadata must be de-duplicated, filtered and put in the same
# order as `images` BEFORE the labels are encoded. Otherwise the label array has a
# different length than the image tensor and label i does not belong to image i.
#
# NOTE: this cell rescales `images` in place; re-run the loading cell before re-running it.

# Blob names carry the bucket folder prefix; metadata filenames do not.
image_paths = [path.replace('images/clean_data/', '') for path in image_paths]

# Clean metadata_df: it contains duplicate filenames and rows for images that are not in
# the bucket (5947 entries vs. 3336 images loaded from GCP when this was written).
metadata_df = metadata_df.drop_duplicates(subset='filename', keep='first')

# Position of each filename in the de-duplicated metadata.
row_of = {name: pos for pos, name in enumerate(metadata_df['filename'])}

# Drop loaded images that have no metadata row, since they would have no label.
keep_idx = [i for i, path in enumerate(image_paths) if path in row_of]
if not keep_idx:
  raise ValueError("No loaded image matches a filename in metadata_df")
images = [images[i] for i in keep_idx]
image_paths = [image_paths[i] for i in keep_idx]

# Keep only the rows of loaded images, reordered so that row i describes images[i].
metadata_df = metadata_df.iloc[[row_of[path] for path in image_paths]].reset_index(drop=True)
assert metadata_df.shape[0] == len(image_paths)
print(f"Number of images in cleaned/filtered metadata_df: {metadata_df.shape[0]} \n Number of images that exist and were loaded from GCP: {len(image_paths)}")

# Convert to tensor
images = tf.convert_to_tensor(images, dtype=tf.float32)

# Normalize using the mean and std for ImageNet. These constants are defined for pixel
# values in [0, 1], while the loaded images are on the 0-255 scale, so rescale first.
# NOTE(review): Keras documents `tf.keras.applications.resnet50.preprocess_input` as the
# preprocessing expected by its ResNet50 ImageNet weights -- consider switching to it.
mean = tf.constant([0.485, 0.456, 0.406])
std = tf.constant([0.229, 0.224, 0.225])
images = (images / 255.0 - mean) / std

# Encode labels (fitted on the filtered, image-aligned metadata only)
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(metadata_df['label'].values)
categorical_labels = tf.keras.utils.to_categorical(
    encoded_labels, num_classes=len(label_encoder.classes_))

# Create a TensorFlow Dataset
dataset = tf.data.Dataset.from_tensor_slices((images, categorical_labels))
dataset = dataset.shuffle(buffer_size=len(images)).batch(32).prefetch(tf.data.experimental.AUTOTUNE)

Number of images in cleaned/filtered metadata_df: 3336 
 Number of images that exist and were loaded from GCP: 3336


# ResNet50

In [None]:
# Backbone: ImageNet-pretrained ResNet50 without its original classification head
base_model = ResNet50(include_top=False, weights='imagenet', input_shape=(224, 224, 3))

# New head: global average pooling followed by a softmax over the encoded label classes
x = GlobalAveragePooling2D()(base_model.output)
predictions = Dense(len(label_encoder.classes_), activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=predictions)

# Fine-tuning scope: every backbone layer except the final 10 is frozen; the new head
# is not part of base_model.layers and therefore stays trainable
for layer in base_model.layers[:-10]:
  layer.trainable = False

# Compile after the trainable flags are set
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(),
    metrics=['accuracy'],
)

# Train for 10 passes over the shuffled, batched dataset
model.fit(dataset, epochs=10)

Epoch 1/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 133ms/step - accuracy: 0.1527 - loss: 31.4988
Epoch 2/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 94ms/step - accuracy: 0.3382 - loss: 8.4209
Epoch 3/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 94ms/step - accuracy: 0.3892 - loss: 7.1259
Epoch 4/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 95ms/step - accuracy: 0.4200 - loss: 7.4833
Epoch 5/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 94ms/step - accuracy: 0.4704 - loss: 7.1532
Epoch 6/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 94ms/step - accuracy: 0.4555 - loss: 8.0584
Epoch 7/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 92ms/step - accuracy: 0.5134 - loss: 6.2052
Epoch 8/10
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 95ms/step - accuracy: 0.5107 - loss: 6.5880
Epoch 9/10
[1m105/105

<keras.src.callbacks.history.History at 0x7c84e0e0faf0>

In [None]:
# NOTE: disabled train/test split. It refers to `df` and `train_test_split`, neither of which
# is defined or imported by the active cells visible in this notebook.
# # Split features (images) and labels
# X = df.drop('label', axis=1).values
# y = df['label'].values

# # Split data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Convert labels to one-hot encoding
# label_encoder = LabelEncoder()
# y_train_encoded = tf.keras.utils.to_categorical(label_encoder.fit_transform(y_train))
# y_test_encoded = tf.keras.utils.to_categorical(label_encoder.transform(y_test))


In [None]:
# NOTE: disabled baseline CNN. It relies on `models` / `layers`, which the import cell at the
# top of this notebook does not import, and on X_train / X_test / y_train_encoded /
# y_test_encoded, which are only produced by the (also disabled) split cell.
# # Define CNN model
# model = models.Sequential([
#     layers.Conv2D(32, (3, 3), activation='relu', input_shape=(224,224,3)),
#     layers.MaxPooling2D((2, 2)),
#     layers.Conv2D(64, (3, 3), activation='relu'),
#     layers.MaxPooling2D((2, 2)),
#     layers.Conv2D(64, (3, 3), activation='relu'),
#     layers.Flatten(),
#     layers.Dense(64, activation='relu'),
#     layers.Dense(len(label_encoder.classes_), activation='softmax')
# ])

# # Compile the model
# model.compile(optimizer='adam',
#               loss='categorical_crossentropy',
#               metrics=['accuracy'])

# # Train the model
# history = model.fit(X_train, y_train_encoded, epochs=10, validation_data=(X_test, y_test_encoded))

# # Evaluate the model
# test_loss, test_acc = model.evaluate(X_test, y_test_encoded)
# print('Test accuracy:', test_acc)