In [70]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import MobileNetV3Large
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from google.cloud import storage
import json, io

ImportError: cannot import name 'TransferManager' from 'google.cloud.storage.transfer_manager' (/opt/homebrew/Caskroom/miniconda/base/envs/creature_vis/lib/python3.10/site-packages/google/cloud/storage/transfer_manager.py)

In [54]:
# Set up the Google Cloud Storage client.
# The module-level `client` is the object the GCS helper functions in this
# notebook call; no project or credentials are passed explicitly here.
client = storage.Client()
# Print the client's repr as a quick confirmation that construction succeeded.
print(f"GCS client set up: {client}")

GCS client set up: <google.cloud.storage.client.Client object at 0x318b6b7c0>


In [55]:
# Define constants
BUCKET_NAME = 'creature-vision-training-set'
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32
DATA_DIR = 'incorrect_predictions'

In [56]:
# Function to load JSON data from GCS
def load_json_from_gcs(bucket_name, file_path):
    """Download one JSON object from Google Cloud Storage and parse it.

    Args:
        bucket_name: Name of the GCS bucket that holds the object.
        file_path: Object name (path inside the bucket) of the JSON file.

    Returns:
        The parsed JSON content (whatever ``json.loads`` produces for it).

    Raises:
        Whatever the storage client raises when the download fails, and
        ``json.JSONDecodeError`` if the payload is not valid JSON.
    """
    # client.bucket() only builds a local handle; client.get_bucket() would
    # issue an extra metadata GET on every call, which adds up when this
    # helper is invoked once per label file.
    blob = client.bucket(bucket_name).blob(file_path)
    # download_as_string() is a deprecated alias of download_as_bytes();
    # json.loads accepts bytes directly.
    data = json.loads(blob.download_as_bytes())
    print(f"Loaded JSON from {file_path}")
    return data

In [57]:
# Function to list all files in a GCS directory
def list_gcs_files(bucket_name, prefix):
    """Return the names of all objects in ``bucket_name`` that start with ``prefix``.

    Also prints how many objects were found.
    """
    files = []
    for blob in client.get_bucket(bucket_name).list_blobs(prefix=prefix):
        files.append(blob.name)
    print(f"Found {len(files)} files in {prefix}")
    return files

In [62]:
# List everything under the data prefix, then keep only the JSON files whose
# path contains none of the excluded substrings.
all_files = list_gcs_files(BUCKET_NAME, DATA_DIR)
list_to_filter = ['stbernard', 'terrier scottish', 'terrier kerryblue', 'terrier westhighland']
json_files = [
    name
    for name in all_files
    if name.endswith('.json') and all(sub not in name for sub in list_to_filter)
]
print(f"Total files: {len(all_files)}, JSON files: {len(json_files)}")

Found 3976 files in incorrect_predictions
Total files: 3976, JSON files: 1864


In [59]:
# Scratch note: how to filter unwanted items out of a list with a comprehension (the same filter is applied where json_files is built)
# json_files = [item for item in json_files if not any(sub in item for sub in list_to_filter)]

In [None]:
# labels maps image file name -> api_label; all_breeds collects the distinct labels
labels, all_breeds = {}, set()
# 8 mins to run with ~2000 images in folder
for json_file in json_files:
    data = load_json_from_gcs(BUCKET_NAME, json_file)
    # the image sits next to its label file: swap the '_labels.json' suffix for '.jpg'
    image_file = json_file.replace('_labels.json', '.jpg')
    api_label = data['api_label']
    labels[image_file] = api_label
    # add breed label to the set; duplicates are ignored because it is a set
    all_breeds.add(api_label)

TOTAL_BREEDS = len(all_breeds)

In [None]:
def save_labels_and_breeds(labels, all_breeds,
                           labels_path='tm_labels.json',
                           breeds_path='tm_all_breeds.json'):
    """Write the labels mapping and the sorted breed list to local JSON files.

    Args:
        labels: Dictionary to serialise (image file name -> label).
        all_breeds: Iterable of breed names; it is written as a sorted list
            so the file content does not depend on set iteration order.
        labels_path: Destination file for ``labels``.
        breeds_path: Destination file for the sorted breed list.

    Returns:
        None. The two files are created or overwritten, and a confirmation
        naming the file actually written is printed for each.
    """
    # Save the labels dictionary to a local JSON file
    with open(labels_path, 'w') as f:
        json.dump(labels, f)
    print(f"labels dictionary saved to {labels_path}")

    # sorted() already returns a new list, so no intermediate list() is needed
    all_breeds_sorted = sorted(all_breeds)

    # Save the sorted breed names to a local JSON file
    with open(breeds_path, 'w') as f:
        json.dump(all_breeds_sorted, f)
    print(f"all_breeds set saved to {breeds_path}")

# Report how many distinct breed labels are in the all_breeds set.
print(f"Total number of breeds: {len(all_breeds)}")

In [76]:
'''
Suppose all_breeds contains: {"Bulldog", "Affenpinscher", "Chihuahua"}
After sorting: ["Affenpinscher", "Bulldog", "Chihuahua"]
'for i, breed in ...' unpacks each pair yielded by enumerate()
The enumerate() returns k,v pairs, e.g.:
(0, "Affenpinscher")
(1, "Bulldog")
(2, "Chihuahua")
The dictionary comprehension '{breed: i for i, breed in ...}' then maps each breed to its index, creating:
{
"Affenpinscher": 0,
"Bulldog": 1,
"Chihuahua": 2
}
'''
# Map each breed name to its position in the alphabetically sorted breed list.
breed_to_index = dict(
    (breed, position) for position, breed in enumerate(sorted(all_breeds))
)
print(f"Total breeds: {TOTAL_BREEDS}")
print(f"First 5 breed indices: {list(breed_to_index.items())[:5]}")

Total breeds: 155
First 5 breed indices: [('affenpinscher', 0), ('african', 1), ('airedale', 2), ('akita', 3), ('appenzeller', 4)]


In [None]:
# Hold out 20% of the labelled image paths for validation; the fixed
# random_state keeps the split repeatable between runs.
from sklearn.model_selection import train_test_split
image_paths = list(labels)
train_files, val_files = train_test_split(image_paths, test_size=0.2, random_state=42)
print(f"Train files: {len(train_files)}, Validation files: {len(val_files)}")


In [None]:
# Build the classifier: an ImageNet-pretrained MobileNetV3Large backbone
# (without its top) followed by a new pooling + dense head with one softmax
# unit per breed.
base_model = MobileNetV3Large(
    include_top=False,
    weights='imagenet',
    input_shape=IMAGE_SIZE + (3,),
)
pooled = layers.GlobalAveragePooling2D()(base_model.output)
hidden = layers.Dense(256, activation='relu')(pooled)
output = layers.Dense(TOTAL_BREEDS, activation='softmax')(hidden)
model = models.Model(inputs=base_model.input, outputs=output)
print("Model created")


In [None]:
# Mark every backbone layer as non-trainable so only the new head is updated
# during the first training phase.
for backbone_layer in base_model.layers:
    setattr(backbone_layer, 'trainable', False)
print("Base model layers frozen")

In [None]:
# Function to load images from GCS
def load_image_from_gcs(file_path):
    """Download a JPEG from the training bucket and resize it for the model.

    Args:
        file_path: Object name of the image inside ``BUCKET_NAME``.

    Returns:
        The image decoded with 3 channels and resized to ``IMAGE_SIZE``,
        as returned by ``tf.image.resize``.
    """
    # client.bucket() creates the handle locally; client.get_bucket() would
    # add a bucket-metadata request for every single image, and this function
    # runs once per image in every training batch.
    blob = client.bucket(BUCKET_NAME).blob(file_path)
    image_bytes = blob.download_as_bytes()
    image = tf.image.decode_jpeg(image_bytes, channels=3)
    image = tf.image.resize(image, IMAGE_SIZE)
    print(f"Loaded image: {file_path}")
    return image

In [None]:
import numpy as np
# Custom data generator
def custom_generator(file_paths, batch_size):
    """Endlessly yield ``(images, one_hot_labels)`` batches loaded from GCS.

    Each batch is drawn with ``np.random.choice`` (which samples with
    replacement by default), so the same path can appear more than once in a
    batch and the generator never terminates; the caller bounds it with
    ``steps_per_epoch`` / ``validation_steps``.

    Args:
        file_paths: Sequence of image object names; each must be a key of
            ``labels``.
        batch_size: Number of images per yielded batch.

    Yields:
        A tuple ``(batch_images, batch_labels)``: the stacked images as a
        NumPy array and the labels one-hot encoded over ``TOTAL_BREEDS``
        classes.
    """
    while True:
        batch_paths = np.random.choice(file_paths, batch_size)
        batch_images = [load_image_from_gcs(path) for path in batch_paths]
        # Do NOT divide by 255 here: Keras MobileNetV3 includes its own input
        # rescaling and expects raw pixel values in the [0, 255] range.
        # Normalising again would feed the backbone near-constant inputs.
        batch_images = np.array(batch_images)
        batch_labels = [breed_to_index[labels[path]] for path in batch_paths]
        print(f"Generated batch of {batch_size} images")
        yield batch_images, tf.keras.utils.to_categorical(batch_labels, TOTAL_BREEDS)

In [None]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
print("Model compiled")

In [None]:
# Train the model
print("Starting model training...")
history = model.fit(
    custom_generator(train_files, BATCH_SIZE),
    steps_per_epoch=len(train_files) // BATCH_SIZE,
    validation_data=custom_generator(val_files, BATCH_SIZE),
    validation_steps=len(val_files) // BATCH_SIZE,
    epochs=10
)
print("Model training completed")


In [None]:
# Fine-tuning
# Unfreeze the last 20 layers of the full model, but keep BatchNormalization
# layers frozen. The model is wired directly onto base_model.output (no
# training=False call), so a trainable BatchNormalization layer would run in
# training mode and overwrite its pretrained moving statistics.
for layer in model.layers[-20:]:
    if isinstance(layer, layers.BatchNormalization):
        continue
    layer.trainable = True
print("Last 20 layers (except BatchNormalization) set to trainable for fine-tuning")

# Recompile so the changed trainable flags take effect; the learning rate is
# 10x lower than in the first phase.
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
print("Model recompiled for fine-tuning")

print("Starting model fine-tuning...")
# The generators are infinite, so steps per epoch are derived from the split sizes.
history_fine = model.fit(
    custom_generator(train_files, BATCH_SIZE),
    steps_per_epoch=len(train_files) // BATCH_SIZE,
    validation_data=custom_generator(val_files, BATCH_SIZE),
    validation_steps=len(val_files) // BATCH_SIZE,
    epochs=5
)
print("Model fine-tuning completed")