In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from google.colab import files, drive
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

# 1. Mount Drive and setup Path
drive.mount('/content/drive')
SAVE_PATH = '/content/drive/MyDrive/Flickr_Project/'
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

# 2. Dataset Download
if not os.path.exists('kaggle.json'):
    files.upload() # Upload your kaggle.json here
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d adityajn105/flickr8k
!unzip -q flickr8k.zip -d flickr_data

# 3. Process Captions
df = pd.read_csv('flickr_data/captions.txt')
def clean_text(text):
    text = text.lower()
    text = "".join([char for char in text if char.isalpha() or char.isspace()])
    text = " ".join(text.split())
    return f"startseq {text} endseq"

df['caption'] = df['caption'].apply(clean_text)

# 4. Create Tokenizer (The Dictib onary)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['caption'].tolist())
vocab_size = len(tokenizer.word_index) + 1
max_length = max(len(c.split()) for c in df['caption'].tolist())

# 5. Save Tokenizer immediately
with open(SAVE_PATH + 'tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print(f"✅ Dataset Ready! Vocab Size: {vocab_size}, Max Length: {max_length}")

Mounted at /content/drive
Dataset URL: https://www.kaggle.com/datasets/adityajn105/flickr8k
License(s): CC0-1.0
Downloading flickr8k.zip to /content
 99% 1.02G/1.04G [00:11<00:00, 90.8MB/s]
100% 1.04G/1.04G [00:11<00:00, 99.3MB/s]
✅ Dataset Ready! Vocab Size: 8781, Max Length: 37


In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Add, GlobalAveragePooling2D
from tensorflow.keras.applications import MobileNetV2

# ENCODER (Image Features)
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False
image_input = Input(shape=(224, 224, 3))
x = base_model(image_input)
x = GlobalAveragePooling2D()(x)
image_features = Dense(256, activation='relu')(x)

# DECODER (Sequence Processing)
caption_input = Input(shape=(max_length,))
y = Embedding(vocab_size, 256, mask_zero=False)(caption_input)
y = LSTM(256)(y)

# MERGE
decoder = Add()([image_features, y])
decoder = Dense(256, activation='relu')(decoder)
output = Dense(vocab_size, activation='softmax')(decoder)

model = Model(inputs=[image_input, caption_input], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam')

print("✅ Model Architecture Built & Compiled.")

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
[1m9406464/9406464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
✅ Model Architecture Built & Compiled.


In [3]:
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.keras.preprocessing.image import load_img, img_to_array

class FlickrGenerator(Sequence):
    def __init__(self, df, tokenizer, max_length, vocab_size, batch_size=32):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.img_dir = 'flickr_data/Images'

    def __len__(self):
        return len(self.df) // self.batch_size

    def __getitem__(self, index):
        batch = self.df.iloc[index * self.batch_size : (index + 1) * self.batch_size]
        X_img, X_txt, y_label = [], [], []
        for _, row in batch.iterrows():
            img_path = os.path.join(self.img_dir, row['image'])
            img = load_img(img_path, target_size=(224, 224))
            img = img_to_array(img) / 255.0
            seq = self.tokenizer.texts_to_sequences([row['caption']])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=self.max_length, padding='post')[0]
                out_seq = to_categorical([out_seq], num_classes=self.vocab_size)[0]
                X_img.append(img)
                X_txt.append(in_seq)
                y_label.append(out_seq)
        return (np.array(X_img), np.array(X_txt)), np.array(y_label)

# 1. Initialize generator
generator = FlickrGenerator(df, tokenizer, max_length, vocab_size, batch_size=32)

# 2. Train (20 Epochs for good results)
print("⏳ Training starting...")
model.fit(generator, epochs=20, verbose=1)

# 3. Save to Drive Folder
model.save(SAVE_PATH + 'flickr8k_model.keras')
print(f"✅ Model and Tokenizer are now safely in: {SAVE_PATH}")

⏳ Training starting...


  self._warn_if_super_not_called()


Epoch 1/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1079s[0m 846ms/step - loss: 5.4735
Epoch 2/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m769s[0m 608ms/step - loss: 4.9500
Epoch 3/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m768s[0m 608ms/step - loss: 4.7926
Epoch 4/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m769s[0m 608ms/step - loss: 4.6794
Epoch 5/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m768s[0m 607ms/step - loss: 4.5897
Epoch 6/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m768s[0m 607ms/step - loss: 4.3332
Epoch 7/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m767s[0m 607ms/step - loss: 3.1557
Epoch 8/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m768s[0m 607ms/step - loss: 2.7789
Epoch 9/20
[1m1264/1264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m768s[0m 608ms/step - loss: 2.5712
Epoch 10/20
[1m1264/1264[0m [32m━