In [1]:
import numpy as np
from numpy import array
import pandas as pd
import matplotlib.pyplot as plt 
import string
import os
from PIL import Image
import random, pickle
import glob
from pickle import dump, load
from tqdm.notebook import tqdm
from time import time
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, TimeDistributed, Dense, RepeatVector,\
                         Activation, Flatten, Reshape, concatenate, Dropout, BatchNormalization, Add
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.layers import Bidirectional, add
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tensorflow.keras import Input, layers
from tensorflow.keras import optimizers
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from nltk.tokenize import RegexpTokenizer

In [2]:
def load_and_process_descriptions(desc_file):
    def load_doc(filename):
        with open(filename, 'r') as file:
            return file.read()
    
    def load_descriptions(doc):
        mapping = dict()
        for line in doc.split('\n'):
            if len(line) < 2: continue
            tokens = line.split()
            image_id, image_desc = tokens[0], tokens[1:]
            image_id = image_id.split('.')[0]
            image_desc = ' '.join(image_desc)
            if image_id not in mapping:
                mapping[image_id] = list()
            mapping[image_id].append(image_desc)
        return mapping
    
    def clean_descriptions(descriptions):
        tokenizer = RegexpTokenizer(r'\w+')
        table = str.maketrans('', '', string.punctuation)
        for key, desc_list in descriptions.items():
            for i in range(len(desc_list)):
                desc = desc_list[i]
                desc = tokenizer.tokenize(desc)
                desc = [word.lower() for word in desc]
                desc = [word for word in desc if len(word) > 1]
                desc_list[i] = ' '.join(desc)
    
    doc = load_doc(desc_file)
    descriptions = load_descriptions(doc)
    clean_descriptions(descriptions)
    return descriptions

In [3]:
def get_vocabulary(descriptions):
    tokenizer = RegexpTokenizer(r'\w+')
    all_desc = set()
    for key in descriptions.keys():
        [all_desc.update(tokenizer.tokenize(d)) for d in descriptions[key]]
    return all_desc

# Save descriptions
def save_descriptions(descriptions, filename):
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + ' ' + desc)
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [4]:
def setup_feature_extractor():
    model = InceptionV3(weights='imagenet')
    return Model(model.input, model.layers[-2].output)

def encode_image(image_path, model):
    """Encode single image"""
    def preprocess(image_path):
        img = image.load_img(image_path, target_size=(299, 299))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)
        return preprocess_input(x)
    
    img = preprocess(image_path)
    return model.predict(img, verbose=0)

def encode_images(image_paths, model):
    encoded = {}
    for img_path in tqdm(image_paths, desc="Encoding images"):
        encoded[os.path.basename(img_path)] = encode_image(img_path, model)
    return encoded

In [5]:
def prepare_text_data(descriptions):
    """Create tokenizer and vocabulary"""
    all_captions = []
    for key, val in descriptions.items():
        all_captions.extend(val)
    
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_captions)
    vocab_size = len(tokenizer.word_index) + 1
    max_length = max(len(cap.split()) for cap in all_captions)
    
    return tokenizer, vocab_size, max_length


In [6]:
def encode_images_to_disk(image_paths, model, output_path="image_features.pkl"):
    encoded = {}
    for img_path in tqdm(image_paths, desc="Encoding images"):
        img_id = os.path.basename(img_path)
        feature = encode_image(img_path, model)
        encoded[img_id] = feature.squeeze()  # Flatten to 1D
    with open(output_path, 'wb') as f:
        pickle.dump(encoded, f)
    return encoded

def load_encoded_images(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

In [7]:
class CaptionDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, descriptions, photos, wordtoix, max_length, batch_size=4):
        self.descriptions = {k: descriptions[k] for k in descriptions if k in photos}
        self.photos = photos
        self.wordtoix = wordtoix
        self.max_length = max_length
        self.batch_size = batch_size
        self.keys = list(self.descriptions.keys())

    def __len__(self):
        return int(np.ceil(len(self.keys) / self.batch_size))

    def __getitem__(self, index):
        X1, X2, y = [], [], []
        keys = self.keys[index * self.batch_size:(index + 1) * self.batch_size]
        
        for key in keys:
            photo = self.photos.get(key + '.jpg') or self.photos.get(key)
            if photo is None: continue
            photo = np.array(photo)
            
            for desc in self.descriptions[key]:
                seq = [self.wordtoix[w] for w in desc.split() if w in self.wordtoix]
                for i in range(1, len(seq)):
                    in_seq = pad_sequences([seq[:i]], maxlen=self.max_length)[0]
                    out_seq = seq[i]
                    X1.append(photo)
                    X2.append(in_seq)
                    y.append(out_seq)
                    
        return (np.array(X1), np.array(X2)), np.array(y)  


    
    def on_epoch_end(self):
        np.random.shuffle(self.keys)

In [8]:
def build_caption_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    decoder1 = Add()([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    return model

def train_model(model, data_gen, train_descriptions, epochs=40, batch_size=6):
    steps_per_epoch = len(train_descriptions) // batch_size
    history = model.fit(
        data_gen,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        verbose=1
    )
    return history

In [9]:
def generate_caption(model, photo, wordtoix, ixtoword, max_length):
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = [wordtoix[w] for w in in_text.split() if w in wordtoix]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = ixtoword.get(yhat)
        if word is None or word == 'endseq':
            break
        in_text += ' ' + word
    return in_text.replace('startseq', '').replace('endseq', '').strip()

In [10]:
def prepare_text_data(descriptions, tokenizer_save_path="tokenizer.pkl"):
    """Create tokenizer and vocabulary"""
    all_captions = []
    for key, val in descriptions.items():
        all_captions.extend(val)
    
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_captions)
    vocab_size = len(tokenizer.word_index) + 1
    max_length = max(len(cap.split()) for cap in all_captions)
    
    # Save tokenizer to file
    with open(tokenizer_save_path, 'wb') as f:
        pickle.dump(tokenizer, f)
    
    return tokenizer, vocab_size, max_length

TypeError: Missing required positional argument

In [17]:
def run_image_captioning_pipeline():
    desc_file = "/home/abdelraheem/Abdo_Omda/Nlp/Flickr8k.token.txt"
    train_file = "/home/abdelraheem/Abdo_Omda/Nlp/Flickr_8k.trainImages.txt"
    image_dir = "/home/abdelraheem/Abdo_Omda/Nlp/Flicker_dataset/Images"
    encoded_features_path = "/home/abdelraheem/Abdo_Omda/Nlp/encoded_train_features.pkl"

    descriptions = load_and_process_descriptions(desc_file)
    train_names_full = open(train_file).read().strip().split('\n')
    train_names = [os.path.splitext(name)[0] for name in train_names_full]
    train_img_paths = [os.path.join(image_dir, name) for name in train_names_full]

    if not os.path.exists(encoded_features_path):
        feature_model = setup_feature_extractor()
        encode_images_to_disk(train_img_paths, feature_model, encoded_features_path)
    
    encoded_train = load_encoded_images(encoded_features_path)
    
    tokenizer, vocab_size, max_length = prepare_text_data(descriptions, tokenizer_save_path="tokenizer.pkl")
    wordtoix = tokenizer.word_index
    ixtoword = {v: k for k, v in wordtoix.items()}
    
    train_descriptions = {k: descriptions[k] for k in train_names if k in descriptions}
    train_features = {k: encoded_train[k + '.jpg'] for k in train_names if k + '.jpg' in encoded_train}
    
    valid_keys = set(train_descriptions.keys()) & set(train_features.keys())
    print(f"Valid keys for training: {len(valid_keys)}")
    
    train_descriptions = {k: train_descriptions[k] for k in valid_keys}
    train_features = {k: train_features[k] for k in valid_keys}
    
    data_gen = CaptionDataGenerator(train_descriptions, train_features, wordtoix, max_length, batch_size=6)

    # ✅ Fix: define the model before training
    model = build_caption_model(vocab_size, max_length)

    model.fit(data_gen, epochs=20)
    model.save('/home/abdelraheem/Abdo_Omda/Nlp/model_weights/final_model.h5')

    feature_model = setup_feature_extractor()
    test_img = "/home/abdelraheem/Abdo_Omda/Nlp/Flicker_dataset/Images/667626_18933d713e.jpg"
    test_feat = encode_image(test_img, feature_model).squeeze()
    caption = generate_caption(model, np.expand_dims(test_feat, axis=0), wordtoix, ixtoword, max_length)

    plt.imshow(plt.imread(test_img))
    plt.axis('off')
    plt.title("Generated: " + caption)
    plt.show()
    print("Generated Caption:", caption)

run_image_captioning_pipeline()

In [36]:
from tensorflow.keras.models import load_model
import numpy as np

model_path = "/home/abdelraheem/Abdo_Omda/Nlp/model_weights/final_model.h5"
test_img = "/home/abdelraheem/Abdo_Omda/Nlp/Flicker_dataset/Images/101654506_8eb26cfb60.jpg"
if 'model' not in globals():
    model = load_model(model_path, compile=False)
    feature_model = setup_feature_extractor()

def generate_caption_for_image(img_path):
    test_feat = encode_image(img_path, feature_model).squeeze()
    caption = generate_caption(model, np.expand_dims(test_feat, axis=0), wordtoix, ixtoword, max_length)
    return caption

caption = generate_caption_for_image(test_img)
print("Generated Caption:", caption)


Generated Caption: dog is running through the snow of the grass in the grass in the grass with the grass in the grass and white dog is running through the snow of the grass in the
