In [3]:
import os
import json
import numpy as np
from PIL import Image
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

# Paths
images_dir = '/home/raichuboy/Projects/meme_caption/memes'  # Directory containing images
train_captions_file = '/home/raichuboy/Projects/meme_caption/memes-trainval.json'  # JSON file with training meme data
new_captions_file = 'home/raichuboy/Projects/meme_caption/mlr_captioning_TEST.json'  # JSON file with new image data

# Load training captions
with open(train_captions_file, 'r') as f:
    train_captions_data = json.load(f)

# Extract relevant data for training
train_img_captions_data = {}
for item in train_captions_data:
    img_fname = item['img_fname']
    img_captions = item['img_captions']
    train_img_captions_data[img_fname] = ['<start> ' + cap + ' <end>' for cap in img_captions]

# Prepare tokenizer on the training data
all_captions = [cap for captions in train_img_captions_data.values() for cap in captions]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

# Prepare image feature extractor
def preprocess_image(img_path):
    img = Image.open(img_path)
    img = img.resize((299, 299))
    img = np.array(img)
    img = np.expand_dims(img, axis=0)
    img = img / 127.5 - 1.0
    return img

inception = InceptionV3(weights='imagenet')
model_new = Model(inception.input, inception.layers[-2].output)

# Extract features for all images
def encode_images(images_dir, model):
    encoded_images = {}
    for img_name in os.listdir(images_dir):
        img_path = os.path.join(images_dir, img_name)
        img = preprocess_image(img_path)
        feature_vector = model.predict(img)
        encoded_images[img_name] = feature_vector.flatten()
    return encoded_images

encoded_images = encode_images(images_dir, model_new)

# Prepare sequences for training
max_length = max(len(caption.split()) for caption in all_captions)

def create_sequences(tokenizer, max_length, img_captions_data, encoded_images):
    X1, X2, y = [], [], []
    for img_name, captions in img_captions_data.items():
        for caption in captions:
            seq = tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(encoded_images[img_name])
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

X1, X2, y = create_sequences(tokenizer, max_length, train_img_captions_data, encoded_images)

# Define the model
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer=Adam())
    return model

model = define_model(vocab_size, max_length)

# Train the model
model.fit([X1, X2], y, epochs=20, verbose=2)

# Caption generation function
def generate_caption(model, tokenizer, photo, max_length):
    in_text = '<start>'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word[yhat]
        if word is None:
            break
        in_text += ' ' + word
        if word == '<end>':
            break
    return in_text

# Load new data and generate captions
with open(new_captions_file, 'r') as f:
    new_captions_data = json.load(f)

def generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length):
    results = []
    for item in new_captions_data:
        img_fname = item['img_fname']
        img_path = os.path.join(images_dir, img_fname)
        image_feature = model_new.predict(preprocess_image(img_path)).flatten().reshape((1, 2048))
        caption = generate_caption(model, tokenizer, image_feature, max_length)
        results.append({
            "img_fname": img_fname,
            "generated_caption": caption
        })
    return results

# Generate captions for the new dataset
generated_captions = generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length)

# Print or save the generated captions
for item in generated_captions:
    print(f"Image: {item['img_fname']} - Caption: {item['generated_caption']}")
jbjb

2024-06-03 00:52:40.565981: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels.h5


2024-06-03 00:53:05.341922: I tensorflow/core/common_runtime/executor.cc:1197] [/job:localhost/replica:0/task:0/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: input depth must be evenly divisible by filter depth: 4 vs 3
	 [[{{node model/activation/Relu}}]]


InvalidArgumentError: Graph execution error:

Detected at node 'model/activation/Relu' defined at (most recent call last):
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/home/raichuboy/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/home/raichuboy/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/home/raichuboy/.local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 728, in start
      self.io_loop.start()
    File "/home/raichuboy/.local/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 195, in start
      self.asyncio_loop.run_forever()
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
      self._run_once()
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
      handle._run()
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/home/raichuboy/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 513, in dispatch_queue
      await self.process_one()
    File "/home/raichuboy/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 502, in process_one
      await dispatch(*args)
    File "/home/raichuboy/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 409, in dispatch_shell
      await result
    File "/home/raichuboy/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "/home/raichuboy/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "/home/raichuboy/.local/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3051, in run_cell
      result = self._run_cell(
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3106, in _run_cell
      result = runner(coro)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3311, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3493, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_247680/1763543474.py", line 57, in <module>
      encoded_images = encode_images(images_dir, model_new)
    File "/tmp/ipykernel_247680/1763543474.py", line 53, in encode_images
      feature_vector = model.predict(img)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/engine/training.py", line 2382, in predict
      tmp_batch_outputs = self.predict_function(iterator)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/engine/training.py", line 2169, in predict_function
      return step_function(self, iterator)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/engine/training.py", line 2155, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/engine/training.py", line 2143, in run_step
      outputs = model.predict_step(data)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/engine/training.py", line 2111, in predict_step
      return self(x, training=False)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/engine/training.py", line 558, in __call__
      return super().__call__(*args, **kwargs)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1145, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/engine/functional.py", line 512, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/engine/functional.py", line 669, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1145, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/layers/core/activation.py", line 59, in call
      return self.activation(inputs)
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/activations.py", line 317, in relu
      return backend.relu(
    File "/home/raichuboy/anaconda3/envs/ml/lib/python3.10/site-packages/keras/backend.py", line 5396, in relu
      x = tf.nn.relu(x)
Node: 'model/activation/Relu'
input depth must be evenly divisible by filter depth: 4 vs 3
	 [[{{node model/activation/Relu}}]] [Op:__inference_predict_function_8981]

In [None]:
import os
import json
import numpy as np
from PIL import Image
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

# Paths
images_dir = '/home/raichuboy/Projects/meme_caption/memes'  # Directory containing images
train_captions_file = '/home/raichuboy/Projects/meme_caption/memes-trainval.json'  # JSON file with training meme data
new_captions_file = 'home/raichuboy/Projects/meme_caption/mlr_captioning_TEST.json'  # JSON file with new image data

# Load training captions
with open(train_captions_file, 'r') as f:
    train_captions_data = json.load(f)

# Extract relevant data for training
train_img_captions_data = {}
for item in train_captions_data:
    img_fname = item['img_fname']
    img_captions = item['img_captions']
    train_img_captions_data[img_fname] = ['<start> ' + cap + ' <end>' for cap in img_captions]

# Prepare tokenizer on the training data
all_captions = [cap for captions in train_img_captions_data.values() for cap in captions]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

# Prepare image feature extractor
def preprocess_image(img_path):
    img = Image.open(img_path)
    # Convert image to RGB if it's not
    if img.mode != 'RGB':
        img = img.convert('RGB')
    img = img.resize((299, 299))
    img = np.array(img)
    img = np.expand_dims(img, axis=0)
    img = img / 127.5 - 1.0
    return img

inception = InceptionV3(weights='imagenet')
model_new = Model(inception.input, inception.layers[-2].output)

# Extract features for all images
def encode_images(images_dir, model):
    encoded_images = {}
    for img_name in os.listdir(images_dir):
        img_path = os.path.join(images_dir, img_name)
        img = preprocess_image(img_path)
        feature_vector = model.predict(img)
        encoded_images[img_name] = feature_vector.flatten()
    return encoded_images

encoded_images = encode_images(images_dir, model_new)

# Prepare sequences for training
max_length = max(len(caption.split()) for caption in all_captions)

def create_sequences(tokenizer, max_length, img_captions_data, encoded_images):
    X1, X2, y = [], [], []
    for img_name, captions in img_captions_data.items():
        for caption in captions:
            seq = tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(encoded_images[img_name])
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

X1, X2, y = create_sequences(tokenizer, max_length, train_img_captions_data, encoded_images)

# Define the model
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)
    
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer=Adam())
    return model

model = define_model(vocab_size, max_length)

# Train the model
model.fit([X1, X2], y, epochs=20, verbose=2)

# Caption generation function
def generate_caption(model, tokenizer, photo, max_length):
    in_text = '<start>'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word[yhat]
        if word is None:
            break
        in_text += ' ' + word
        if word == '<end>':
            break
    return in_text

# Load new data and generate captions
with open(new_captions_file, 'r') as f:
    new_captions_data = json.load(f)

def generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length):
    results = []
    for item in new_captions_data:
        img_fname = item['img_fname']
        img_path = os.path.join(images_dir, img_fname)
        image_feature = model_new.predict(preprocess_image(img_path)).flatten().reshape((1, 2048))
        caption = generate_caption(model, tokenizer, image_feature, max_length)
        results.append({
            "img_fname": img_fname,
            "generated_caption": caption
        })
    return results

# Generate captions for the new dataset
generated_captions = generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length)

# Print or save the generated captions
for item in generated_captions:
    print(f"Image: {item['img_fname']} - Caption: {item['generated_caption']}")
jbj

2024-06-03 01:51:19.722918: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-03 01:51:21.975441: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.








In [None]:
import os
import json
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam


# Paths
images_dir = '/home/raichuboy/Projects/meme_caption/memes'  # Directory containing images
train_captions_file = '/home/raichuboy/Projects/meme_caption/memes-trainval.json'  # JSON file with training meme data
new_captions_file = 'home/raichuboy/Projects/meme_caption/mlr_captioning_TEST.json'  # JSON file with new image data

# Load training captions
with open(train_captions_file, 'r') as f:
    train_captions_data = json.load(f)

# Extract relevant data for training
train_img_captions_data = {}
for item in train_captions_data:
    img_fname = item['img_fname']
    img_captions = item['img_captions']
    train_img_captions_data[img_fname] = ['<start> ' + cap + ' <end>' for cap in img_captions]

# Prepare tokenizer on the training data
all_captions = [cap for captions in train_img_captions_data.values() for cap in captions]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

# Prepare image feature extractor
def preprocess_image(img_path):
    img = Image.open(img_path)
    if img.mode == 'P':
        img = img.convert('RGBA')
    if img.mode == 'RGBA':
        img = img.convert('RGB')
    elif img.mode != 'RGB':
        img = img.convert('RGB')
    img = img.resize((299, 299))
    img = np.array(img)
    img = np.expand_dims(img, axis=0)
    img = img / 127.5 - 1.0
    return img

inception = InceptionV3(weights='imagenet')
model_new = Model(inception.input, inception.layers[-2].output)

# Extract features for all images
def encode_images(images_dir, model):
    encoded_images = {}
    for img_name in os.listdir(images_dir):
        img_path = os.path.join(images_dir, img_name)
        img = preprocess_image(img_path)
        feature_vector = model.predict(img)
        encoded_images[img_name] = feature_vector.flatten()
    return encoded_images

encoded_images = encode_images(images_dir, model_new)

# Prepare sequences for training
max_length = max(len(caption.split()) for caption in all_captions)

def create_sequences(tokenizer, max_length, img_captions_data, encoded_images):
    X1, X2, y = [], [], []
    for img_name, captions in img_captions_data.items():
        for caption in captions:
            seq = tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(encoded_images[img_name])
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

X1, X2, y = create_sequences(tokenizer, max_length, train_img_captions_data, encoded_images)

# Define the model
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax', dtype='float32')(decoder2)  # Use 'float32' for mixed precision
    
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer=Adam())
    return model

model = define_model(vocab_size, max_length)

# Convert the training data to a tf.data dataset for efficient loading
train_dataset = tf.data.Dataset.from_tensor_slices(((X1, X2), y)).shuffle(buffer_size=1024).batch(64).prefetch(tf.data.AUTOTUNE)

# Train the model
model.fit(train_dataset, epochs=20, verbose=2)

# Caption generation function
def generate_caption(model, tokenizer, photo, max_length):
    in_text = '<start>'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None:
            break
        in_text += ' ' + word
        if word == '<end>':
            break
    return in_text.replace('<start>', '').replace('<end>', '').strip()

# Load new data and generate captions
with open(new_captions_file, 'r') as f:
    new_captions_data = json.load(f)

def generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length):
    results = []
    for item in new_captions_data:
        post_id = item['post_id']
        img_fname = item['img_fname']
        img_path = os.path.join(images_dir, img_fname)
        image_feature = model_new.predict(preprocess_image(img_path)).flatten().reshape((1, 2048))
        caption = generate_caption(model, tokenizer, image_feature, max_length)
        results.append({
            "post_id": post_id,
            "meme_caption": caption
        })
    return results

# Generate captions for the new dataset
generated_captions = generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length)

# Print or save the generated captions
for item in generated_captions:
    print(f"Post ID: {item['post_id']} - Meme Caption: {item['meme_caption']}")
bbikn

2024-06-03 02:45:08.040695: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.




In [None]:
import os
import json
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Paths
images_dir = '/home/raichuboy/Projects/meme_caption/memes'
train_captions_file = '/home/raichuboy/Projects/meme_caption/memes-trainval.json'
new_captions_file = 'home/raichuboy/Projects/meme_caption/mlr_captioning_TEST.json'

# Load training captions
with open(train_captions_file, 'r') as f:
    train_captions_data = json.load(f)

# Extract relevant data for training
train_img_captions_data = {}
for item in train_captions_data:
    img_fname = item['img_fname']
    img_captions = item['img_captions']
    metaphors = item.get('metaphors', [])  # Check if 'metaphors' key exists
    metaphor_meanings = {meta['metaphor']: meta['meaning'] for meta in metaphors}
    train_img_captions_data[img_fname] = {
        'captions': ['<start> ' + cap + ' <end>' for cap in img_captions],
        'metaphors': metaphor_meanings
    }

# Prepare tokenizer on the training data
all_captions = [cap for item in train_img_captions_data.values() for cap in item['captions']]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

# Prepare image feature extractor
def preprocess_image(img_path):
    img = Image.open(img_path).convert('RGB').resize((299, 299))
    img = np.array(img) / 255.0
    return img

resnet = ResNet50(include_top=False, weights='imagenet', input_shape=(299, 299, 3))
model_new = Model(resnet.input, resnet.layers[-1].output)

# Extract features for all images
def encode_images(images_dir, model):
    encoded_images = {}
    for img_name in os.listdir(images_dir):
        img_path = os.path.join(images_dir, img_name)
        img = preprocess_image(img_path)
        feature_vector = model.predict(img.reshape((1, 299, 299, 3)))
        encoded_images[img_name] = feature_vector.flatten()
    return encoded_images

encoded_images = encode_images(images_dir, model_new)

# Prepare sequences for training
max_length = max(len(caption.split()) for caption in all_captions)

def create_sequences(tokenizer, max_length, img_captions_data, encoded_images):
    X1, X2, y = [], [], []
    for img_name, data in img_captions_data.items():
        for caption in data['captions']:
            seq = tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(encoded_images[img_name])
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

X1, X2, y = create_sequences(tokenizer, max_length, train_img_captions_data, encoded_images)

# Define the model
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax', dtype='float32')(decoder2)
    
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))
    return model

model = define_model(vocab_size, max_length)

# Convert the training data to a tf.data dataset for efficient loading
train_dataset = tf.data.Dataset.from_tensor_slices(((X1, X2), y)).shuffle(buffer_size=1024).batch(64).prefetch(tf.data.AUTOTUNE)

# Train the model with early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(train_dataset, epochs=10, verbose=2, callbacks=[early_stopping])

# Caption generation function with metaphor meanings
def generate_caption(model, tokenizer, photo, max_length, metaphor_meanings):
    in_text = '<start>'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None:
            break
        if word in metaphor_meanings:
            word = metaphor_meanings[word]  # Replace metaphor with its meaning
        in_text += ' ' + word
        if word == '<end>':
            break
    return in_text.replace('<start>', '').replace('<end>', '').strip()

# Load new data and generate captions
with open(new_captions_file, 'r') as f:
    new_captions_data = json.load(f)

def generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length):
    results = []
    for item in new_captions_data:
        post_id = item['post_id']
        img_fname = item['img_fname']
        img_path = os.path.join(images_dir, img_fname)
        image_feature = model_new.predict(preprocess_image(img_path).reshape((1, 299, 299, 3))).flatten().reshape((1, 2048))
        data = train_img_captions_data.get(img_fname, {})
        metaphor_meanings = data.get('metaphors', {})  # Extract metaphor meanings from input JSON
        caption = generate_caption(model, tokenizer, image_feature, max_length, metaphor_meanings)
        results.append({
            "post_id": post_id,
            "meme_caption": caption
        })
    return results

# Generate captions for the new dataset
generated_captions = generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length)

# Print or save the generated captions
for item in generated_captions:
    print(f"Post ID: {item['post_id']} - Meme Caption: {item['meme_caption']}")jbj




In [None]:
import os
import json
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow_model_optimization.quantization.keras import quantize_annotate_layer, quantize_apply

# Paths
images_dir = '/home/raichuboy/Projects/meme_caption/memes'
train_captions_file = '/home/raichuboy/Projects/meme_caption/memes-trainval.json'
new_captions_file = '/home/raichuboy/Projects/meme_caption/mlr_captioning_TEST.json'

# Load training captions
with open(train_captions_file, 'r') as f:
    train_captions_data = json.load(f)

# Extract relevant data for training
train_img_captions_data = {}
for item in train_captions_data:
    img_fname = item['img_fname']
    img_captions = item['img_captions']
    metaphors = item.get('metaphors', [])  # Check if 'metaphors' key exists
    metaphor_meanings = {meta['metaphor']: meta['meaning'] for meta in metaphors}
    train_img_captions_data[img_fname] = {
        'captions': ['<start> ' + cap + ' <end>' for cap in img_captions],
        'metaphors': metaphor_meanings
    }

# Prepare tokenizer on the training data
all_captions = [cap for item in train_img_captions_data.values() for cap in item['captions']]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

# Prepare image feature extractor
def preprocess_image(img_path):
    img = Image.open(img_path).convert('RGB').resize((299, 299))
    img = np.array(img) / 255.0
    return img

resnet = ResNet50(include_top=False, weights='imagenet', input_shape=(299, 299, 3))
model_new = Model(resnet.input, resnet.layers[-1].output)

# Extract features for all images
def encode_images(images_dir, model):
    encoded_images = {}
    for img_name in os.listdir(images_dir):
        img_path = os.path.join(images_dir, img_name)
        img = preprocess_image(img_path)
        feature_vector = model.predict(img.reshape((1, 299, 299, 3)))
        encoded_images[img_name] = feature_vector.flatten()
    return encoded_images

encoded_images = encode_images(images_dir, model_new)

# Prepare sequences for training
max_length = max(len(caption.split()) for caption in all_captions)

def create_sequences(tokenizer, max_length, img_captions_data, encoded_images):
    X1, X2, y = [], [], []
    for img_name, data in img_captions_data.items():
        for caption in data['captions']:
            seq = tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(encoded_images[img_name])
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

X1, X2, y = create_sequences(tokenizer, max_length, train_img_captions_data, encoded_images)

# Define the model with quantization aware layers
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax', dtype='float32')(decoder2)
    
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    return model

# Annotate layers for quantization
def apply_quantization_to_dense(layer):
    if isinstance(layer, Dense):
        return quantize_annotate_layer(layer)
    return layer

model = define_model(vocab_size, max_length)
annotated_model = tf.keras.models.clone_model(model, clone_function=apply_quantization_to_dense)

# Apply quantization
quant_aware_model = quantize_apply(annotated_model)
quant_aware_model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))

# Convert the training data to a tf.data dataset for efficient loading
train_dataset = tf.data.Dataset.from_tensor_slices(((X1, X2), y)).shuffle(buffer_size=1024).batch(64).prefetch(tf.data.AUTOTUNE)

# Train the quantized model with early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
quant_aware_model.fit(train_dataset, epochs=20, verbose=2, callbacks=[early_stopping])

# Caption generation function with metaphor meanings
def generate_caption(model, tokenizer, photo, max_length, metaphor_meanings):
    in_text = '<start>'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None:
            break
        if word in metaphor_meanings:
            word = metaphor_meanings[word]  # Replace metaphor with its meaning
        in_text += ' ' + word
        if word == '<end>':
            break
    return in_text.replace('<start>', '').replace('<end>', '').strip()

# Load new data and generate captions
with open(new_captions_file, 'r') as f:
    new_captions_data = json.load(f)

def generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length):
    results = []
    for item in new_captions_data:
        post_id = item['post_id']
        img_fname = item['img_fname']
        img_path = os.path.join(images_dir, img_fname)
        image_feature = model_new.predict(preprocess_image(img_path).reshape((1, 299, 299, 3))).flatten().reshape((1, 2048))
        data = train_img_captions_data.get(img_fname, {})
        metaphor_meanings = data.get('metaphors', {})  # Extract metaphor meanings from input JSON
        caption = generate_caption(model, tokenizer, image_feature, max_length, metaphor_meanings)
        results.append({
            "post_id": post_id,
            "meme_caption": caption
        })
    return results

# Generate captions for the new dataset
generated_captions = generate_captions_for_new_data(new_captions_data, quant_aware_model, tokenizer, max_length)

# Print or save the generated captions
for item in generated_captions:
    print(f"Post ID: {item['post_id']} - Meme Caption: {item['meme_caption']}")yyc


2024-06-03 04:26:43.329660: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.




In [None]:
import os
import json
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow_model_optimization.quantization.keras import quantize_annotate_layer, quantize_apply

# Paths
images_dir = '/home/raichuboy/Projects/meme_caption/memes'
train_captions_file = '/home/raichuboy/Projects/meme_caption/memes-trainval.json'
new_captions_file = '/home/raichuboy/Projects/meme_caption/mlr_captioning_TEST.json'

# Load training captions
with open(train_captions_file, 'r') as f:
    train_captions_data = json.load(f)

# Extract relevant data for training
train_img_captions_data = {}
for item in train_captions_data:
    img_fname = item['img_fname']
    img_captions = item['img_captions']
    metaphors = item.get('metaphors', [])  # Check if 'metaphors' key exists
    metaphor_meanings = {meta['metaphor']: meta['meaning'] for meta in metaphors}
    train_img_captions_data[img_fname] = {
        'captions': ['<start> ' + cap + ' <end>' for cap in img_captions],
        'metaphors': metaphor_meanings
    }
    
# Prepare tokenizer on the training data
all_captions = [cap for item in train_img_captions_data.values() for cap in item['captions']]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

# Prepare image feature extractor
def preprocess_image(img_path):
    img = Image.open(img_path).convert('RGB').resize((299, 299))
    img = np.array(img) / 255.0
    return img

resnet = ResNet50(include_top=False, weights='imagenet', input_shape=(299, 299, 3))

# Unfreeze the top layers of the model
for layer in resnet.layers[-10:]:
    layer.trainable = True
model_new = Model(resnet.input, resnet.layers[-1].output)

# Extract features for all images in batches
def encode_images(images_dir, model, batch_size=32):
    encoded_images = {}
    image_list = os.listdir(images_dir)
    for start in range(0, len(image_list), batch_size):
        end = start + batch_size
        batch_images = image_list[start:end]
        img_batch = np.array([preprocess_image(os.path.join(images_dir, img)) for img in batch_images])
        feature_vectors = model.predict(img_batch)
        for img, feature_vector in zip(batch_images, feature_vectors):
            encoded_images[img] = feature_vector.flatten()
    return encoded_images

encoded_images = encode_images(images_dir, model_new)

# Prepare sequences for training
max_length = max(len(caption.split()) for caption in all_captions)

def create_sequences(tokenizer, max_length, img_captions_data, encoded_images):
    X1, X2, y = [], [], []
    for img_name, data in img_captions_data.items():
        for caption in data['captions']:
            seq = tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(encoded_images[img_name])
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

X1, X2, y = create_sequences(tokenizer, max_length, train_img_captions_data, encoded_images)

# Define the model
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax', dtype='float32')(decoder2)
    
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    return model

model = define_model(vocab_size, max_length)

# Annotate layers for quantization
def apply_quantization_to_dense(layer):
    if isinstance(layer, Dense):
        return quantize_annotate_layer(layer)
    return layer

annotated_model = tf.keras.models.clone_model(model, clone_function=apply_quantization_to_dense)

# Apply quantization
quant_aware_model = quantize_apply(annotated_model)
quant_aware_model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))

# Define a learning rate scheduler
def lr_scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

# Convert the training data to a tf.data dataset for efficient loading
train_dataset = tf.data.Dataset.from_tensor_slices(((X1, X2), y)).shuffle(buffer_size=1024).batch(128).prefetch(tf.data.AUTOTUNE)

# Train the quantized model with early stopping and learning rate scheduling
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_callback = LearningRateScheduler(lr_scheduler)
quant_aware_model.fit(train_dataset, epochs=10, verbose=2, callbacks=[early_stopping, lr_callback])

# Caption generation function with metaphor meanings
def generate_caption(model, tokenizer, photo, max_length, metaphor_meanings):
    in_text = '<start>'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None:
            break
        if word in metaphor_meanings:
            word = metaphor_meanings[word]  # Replace metaphor with its meaning
        in_text += ' ' + word
        if word == '<end>':
            break
    return in_text.replace('<start>', '').replace('<end>', '').strip()
    
# Load new data and generate captions
with open(new_captions_file, 'r') as f:
    new_captions_data = json.load(f)

def generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length, batch_size=32):
    results = []
    image_features = {}
    for item in new_captions_data:
        post_id = item['post_id']
        img_fname = item['img_fname']
        if img_fname not in image_features:
            img_path = os.path.join(images_dir, img_fname)
            image_feature = model_new.predict(preprocess_image(img_path).reshape((1, 299, 299, 3))).flatten().reshape((1, 2048))
            image_features[img_fname] = image_feature
        data = train_img_captions_data.get(img_fname, {})
        metaphor_meanings = data.get('metaphors', {})  # Extract metaphor meanings from input JSON
        caption = generate_caption(model, tokenizer, image_features[img_fname], max_length, metaphor_meanings)
        results.append({
            "post_id": post_id,
            "meme_caption": caption
        })
    return results

# Generate captions for the new dataset
generated_captions = generate_captions_for_new_data(new_captions_data, quant_aware_model, tokenizer, max_length)

# Print or save the generated captions
for item in generated_captions:
    print(f"Post ID: {item['post_id']} - Meme Caption: {item['meme_caption']}")




In [None]:
import os
import json
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow_model_optimization.quantization.keras import quantize_annotate_layer, quantize_apply

# Paths
images_dir = '/home/raichuboy/Projects/meme_caption/memes'
train_captions_file = '/home/raichuboy/Projects/meme_caption/memes-trainval.json'
new_captions_file = '/home/raichuboy/Projects/meme_caption/mlr_captioning_TEST.json'

# Load training captions
with open(train_captions_file, 'r') as f:
    train_captions_data = json.load(f)

# Extract relevant data for training
train_img_captions_data = {}
for item in train_captions_data:
    img_fname = item['img_fname']
    img_captions = item['img_captions']
    metaphors = item.get('metaphors', [])  # Check if 'metaphors' key exists
    metaphor_meanings = {meta['metaphor']: meta['meaning'] for meta in metaphors}
    train_img_captions_data[img_fname] = {
        'captions': ['<start> ' + cap + ' <end>' for cap in img_captions],
        'metaphors': metaphor_meanings
    }

# Prepare tokenizer on the training data
all_captions = [cap for item in train_img_captions_data.values() for cap in item['captions']]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

# Prepare image feature extractor
def preprocess_image(img_path):
    img = Image.open(img_path).convert('RGB').resize((299, 299))
    img = np.array(img) / 255.0
    return img

resnet = ResNet50(include_top=False, weights='imagenet', input_shape=(299, 299, 3))

# Unfreeze the top layers of the model
for layer in resnet.layers[-10:]:
    layer.trainable = True
model_new = Model(resnet.input, resnet.layers[-1].output)

# Extract features for all images in batches
def encode_images(images_dir, model, batch_size=32):
    encoded_images = {}
    image_list = os.listdir(images_dir)
    for start in range(0, len(image_list), batch_size):
        end = start + batch_size
        batch_images = image_list[start:end]
        img_batch = np.array([preprocess_image(os.path.join(images_dir, img)) for img in batch_images])
        feature_vectors = model.predict(img_batch)
        for img, feature_vector in zip(batch_images, feature_vectors):
            encoded_images[img] = feature_vector.flatten()
    return encoded_images

encoded_images = encode_images(images_dir, model_new)

# Prepare sequences for training
max_length = max(len(caption.split()) for caption in all_captions)

def create_sequences(tokenizer, max_length, img_captions_data, encoded_images):
    X1, X2, y = [], [], []
    for img_name, data in img_captions_data.items():
        for caption in data['captions']:
            seq = tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(encoded_images[img_name])
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

X1, X2, y = create_sequences(tokenizer, max_length, train_img_captions_data, encoded_images)

# Define the model
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax', dtype='float32')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    return model

model = define_model(vocab_size, max_length)

# Annotate layers for quantization
def apply_quantization_to_dense(layer):
    if isinstance(layer, Dense):
        return quantize_annotate_layer(layer)
    return layer

annotated_model = tf.keras.models.clone_model(model, clone_function=apply_quantization_to_dense)

# Apply quantization
quant_aware_model = quantize_apply(annotated_model)
quant_aware_model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))

# Define a learning rate scheduler
def lr_scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

# Convert the training data to a tf.data dataset for efficient loading
train_dataset = tf.data.Dataset.from_tensor_slices(((X1, X2), y)).shuffle(buffer_size=1024).batch(256).prefetch(tf.data.AUTOTUNE)

# Train the quantized model with early stopping and learning rate scheduling
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_callback = LearningRateScheduler(lr_scheduler)
quant_aware_model.fit(train_dataset, epochs=10, verbose=2, callbacks=[early_stopping, lr_callback])

# Caption generation function with metaphor meanings
def generate_caption(model, tokenizer, photo, max_length, metaphor_meanings):
    in_text = '<start>'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None:
            break
        if word in metaphor_meanings:
            word = metaphor_meanings[word]  # Replace metaphor with its meaning
        in_text += ' ' + word
        if word == '<end>':
            break
    return in_text.replace('<start>', '').replace('<end>', '').strip()

# Load new data and generate captions
with open(new_captions_file, 'r') as f:
    new_captions_data = json.load(f)

def generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length, batch_size=32):
    results = []
    image_features = {}
    for item in new_captions_data:
        post_id = item['post_id']
        img_fname = item['img_fname']
        if img_fname not in image_features:
            img_path = os.path.join(images_dir, img_fname)
            image_feature = model_new.predict(preprocess_image(img_path).reshape((1, 299, 299, 3))).flatten().reshape((1, 2048))
            image_features[img_fname] = image_feature
        data = train_img_captions_data.get(img_fname, {})
        metaphor_meanings = data.get('metaphors', {})  # Extract metaphor meanings from input JSON
        caption = generate_caption(model, tokenizer, image_features[img_fname], max_length, metaphor_meanings)
        results.append({
            "post_id": post_id,
            "meme_caption": caption
        })
    return results

# Generate captions for the new dataset
generated_captions = generate_captions_for_new_data(new_captions_data, quant_aware_model, tokenizer, max_length)

# Print or save the generated captions
for item in generated_captions:
    print(f"Post ID: {item['post_id']} - Meme Caption: {item['meme_caption']}")


2024-06-03 06:11:38.868553: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-03 06:11:41.312085: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.




In [9]:
import os
import json
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import random

# Paths
images_dir = 'C:/Users/prani/OneDrive/Desktop/meme_caption/memes'
train_captions_file = 'C:/Users/prani/OneDrive/Desktop/meme_caption/memes-trainval.json'
new_captions_file = 'C:/Users/prani/OneDrive/Desktop/meme_caption/mlr_captioning_TEST.json'

# Load training captions
with open(train_captions_file, 'r') as f:
    train_captions_data = json.load(f)

# Extract relevant data for training
train_img_captions_data = {}
for item in train_captions_data:
    img_fname = item['img_fname']
    img_captions = item['img_captions']
    metaphors = item.get('metaphors', [])
    metaphor_meanings = {meta['metaphor']: meta['meaning'] for meta in metaphors}
    train_img_captions_data[img_fname] = {
        'captions': ['<start> ' + cap + ' <end>' for cap in img_captions],
        'metaphors': metaphor_meanings
    }

# Subsample the dataset
dataset_size = 1000  # The size of the subset you want
sampled_keys = random.sample(list(train_img_captions_data.keys()), dataset_size)
sampled_train_img_captions_data = {key: train_img_captions_data[key] for key in sampled_keys}

# Prepare tokenizer on the sampled data
all_captions = [cap for item in sampled_train_img_captions_data.values() for cap in item['captions']]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

# Prepare image feature extractor
def preprocess_image(img_path):
    img = Image.open(img_path).convert('RGB').resize((299, 299))
    img = np.array(img) / 255.0
    return img

resnet = ResNet50(include_top=False, weights='imagenet', input_shape=(299, 299, 3))

# Unfreeze the top layers of the model
for layer in resnet.layers[-10:]:
    layer.trainable = True
model_new = Model(resnet.input, resnet.layers[-1].output)

# Extract features for all images in batches
def encode_images(images_dir, model, batch_size=32):
    encoded_images = {}
    image_list = os.listdir(images_dir)
    for start in range(0, len(image_list), batch_size):
        end = start + batch_size
        batch_images = image_list[start:end]
        img_batch = np.array([preprocess_image(os.path.join(images_dir, img)) for img in batch_images])
        feature_vectors = model.predict(img_batch)
        for img, feature_vector in zip(batch_images, feature_vectors):
            encoded_images[img] = feature_vector.flatten()
    return encoded_images

encoded_images = encode_images(images_dir, model_new)

# Data augmentation
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')

def augment_image(image, model_new):
    img_array = preprocess_image(image)
    img_array = img_array.reshape((1,) + img_array.shape)
    augmented_images = []
    for batch in datagen.flow(img_array, batch_size=1):
        feature_vector = model_new.predict(batch).flatten()
        augmented_images.append(feature_vector)
        if len(augmented_images) >= 5:
            break
    return augmented_images

# Apply augmentation to the sampled dataset
sampled_encoded_images = {}
for img_fname in sampled_train_img_captions_data.keys():
    img_path = os.path.join(images_dir, img_fname)
    augmented_features = augment_image(img_path, model_new)
    for i, feature in enumerate(augmented_features):
        sampled_encoded_images[f"{img_fname}_aug_{i}"] = feature

# Reduce the number of captions per image
reduced_train_img_captions_data = {}
for img_fname, data in sampled_train_img_captions_data.items():
    captions = data['captions']
    reduced_captions = captions[:1]
    reduced_train_img_captions_data[img_fname] = {
        'captions': reduced_captions,
        'metaphors': data['metaphors']
    }

# Prepare sequences for training
max_length = max(len(caption.split()) for caption in all_captions)

def create_sequences(tokenizer, max_length, img_captions_data, encoded_images):
    X1, X2, y = [], [], []
    for img_name, data in img_captions_data.items():
        for caption in data['captions']:
            seq = tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(encoded_images[img_name])
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

X1, X2, y = create_sequences(tokenizer, max_length, reduced_train_img_captions_data, sampled_encoded_images)

# Define the model
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax', dtype='float32')(decoder2)
    
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    return model

model = define_model(vocab_size, max_length)

# Define a learning rate scheduler
def lr_scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

# Convert the training data to a tf.data dataset for efficient loading
train_dataset = tf.data.Dataset.from_tensor_slices(((X1, X2), y)).shuffle(buffer_size=1024).batch(128).prefetch(tf.data.AUTOTUNE)

# Train the model with early stopping and learning rate scheduling
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_callback = LearningRateScheduler(lr_scheduler)
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))
model.fit(train_dataset, epochs=10, verbose=2, callbacks=[early_stopping, lr_callback])

# Caption generation function with metaphor meanings
def generate_caption(model, tokenizer, photo, max_length, metaphor_meanings):
    in_text = '<start>'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None:
            break
        if word in metaphor_meanings:
            word = metaphor_meanings[word]
        in_text += ' ' + word
        if word == '<end>':
            break
    return in_text.replace('<start>', '').replace('<end>', '').strip()

# Load new data and generate captions
with open(new_captions_file, 'r') as f:
    new_captions_data = json.load(f)

def generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length, batch_size=32):
    results = []
    image_features = {}
    for item in new_captions_data:
        post_id = item['post_id']
        img_fname = item['img_fname']
        if img_fname not in image_features:
            img_path = os.path.join(images_dir, img_fname)
            image_feature = model_new.predict(preprocess_image(img_path).reshape((1, 299, 299, 3))).flatten().reshape((1, 2048))
            image_features[img_fname] = image_feature
        data = reduced_train_img_captions_data.get(img_fname, {})
        metaphor_meanings = data.get('metaphors', {})
        caption = generate_caption(model, tokenizer, image_features[img_fname], max_length, metaphor_meanings)
        results.append({
            "post_id": post_id,
            "meme_caption": caption
        })
    return results

# Generate captions for the new dataset
generated_captions = generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length)

# Print or save the generated captions
for item in generated_captions:
    print(f"Post ID: {item['post_id']} - Meme Caption: {item['meme_caption']}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━

KeyError: 'memes_z614yp.png'

In [11]:
import os
import json
import csv
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import random

# Paths
images_dir = 'C:/Users/prani/OneDrive/Desktop/meme_caption/memes'
train_captions_file = 'C:/Users/prani/OneDrive/Desktop/meme_caption/memes-trainval.json'
new_captions_file = 'C:/Users/prani/OneDrive/Desktop/meme_caption/mlr_captioning_TEST.json'

# Load training captions
with open(train_captions_file, 'r') as f:
    train_captions_data = json.load(f)

# Extract relevant data for training
train_img_captions_data = {}
for item in train_captions_data:
    img_fname = item['img_fname']
    img_captions = item['img_captions']
    metaphors = item.get('metaphors', [])
    metaphor_meanings = {meta['metaphor']: meta['meaning'] for meta in metaphors}
    train_img_captions_data[img_fname] = {
        'captions': ['<start> ' + cap + ' <end>' for cap in img_captions],
        'metaphors': metaphor_meanings
    }

# Subsample the dataset
dataset_size = 1000  # The size of the subset you want
sampled_keys = random.sample(list(train_img_captions_data.keys()), dataset_size)
sampled_train_img_captions_data = {key: train_img_captions_data[key] for key in sampled_keys}

# Prepare tokenizer on the sampled data
all_captions = [cap for item in sampled_train_img_captions_data.values() for cap in item['captions']]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

# Prepare image feature extractor
def preprocess_image(img_path):
    img = Image.open(img_path).convert('RGB').resize((299, 299))
    img = np.array(img) / 255.0
    return img

resnet = ResNet50(include_top=False, weights='imagenet', input_shape=(299, 299, 3))

# Unfreeze the top layers of the model
for layer in resnet.layers[-10:]:
    layer.trainable = True
model_new = Model(resnet.input, resnet.layers[-1].output)

# Extract features for all images in batches
def encode_images(images_dir, model, batch_size=32):
    encoded_images = {}
    image_list = os.listdir(images_dir)
    for start in range(0, len(image_list), batch_size):
        end = start + batch_size
        batch_images = image_list[start:end]
        img_batch = np.array([preprocess_image(os.path.join(images_dir, img)) for img in batch_images])
        feature_vectors = model.predict(img_batch)
        for img, feature_vector in zip(batch_images, feature_vectors):
            encoded_images[img] = feature_vector.flatten()
    return encoded_images

encoded_images = encode_images(images_dir, model_new)

# Data augmentation
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest')

def augment_image(image, model_new):
    img_array = preprocess_image(image)
    img_array = img_array.reshape((1,) + img_array.shape)
    augmented_images = []
    for batch in datagen.flow(img_array, batch_size=1):
        feature_vector = model_new.predict(batch).flatten()
        augmented_images.append(feature_vector)
        if len(augmented_images) >= 5:
            break
    return augmented_images

# Apply augmentation to the sampled dataset
sampled_encoded_images = {}
for img_fname in sampled_train_img_captions_data.keys():
    img_path = os.path.join(images_dir, img_fname)
    augmented_features = augment_image(img_path, model_new)
    for i, feature in enumerate(augmented_features):
        sampled_encoded_images[f"{img_fname}_aug_{i}"] = feature

# Reduce the number of captions per image
reduced_train_img_captions_data = {}
for img_fname, data in sampled_train_img_captions_data.items():
    captions = data['captions']
    reduced_captions = captions[:1]
    reduced_train_img_captions_data[img_fname] = {
        'captions': reduced_captions,
        'metaphors': data['metaphors']
    }

# Prepare sequences for training
max_length = max(len(caption.split()) for caption in all_captions)

def create_sequences(tokenizer, max_length, img_captions_data, encoded_images):
    X1, X2, y = [], [], []
    for img_name, data in img_captions_data.items():
        for caption in data['captions']:
            seq = tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)
                X1.append(encoded_images[f"{img_name}_aug_0"])  # Use the first augmented image
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

X1, X2, y = create_sequences(tokenizer, max_length, reduced_train_img_captions_data, sampled_encoded_images)

# Define the model
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax', dtype='float32')(decoder2)
    
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    return model

model = define_model(vocab_size, max_length)

# Define a learning rate scheduler
def lr_scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

# Convert the training data to a tf.data dataset for efficient loading
train_dataset = tf.data.Dataset.from_tensor_slices(((X1, X2), y)).shuffle(buffer_size=1024).batch(128).prefetch(tf.data.AUTOTUNE)

# Train the model with early stopping and learning rate scheduling
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_callback = LearningRateScheduler(lr_scheduler)
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))
model.fit(train_dataset, epochs=2, verbose=2, callbacks=[early_stopping, lr_callback])

# Caption generation function with metaphor meanings
def generate_caption(model, tokenizer, photo, max_length, metaphor_meanings):
    in_text = '<start>'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None:
            break
        if word in metaphor_meanings:
            word = metaphor_meanings[word]
        in_text += ' ' + word
        if word == '<end>':
            break
    return in_text.replace('<start>', '').replace('<end>', '').strip()

# Load new data and generate captions
with open(new_captions_file, 'r') as f:
    new_captions_data = json.load(f)

def generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length, batch_size=32):
    results = []
    image_features = {}
    for item in new_captions_data:
        post_id = item['post_id']
        img_fname = item['img_fname']
        if img_fname not in image_features:
            img_path = os.path.join(images_dir, img_fname)
            image_feature = model_new.predict(preprocess_image(img_path).reshape((1, 299, 299, 3))).flatten().reshape((1, 2048))
            image_features[img_fname] = image_feature
        data = reduced_train_img_captions_data.get(img_fname, {})
        metaphor_meanings = data.get('metaphors', {})
        caption = generate_caption(model, tokenizer, image_features[img_fname], max_length, metaphor_meanings)
        results.append({
            "post_id": post_id,
            "meme_caption": caption
        })
    return results

# Generate captions for the new dataset
generated_captions = generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length)

# Save generated captions to a CSV file
output_csv_file = 'generated_captions.csv'
with open(output_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['post_id', 'meme_caption']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for item in generated_captions:
        writer.writerow({'post_id': item['post_id'], 'meme_caption': item['meme_caption']})

print(f"Generated captions saved to {output_csv_file}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━

ValueError: Input 0 of layer "functional_11" is incompatible with the layer: expected shape=(None, 2048), found shape=(None, 204800)

In [None]:
import os
import json
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from tensorflow_model_optimization.quantization.keras import quantize_annotate_layer, quantize_apply

# Paths
images_dir = '/home/raichuboy/Projects/meme_caption/memes'
train_captions_file = '/home/raichuboy/Projects/meme_caption/memes-trainval.json'
new_captions_file = '/home/raichuboy/Projects/meme_caption/mlr_captioning_TEST.json'

# Load training captions
with open(train_captions_file, 'r') as f:
    train_captions_data = json.load(f)

# Extract relevant data for training
train_img_captions_data = {}
for item in train_captions_data:
    img_fname = item['img_fname']
    img_captions = item['img_captions']
    metaphors = item.get('metaphors', [])  # Check if 'metaphors' key exists
    metaphor_meanings = {meta['metaphor']: meta['meaning'] for meta in metaphors}
    train_img_captions_data[img_fname] = {
        'captions': ['<start> ' + cap + ' <end>' for cap in img_captions],
        'metaphors': metaphor_meanings
    }

# Prepare tokenizer on the training data
all_captions = [cap for item in train_img_captions_data.values() for cap in item['captions']]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

# Prepare image feature extractor
def preprocess_image(img_path):
    img = Image.open(img_path).convert('RGB').resize((299, 299))
    img = np.array(img) / 255.0
    return img

resnet = ResNet50(include_top=False, weights='imagenet', input_shape=(299, 299, 3))

# Unfreeze the top layers of the model
for layer in resnet.layers[-10:]:
    layer.trainable = True
model_new = Model(resnet.input, resnet.layers[-1].output)

# Extract features for all images in batches
def encode_images(images_dir, model, batch_size=32):
    encoded_images = {}
    image_list = os.listdir(images_dir)
    for start in range(0, len(image_list), batch_size):
        end = start + batch_size
        batch_images = image_list[start:end]
        img_batch = np.array([preprocess_image(os.path.join(images_dir, img)) for img in batch_images])
        feature_vectors = model.predict(img_batch)
        for img, feature_vector in zip(batch_images, feature_vectors):
            encoded_images[img] = feature_vector.flatten()
    return encoded_images

encoded_images = encode_images(images_dir, model_new)

# Prepare sequences for training
max_length = max(len(caption.split()) for caption in all_captions)

def create_sequences(tokenizer, max_length, img_captions_data, encoded_images):
    X1, X2, y = [], [], []
    for img_name, data in img_captions_data.items():
        for caption in data['captions']:
            seq = tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                X1.append(encoded_images[img_name])
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

X1, X2, y = create_sequences(tokenizer, max_length, train_img_captions_data, encoded_images)

# Define the model
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)

    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)

    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax', dtype='float32')(decoder2)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    return model

model = define_model(vocab_size, max_length)

# Annotate layers for quantization
def apply_quantization_to_dense(layer):
    if isinstance(layer, Dense):
        return quantize_annotate_layer(layer)
    return layer

annotated_model = tf.keras.models.clone_model(model, clone_function=apply_quantization_to_dense)

# Apply quantization
quant_aware_model = quantize_apply(annotated_model)
quant_aware_model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))

# Define a learning rate scheduler
def lr_scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

# Convert the training data to a tf.data dataset for efficient loading
train_dataset = tf.data.Dataset.from_tensor_slices(((X1, X2), y)).shuffle(buffer_size=1024).batch(256).prefetch(tf.data.AUTOTUNE)

# Train the quantized model with early stopping and learning rate scheduling
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_callback = LearningRateScheduler(lr_scheduler)
quant_aware_model.fit(train_dataset, epochs=10, verbose=2, callbacks=[early_stopping, lr_callback])

# Caption generation function with metaphor meanings
def generate_caption(model, tokenizer, photo, max_length, metaphor_meanings):
    in_text = '<start>'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None:
            break
        if word in metaphor_meanings:
            word = metaphor_meanings[word]  # Replace metaphor with its meaning
        in_text += ' ' + word
        if word == '<end>':
            break
    return in_text.replace('<start>', '').replace('<end>', '').strip()

# Load new data and generate captions
with open(new_captions_file, 'r') as f:
    new_captions_data = json.load(f)

def generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length, batch_size=32):
    results = []
    image_features = {}
    for item in new_captions_data:
        post_id = item['post_id']
        img_fname = item['img_fname']
        if img_fname not in image_features:
            img_path = os.path.join(images_dir, img_fname)
            image_feature = model_new.predict(preprocess_image(img_path).reshape((1, 299, 299, 3))).flatten().reshape((1, 2048))
            image_features[img_fname] = image_feature
        data = train_img_captions_data.get(img_fname, {})
        metaphor_meanings = data.get('metaphors', {})  # Extract metaphor meanings from input JSON
        caption = generate_caption(model, tokenizer, image_features[img_fname], max_length, metaphor_meanings)
        results.append({
            "post_id": post_id,
            "meme_caption": caption
        })
    return results

# Generate captions for the new dataset
generated_captions = generate_captions_for_new_data(new_captions_data, quant_aware_model, tokenizer, max_length)

# Print or save the generated captions
for item in generated_captions:
    print(f"Post ID: {item['post_id']} - Meme Caption: {item['meme_caption']}")

2024-06-04 16:42:12.032781: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-04 16:42:15.260192: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.




In [None]:
import os
import json
import csv
import numpy as np
from PIL import Image
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, add
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import random

# Check for GPU availability
if tf.config.list_physical_devices('GPU'):
    print("GPU is available")
else:
    print("GPU is not available")

# Paths
images_dir = '/home/raichuboy/Projects/meme_caption/memes'  # Directory containing images
train_captions_file = '/home/raichuboy/Projects/meme_caption/memes-trainval.json'  # JSON file with training meme data
new_captions_file = 'home/raichuboy/Projects/meme_caption/mlr_captioning_TEST.json'  # JSON file with new image data

# Load training captions
with open(train_captions_file, 'r') as f:
    train_captions_data = json.load(f)

# Extract relevant data for training
train_img_captions_data = {}
for item in train_captions_data:
    img_fname = item['img_fname']
    img_captions = item['img_captions']
    metaphors = item.get('metaphors', [])
    metaphor_meanings = {meta['metaphor']: meta['meaning'] for meta in metaphors}
    train_img_captions_data[img_fname] = {
        'captions': ['<start> ' + cap + ' <end>' for cap in img_captions],
        'metaphors': metaphor_meanings
    }

# Subsample the dataset
dataset_size = 500  # Reduce the dataset size further
sampled_keys = random.sample(list(train_img_captions_data.keys()), dataset_size)
sampled_train_img_captions_data = {key: train_img_captions_data[key] for key in sampled_keys}

# Prepare tokenizer on the sampled data
all_captions = [cap for item in sampled_train_img_captions_data.values() for cap in item['captions']]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = len(tokenizer.word_index) + 1

# Prepare image feature extractor
def preprocess_image(img_path):
    img = Image.open(img_path).convert('RGB').resize((299, 299))
    img = np.array(img) / 255.0
    return img

resnet = ResNet50(include_top=False, weights='imagenet', input_shape=(299, 299, 3))

# Unfreeze the top layers of the model
for layer in resnet.layers[-10:]:
    layer.trainable = True
model_new = Model(resnet.input, resnet.layers[-1].output)

# Use a data generator to load images in batches and avoid memory overload
def encode_images(images_dir, model, batch_size=16):
    encoded_images = {}
    image_list = os.listdir(images_dir)
    for start in range(0, len(image_list), batch_size):
        end = start + batch_size
        batch_images = image_list[start:end]
        img_batch = np.array([preprocess_image(os.path.join(images_dir, img)) for img in batch_images])
        feature_vectors = model.predict(img_batch)
        for img, feature_vector in zip(batch_images, feature_vectors):
            encoded_images[img] = feature_vector.flatten()
    return encoded_images

encoded_images = encode_images(images_dir, model_new)

# Data augmentation
datagen = ImageDataGenerator(
    rotation_range=10,  # Reduced rotation range
    width_shift_range=0.1,  # Reduced width shift range
    height_shift_range=0.1,  # Reduced height shift range
    shear_range=0.1,  # Reduced shear range
    zoom_range=0.1,  # Reduced zoom range
    horizontal_flip=True,
    fill_mode='nearest')

def augment_image(image, model_new):
    img_array = preprocess_image(image)
    img_array = img_array.reshape((1,) + img_array.shape)
    augmented_images = []
    for batch in datagen.flow(img_array, batch_size=1):
        feature_vector = model_new.predict(batch).flatten()
        augmented_images.append(feature_vector)
        if len(augmented_images) >= 3:  # Reduced number of augmentations
            break
    return augmented_images

# Apply augmentation to the sampled dataset
sampled_encoded_images = {}
for img_fname in sampled_train_img_captions_data.keys():
    img_path = os.path.join(images_dir, img_fname)
    augmented_features = augment_image(img_path, model_new)
    for i, feature in enumerate(augmented_features):
        sampled_encoded_images[f"{img_fname}_aug_{i}"] = feature

# Reduce the number of captions per image
reduced_train_img_captions_data = {}
for img_fname, data in sampled_train_img_captions_data.items():
    captions = data['captions']
    reduced_captions = captions[:1]  # Only take the first caption
    reduced_train_img_captions_data[img_fname] = {
        'captions': reduced_captions,
        'metaphors': data['metaphors']
    }

# Prepare sequences for training
max_length = max(len(caption.split()) for caption in all_captions)

def create_sequences(tokenizer, max_length, img_captions_data, encoded_images):
    X1, X2, y = [], [], []
    for img_name, data in img_captions_data.items():
        for caption in data['captions']:
            seq = tokenizer.texts_to_sequences([caption])[0]
            for i in range(1, len(seq)):
                in_seq, out_seq = seq[:i], seq[i]
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                out_seq = to_categorical([out_seq], num_classes=vocab_size)
                X1.append(encoded_images[f"{img_name}_aug_0"])  # Use the first augmented image
                X2.append(in_seq)
                y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

X1, X2, y = create_sequences(tokenizer, max_length, reduced_train_img_captions_data, sampled_encoded_images)

# Ensure that X1 (image features) has shape (None, 2048)
X1 = np.array([feature[:2048] for feature in X1])  # Ensuring correct shape

# Define the model
def define_model(vocab_size, max_length):
    inputs1 = Input(shape=(2048,))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax', dtype='float32')(decoder2)
    
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    return model

model = define_model(vocab_size, max_length)

# Define a learning rate scheduler
def lr_scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

# Convert the training data to a tf.data dataset for efficient loading
train_dataset = tf.data.Dataset.from_tensor_slices(((X1, X2), y)).shuffle(buffer_size=1024).batch(16).prefetch(tf.data.AUTOTUNE)

# Train the model with early stopping and learning rate scheduling
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
lr_callback = LearningRateScheduler(lr_scheduler)
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))

# Ensure the model is trained on the GPU
with tf.device('/GPU:0'):
    model.fit(train_dataset, epochs=2, verbose=2, callbacks=[early_stopping, lr_callback])

# Caption generation function with metaphor meanings
def generate_caption(model, tokenizer, photo, max_length, metaphor_meanings):
    in_text = '<start>'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo, sequence], verbose=0)
        yhat = np.argmax(yhat)
        word = tokenizer.index_word.get(yhat)
        if word is None:
            break
        if word in metaphor_meanings:
            word = metaphor_meanings[word]
        in_text += ' ' + word
        if word == '<end>':
            break
    return in_text.replace('<start>', '').replace('<end>', '').strip()

# Load new data and generate captions
with open(new_captions_file, 'r') as f:
    new_captions_data = json.load(f)

def generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length, batch_size=32):
    results = []
    image_features = {}
    for item in new_captions_data:
        post_id = item['post_id']
        img_fname = item['img_fname']
        if img_fname not in image_features:
            img_path = os.path.join(images_dir, img_fname)
            image_feature = model_new.predict(preprocess_image(img_path).reshape((1, 299, 299, 3))).flatten().reshape((1, 2048))
            image_features[img_fname] = image_feature
        data = reduced_train_img_captions_data.get(img_fname, {})
        metaphor_meanings = data.get('metaphors', {})
        caption = generate_caption(model, tokenizer, image_features[img_fname], max_length, metaphor_meanings)
        results.append({
            "post_id": post_id,
            "meme_caption": caption
        })
    return results

# Generate captions for the new dataset
generated_captions = generate_captions_for_new_data(new_captions_data, model, tokenizer, max_length)

# Save generated captions to a CSV file
output_csv_file = 'generated_captions.csv'
with open(output_csv_file, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['post_id', 'meme_caption']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for item in generated_captions:
        writer.writerow({'post_id': item['post_id'], 'meme_caption': item['meme_caption']})

print(f"Generated captions saved to {output_csv_file}")


GPU is not available


2024-06-04 18:03:29.144823: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.






