In [8]:
import os 
import pickle
import numpy as np
import tensorflow as tf
from tqdm.notebook import tqdm 
from tensorflow.keras.applications.vgg16 import VGG16 , preprocess_input 
from tensorflow.keras.preprocessing.image import load_img , img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Input , Dense , LSTM , Embedding , Dropout , add
# Build the image feature extractor: VGG16 cut off at its second-to-last layer.
model = VGG16()
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)

# Read the query photo at 224x224 and turn it into a numpy array.
image = img_to_array(load_img("pexels-photo-1682649.jpeg", target_size=(224, 224)))
# Prepend a batch axis of size 1, then apply the VGG16 input preprocessing.
image = preprocess_input(image.reshape((1,) + image.shape))

# Feature vector of the query photo; passed to predict_caption further down.
fe = model.predict(image, verbose=0)

# Load the precomputed per-image features.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open a features.pkl that was produced by a trusted run of this project.
with open('features.pkl', 'rb') as f:
    features = pickle.load(f)

# Load the captions file. The path is assembled from its components: the old
# literal 'input\captions.txt' depended on '\c' being an unrecognized escape
# sequence (which newer Python versions warn about) and only pointed at
# input/captions.txt on Windows.
with open(os.path.join('input', 'captions.txt'), 'r') as f:
    next(f)  # skip the first line (presumably a header row — confirm)
    captions_doc = f.read()

# Build a dict: image ID (file name up to its first '.') -> list of captions.
mapping = {}
for line in tqdm(captions_doc.split('\n')):
    # ignore blank / one-character lines
    if len(line) < 2:
        continue
    # comma-separated record: the first field is the image file name,
    # everything after it belongs to the caption
    file_name, *caption_parts = line.split(',')
    image_id = file_name.split('.')[0]
    # pieces that were split apart on commas are glued back with spaces
    caption = " ".join(caption_parts)
    # start a new list on first sight of this image, then record the caption
    mapping.setdefault(image_id, []).append(caption)
def clean(mapping, strip_non_alpha=False):
    """Normalize every caption in ``mapping`` in place.

    Each caption is lower-cased, split on whitespace, stripped of
    one-character tokens and wrapped as ``'startseq <words> endseq'``.

    Args:
        mapping: dict of image ID -> list of caption strings. The lists are
            rewritten in place.
        strip_non_alpha: when True, every character other than ``a-z`` and
            whitespace is deleted (with a real regular expression) before
            the tokens are filtered. Defaults to False.

    Returns:
        None.
    """
    # Local import: the notebook's import cell does not bring in ``re``.
    import re

    # Why the regex cleaning is opt-in: the previous body tried to do it with
    # ``str.replace``, which matches literal text rather than patterns. The
    # '[^A-Za-z]' literal can never occur in a lower-cased string and the
    # whitespace pattern only matched a literal backslash-s-plus, so digits and
    # punctuation were never removed. The tokenizer built from these captions
    # feeds a model restored from disk, so switching the cleaning on silently
    # would change the vocabulary that model was presumably trained with —
    # enable ``strip_non_alpha`` only together with retraining.
    for captions in mapping.values():
        for i, caption in enumerate(captions):
            caption = caption.lower()
            if strip_non_alpha:
                # keep whitespace so that words are not fused together
                caption = re.sub(r'[^a-z\s]', '', caption)
            # split() already collapses runs of whitespace
            words = [word for word in caption.split() if len(word) > 1]
            captions[i] = 'startseq ' + " ".join(words) + ' endseq'

# normalize all captions in place
clean(mapping)

# flatten the per-image caption lists into a single corpus
all_captions = [caption for captions in mapping.values() for caption in captions]

# fit the tokenizer on the corpus; the vocabulary size is the number of
# distinct words plus one
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_captions)
vocab_size = 1 + len(tokenizer.word_index)

# longest caption, counted in whitespace-separated words
max_length = max(len(words) for words in map(str.split, all_captions))

# first 90% of the image IDs (in dict order) for training, the rest for testing
image_ids = list(mapping)
split = int(0.90 * len(image_ids))
train, test = image_ids[:split], image_ids[split:]

# create data generator to get data in batch (avoids session crash)
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield batches of (image feature, caption prefix) -> next word.

    For every caption of every image, each proper prefix of the encoded
    caption becomes one sample: the prefix, padded to ``max_length``, is the
    text input and the token that follows it, one-hot encoded over
    ``vocab_size`` classes, is the target. Samples are collected over
    ``batch_size`` *images* (not samples) before a batch is yielded. The
    image counter and buffers carry over between passes, so a trailing
    partial group of images is completed by the start of the next pass.

    Args:
        data_keys: image IDs to cycle through. Snapshotted into a list, so a
            one-shot iterable can be cycled as well.
        mapping: dict of image ID -> list of caption strings.
        features: mapping of image ID -> feature array; row 0 is used
            (presumably shape (1, 4096) — confirm against the extraction step).
        tokenizer: object providing ``texts_to_sequences``.
        max_length: length the input prefixes are padded to.
        vocab_size: number of classes of the one-hot target.
        batch_size: number of images that make up one yielded batch.

    Yields:
        ``([X1, X2], y)`` as numpy arrays: image features, padded prefixes and
        one-hot next tokens.

    Raises:
        ValueError: raised on the first ``next()`` when ``data_keys`` is empty
            or ``batch_size`` is below 1. Without this check the endless loop
            would spin without ever yielding.
    """
    data_keys = list(data_keys)
    if not data_keys:
        raise ValueError("data_keys must contain at least one image ID")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    X1, X2, y = [], [], []
    n = 0  # images collected since the last yield
    while True:
        for key in data_keys:
            n += 1
            for caption in mapping[key]:
                # encode the caption as a list of token indices
                seq = tokenizer.texts_to_sequences([caption])[0]
                # one sample per split point: tokens before i -> token i
                for i in range(1, len(seq)):
                    in_seq, out_seq = seq[:i], seq[i]
                    in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                    out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            if n == batch_size:
                yield [np.array(X1), np.array(X2)], np.array(y)
                # start collecting the next batch
                X1, X2, y = [], [], []
                n = 0

# Captioning network, built with the Keras functional API: an image branch and
# a text branch are summed and decoded into a distribution over the vocabulary.

# encoder model
# image feature layers: a 4096-length feature vector per image (presumably the
# VGG16 features extracted earlier — confirm), dropout, then a 256-unit projection
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.4)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# sequence feature layers: a padded token sequence of length max_length,
# embedded into 256 dimensions with index 0 masked, dropout, then an LSTM(256)
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.4)(se1)
se3 = LSTM(256)(se2)

# decoder model: element-wise sum of the two 256-unit branches, a dense layer,
# then a softmax over vocab_size classes (the next-word prediction)
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam')

# training hyper-parameters; not used anywhere in the visible code
epochs = 20
batch_size = 32

# NOTE(review): this rebinds `model`, so the network built and compiled above
# is discarded in favour of the one stored in best_model.h5. The definition
# above presumably documents the saved model's architecture — confirm.
model = tf.keras.models.load_model('best_model.h5')

def idx_to_word(integer, tokenizer):
    """Return the word whose index in ``tokenizer.word_index`` equals ``integer``.

    The word -> index dict is scanned in order and the first match is
    returned; ``None`` is returned when no word carries that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)

# generate caption for an image
def predict_caption(model, image, tokenizer, max_length):
    """Greedily decode a caption for ``image``.

    Starting from 'startseq', the text produced so far is encoded with
    ``tokenizer``, padded to ``max_length`` and passed to ``model`` together
    with ``image``; the word with the highest score is appended. Decoding
    stops after ``max_length`` words, when the predicted index maps to no
    word, or right after 'endseq' has been appended.

    Returns:
        The generated text, including the leading 'startseq' and, when it
        was reached, the trailing 'endseq'.
    """
    words = ['startseq']
    for _ in range(max_length):
        # encode and pad everything generated so far
        encoded = tokenizer.texts_to_sequences([" ".join(words)])[0]
        padded = pad_sequences([encoded], max_length)
        # pick the most probable next word
        scores = model.predict([image, padded], verbose=0)
        next_word = idx_to_word(np.argmax(scores), tokenizer)
        if next_word is None:
            break
        words.append(next_word)
        if next_word == 'endseq':
            break
    return " ".join(words)

from PIL import Image
import matplotlib.pyplot as plt

# Caption the query photo from its feature vector `fe`. The returned string
# still carries the 'startseq' / 'endseq' markers, as the printed output shows.
y_pred = predict_caption(model, fe, tokenizer, max_length)
print(y_pred)






  0%|          | 0/40456 [00:00<?, ?it/s]

startseq man in blue shirt and jeans is running on grassy field endseq


In [10]:
import os

import openai

# Read the API key from the environment instead of embedding it in the notebook.
# NOTE(review): the key that used to be hard-coded here is exposed in the saved
# notebook and should be revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Set up the model and prompt
# NOTE(review): `openai.Completion` with `engine=` looks like the pre-1.0
# interface of the openai package — confirm the installed version and that
# this engine is still served.
model_engine = "text-davinci-003"

# Keep only the caption words: drop the decoder's 'startseq' / 'endseq' markers
# and separate the caption from the surrounding instruction with spaces
# (the old concatenation fused them, e.g. "aboutstartseq ... endseqin").
caption_text = " ".join(
    word for word in y_pred.split() if word not in ("startseq", "endseq")
)
prompt = "generate 20 trending captions about " + caption_text + " in first person"

# Generate a response
completion = openai.Completion.create(
    engine=model_engine,
    prompt=prompt,
    max_tokens=1024,
    n=1,
    stop=None,
    temperature=0.5,
)

response = completion.choices[0].text
print(response)



1. I'm running through the grassy field in my blue shirt and jeans – feeling free and alive! 
2. I'm sprinting through the lush green grass, the wind in my hair and the sun on my face. 
3. Nothing can stop me as I run through the grassy field in my blue shirt and jeans. 
4. I'm running with the wind, feeling the energy of the grassy field beneath my feet. 
5. I'm running through the grassy field, my blue shirt and jeans flowing with the breeze. 
6. I'm feeling alive and free as I run through the grassy field in my blue shirt and jeans. 
7. I'm running through the grassy field, my blue shirt and jeans waving in the wind. 
8. I'm running through the grassy field, feeling the wind on my face and the sun on my skin. 
9. I'm running through the grassy field, feeling the joy of the moment in my blue shirt and jeans. 
10. I'm running through the grassy field, feeling the freedom and the beauty of the moment. 
11. I'm running through the grassy field, feeling the energy of the sun and the wi