In [None]:
import pandas as pd
import numpy as np
import re
from keras.preprocessing import image
import matplotlib.pyplot as plt
from keras.applications.resnet50 import ResNet50,preprocess_input
from keras.models import Model
from keras.layers import *
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [None]:
import os

## Building the vocabulary dictionary and preprocessing captions

In [None]:
# Load the Flickr30k captions table; the file is pipe-delimited rather than comma-delimited.
df=pd.read_csv('../input/flickr-image-dataset/flickr30k_images/results.csv',delimiter='|')

# Preview the first rows of the table.
df.head()

# Inspect a single row.
df.iloc[0]

# Image file name of every caption row, as a NumPy array.
# NOTE(review): this name shadows the built-in id(); it is kept because later cells read it.
id=df['image_name'].values

id.shape

# Caption text of every row. The column label starts with a space (' comment').
comment=df[' comment'].values

comment.shape

# Spot-check one raw caption.
comment[5]

def sentence_cleaning(sentence):
    """Normalise a raw caption into lower-case, space-separated words.

    Steps: lower-case the text, replace every run of characters outside
    ``a-z`` with one space, drop tokens of length 1, and join what is left
    with single spaces.

    Parameters
    ----------
    sentence : str or any
        Raw caption. Anything that is not a ``str`` (for example the NaN that
        pandas yields for a missing cell) is replaced by the fallback caption
        ``'A dog runs across the grass .'`` before cleaning.

    Returns
    -------
    str
        The cleaned caption; may be the empty string.
    """
    # The original wrapped the whole body in a bare ``except`` that retried
    # with the fallback caption. That hid unrelated errors and recursed without
    # end whenever the failure was not caused by the input (e.g. a NameError),
    # so the non-string case is now tested for explicitly.
    if not isinstance(sentence,str):
        sentence='A dog runs across the grass .'
    sentence=sentence.lower()
    sentence=re.sub('[^a-z]+',' ',sentence)
    words=[s for s in sentence.split() if len(s)>1]
    return ' '.join(words)

# Group the cleaned captions under the file name of the image they describe.
vocab_dic={}
for image_name,raw_caption in zip(id,comment):
    vocab_dic.setdefault(image_name,[]).append(sentence_cleaning(raw_caption))

len(vocab_dic)

print(vocab_dic['1000092795.jpg'])

# Count how often each word occurs over all cleaned captions.
word_dic={}
for captions in vocab_dic.values():
    for caption in captions:
        for word in caption.split():
            word_dic[word]=word_dic.get(word,0)+1

print(len(word_dic))

# Keep only the words that occur more than 10 times.
final_words=[word for word,count in word_dic.items() if count>10]

print(len(final_words))

# Wrap every caption in the 'startseq' / 'endseq' boundary tokens.
# The dictionary is updated in place, so captions that already carry both
# tokens are skipped; re-running this cell would otherwise wrap them twice.
for i in vocab_dic:
    for j,caption in enumerate(vocab_dic[i]):
        if caption.startswith('startseq ') and caption.endswith(' endseq'):
            continue
        vocab_dic[i][j]='startseq '+caption+' endseq'

# Index 0 is reserved for padding, so real words are numbered from 1.
word_to_idx={}
idx_to_word={}
for s,word in enumerate(final_words,start=1):
    word_to_idx[word]=s
    idx_to_word[s]=word

print(len(word_to_idx))

### Two special words
# They take the next two free indices. The original hard-coded 5119 and 5120,
# which is only right when final_words holds exactly 5118 words; with any other
# count those indices collide with real words or fall outside vocab_size.
for token in ('startseq','endseq'):
    if token not in word_to_idx:
        idx=len(word_to_idx)+1
        word_to_idx[token]=idx
        idx_to_word[idx]=token

len(word_to_idx)

vocab_size=len(word_to_idx)+1 # one extra slot for the padding index 0

# Sequence length used when padding caption prefixes.
max_len=20

## Image data preprocessing

In [None]:
# ResNet50 with ImageNet weights, built for 224x224 three-channel input.
model=ResNet50(weights='imagenet',input_shape=(224,224,3))

In [None]:
model.summary()

In [None]:
# Feature extractor: same input as ResNet50, output taken from its second-to-last layer.
new_model=Model(inputs=model.input,outputs=model.layers[-2].output)

In [None]:
def preprocess_img(path,target_size=(224,224)):
    """Load an image file and turn it into a one-image batch for ResNet50.

    Parameters
    ----------
    path : str
        Location of the image file.
    target_size : tuple of int, optional
        ``(height, width)`` the image is resized to while loading. Defaults to
        ``(224, 224)``.

    Returns
    -------
    numpy.ndarray
        The image with a leading batch axis of size 1, after
        ``preprocess_input`` has been applied.
    """
    # load_img documents target_size as (height, width); the original passed a
    # three-element tuple with the channel count tacked on.
    img=image.load_img(path,target_size=target_size)
    img=image.img_to_array(img)
    # Add the batch axis without hard-coding the image dimensions.
    img=np.expand_dims(img,axis=0)
    img=preprocess_input(img)  # normalise the pixels the way ResNet50 expects
    return img

In [None]:
img=preprocess_img('../input/flickr-image-dataset/flickr30k_images/flickr30k_images/10002456.jpg')

In [None]:
# NOTE(review): img has already been through preprocess_input, so its values are
# presumably no longer in a range imshow can display faithfully — confirm.
plt.imshow(img[0])

In [None]:
def encode_img(path):
    """Return the flattened feature vector that new_model produces for the image at ``path``."""
    batch=preprocess_img(path)
    features=new_model.predict(batch)
    # Collapse the prediction to a 1-D vector.
    return features.reshape((-1,))
    

In [None]:
# train_description['1244140539_da4804d828']

In [None]:
# Encode every captioned image, keyed by file name; the running count is
# printed after every 100 images.
image_dir='../input/flickr-image-dataset/flickr30k_images/flickr30k_images/'
encoded_img_dic={}
for s,i in enumerate(vocab_dic,start=1):
    path=image_dir+i
    encoded_img_dic[i]=encode_img(path)
    if not s%100:
        print(s)

In [None]:
# encoded_img_dic is a dict, so np.save stores it as a pickled object array;
# reading it back needs np.load(..., allow_pickle=True) followed by .item().
np.save('encoded_img_dic.npy',encoded_img_dic)

In [None]:
!ls -l --b=M  ./encoded_img_dic.npy | cut -d " " -f5

## Custom Data Generator

In [None]:
def data_generator(train_description,vocab_size,word_to_idx,encoded_img_dic,max_len,batch_size):
    """Yield training batches for the captioning model, forever.

    Every caption is expanded into (prefix, next word) pairs: for a caption
    encoded as ``w0 w1 ... wk`` the pairs are ``(w0 -> w1)``, ``(w0 w1 -> w2)``
    and so on. Each pair becomes one sample together with the image encoding.

    Parameters
    ----------
    train_description : dict
        Maps an image key to its list of caption strings.
    vocab_size : int
        Number of classes of the one-hot target.
    word_to_idx : dict
        Maps a word to its integer index; words missing from it are dropped.
    encoded_img_dic : dict
        Maps an image key to its feature vector.
    max_len : int
        Length the prefixes are padded to (zeros appended at the end).
    batch_size : int
        Number of images (not samples) that make up one yielded batch.

    Yields
    ------
    tuple
        ``([image_features, padded_prefixes], one_hot_next_words)``; the three
        arrays have one row per sample.
    """
    X1,X2,y=[],[],[]
    n=0  # images collected for the current batch
    while True:
        for key,desc_list in train_description.items():
            n+=1
            encoding_of_photo=encoded_img_dic[key]
            for desc in desc_list:
                seq=[word_to_idx[w] for w in desc.split() if w in word_to_idx]
                for i in range(1,len(seq)):
                    # pad_sequences / to_categorical work on batches, hence the
                    # one-element list and the [0].
                    # NOTE(review): with pad_sequences' default truncating='pre',
                    # a prefix longer than max_len keeps only its last max_len
                    # tokens — confirm this is intended.
                    xi=pad_sequences([seq[:i]],maxlen=max_len,value=0,padding='post')[0]
                    yi=to_categorical([seq[i]],num_classes=vocab_size)[0]

                    X1.append(encoding_of_photo)
                    X2.append(xi)
                    y.append(yi)
            # Emit once all captions of the batch_size-th image are in. The
            # original ran this check inside the caption loop, so it fired after
            # the first caption of that image and pushed the image's remaining
            # captions into the following batch.
            if n==batch_size:
                yield [np.array(X1),np.array(X2)],np.array(y)
                X1,X2,y=[],[],[]
                n=0

In [None]:
# with open('glove.6B.50d.txt',encoding='utf8') as f:
#     glove_data=f.read()

# len(glove_data)

# type(glove_data)

# glove_data=glove_data.split('\n')

# type(glove_data)

# len(glove_data)

# glove_data=glove_data[:-1]

# len(glove_data)

# glove_data[0].split()[0]

# embedding_index={}
# for line in glove_data:
#     line=line.split()
#     word=line[0]
#     embeding=np.array(line[1:],dtype='float')
#     embedding_index[word]=embeding
    

# len(embedding_index)

# embedding_index['the'].shape

# def get_embedding_matrix():
#     dim=50
#     matrix=np.zeros((vocab_size,dim))
#     for word,number in word_to_idx.items():
#         embedding_vector=embedding_index.get(word)
#         if embedding_vector is not None:
#             matrix[word_to_idx[word]]=embedding_vector
#     return matrix
    

# embedding_matrix=get_embedding_matrix()

# len(embedding_matrix)

# embedding_matrix[word_to_idx['the']]



## Loading the embedding matrix with NumPy

In [None]:
# Load a precomputed embedding matrix from disk.
# NOTE(review): presumably produced by the commented-out GloVe code above
# (one 50-d row per vocabulary index) — confirm it matches the current word_to_idx.
embedding_matrix=np.load('../input/embedding-matrix/embedding_matrix.npy')

In [None]:
embedding_matrix[3]

## Model Architecture

In [None]:
encoded_img_dic['10002456.jpg'].shape

In [None]:
## For images
# Image branch: dropout on the 2048-d image feature vector, then a 256-unit ReLU projection.
input_img_features=Input(shape=(2048,))
inp_img1=Dropout(0.3)(input_img_features)
inp_img2=Dense(256,activation='relu')(inp_img1)


In [None]:
vocab_size

In [None]:
# Caption branch: token ids -> 50-d embeddings -> dropout -> 256-unit LSTM.
# mask_zero=True makes index 0 (the padding value) a masked timestep.
input_captions=Input(shape=(max_len,))
inp_cap1=Embedding(input_dim=vocab_size,output_dim=50,mask_zero=True)(input_captions)
inp_cap2=Dropout(0.3)(inp_cap1)
inp_cap3=LSTM(256)(inp_cap2)

In [None]:
# Decoder: sum the two 256-unit branch outputs element-wise, then produce a
# softmax over the vocab_size word indices.
decoder1=add([inp_img2,inp_cap3])
decoder2=Dense(256,activation='relu')(decoder1)
outputs=Dense(vocab_size,activation='softmax')(decoder2)

In [None]:
# Full captioning model: (image features, caption prefix) -> softmax over word indices.
actual_model=Model(inputs=[input_img_features,input_captions],outputs=outputs)


In [None]:
actual_model.summary()

In [None]:
actual_model.layers[2].output

In [None]:
# Load the pretrained vectors into the Embedding layer and freeze it.
# The layer is looked up by type; the original used actual_model.layers[2],
# which depends on the order in which Keras happens to list the layers.
embedding_layer=next(layer for layer in actual_model.layers if isinstance(layer,Embedding))
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable=False

In [None]:
# Targets are one-hot vectors, hence categorical cross-entropy. No metrics are tracked.
# NOTE(review): this runs after the embedding layer is frozen; presumably the
# trainable flag must be set before compile to take effect — confirm for the Keras version in use.
actual_model.compile(loss='categorical_crossentropy',optimizer='adam')

## Training the Model

In [None]:
epochs=10
batch_size=3  # images per batch (data_generator counts images, not samples)
steps=len(vocab_dic)//batch_size  # number of full batches in one pass over the images

In [None]:
def train():
    """Train actual_model for ``epochs`` epochs, saving a checkpoint after each.

    Reads the module-level ``epochs``, ``batch_size``, ``steps``, ``vocab_dic``,
    ``vocab_size``, ``word_to_idx``, ``encoded_img_dic`` and ``max_len``. The
    model after epoch ``i`` is written to ``./Models/model<i>.h5``.
    """
    # Create the checkpoint directory up front: the only directory creation in
    # the notebook is a commented-out os.mkdir, so the first save would fail
    # after a full epoch of training whenever ./Models did not exist yet.
    os.makedirs('./Models',exist_ok=True)
    for i in range(epochs):
        # A fresh generator per epoch restarts the pass at the first image.
        generator=data_generator(vocab_dic,vocab_size,word_to_idx,encoded_img_dic,max_len,batch_size)
        actual_model.fit_generator(generator,epochs=1,steps_per_epoch=steps,verbose=1)
        actual_model.save('./Models/model'+str(i)+'.h5')
        

In [None]:
# os.mkdir('Models')

In [None]:
train()

In [None]:
max_len

## Predicting the caption using the trained model

In [None]:
def predict_caption(img):
    """Greedily decode a caption for one image.

    Parameters
    ----------
    img : numpy.ndarray
        Raw (not yet preprocessed) pixel data that can be reshaped to
        ``(1, 224, 224, 3)``. The array is left untouched.

    Returns
    -------
    str
        The predicted words joined by spaces, without the ``startseq`` /
        ``endseq`` markers.
    """
    # preprocess_input may overwrite a float array in place, and reshape only
    # returns a view, so work on a copy to keep the caller's image displayable.
    batch=np.array(img,dtype='float32').reshape(1,224,224,3)
    batch=preprocess_input(batch)
    # The caption model takes image features of shape (batch, 2048); the
    # original reshaped to (1, 2048, 1), which does not match that input.
    feature_vector=new_model.predict(batch).reshape((1,-1))
    words=['startseq']
    for _ in range(max_len):
        seq=[word_to_idx[w] for w in words if w in word_to_idx]
        seq=pad_sequences([seq],maxlen=max_len,padding='post')
        y_pred=int(actual_model.predict([feature_vector,seq]).argmax())
        # Index 0 is the padding slot and has no word; the original raised
        # KeyError on it. Treat it as the end of the caption.
        word=idx_to_word.get(y_pred)
        if word is None or word=='endseq':
            break
        words.append(word)
    # Only real words were appended, so dropping the leading 'startseq' is all
    # that is needed. The original always cut the last token as well, which
    # lost a real word whenever the loop ended without predicting 'endseq'.
    return ' '.join(words[1:])

In [None]:
# load_img takes target_size as (height, width); the channel count does not belong in it.
img=image.load_img('../input/flickr-image-dataset/flickr30k_images/flickr30k_images/10010052.jpg',target_size=(224,224))
img=image.img_to_array(img)
# The array holds 0..255 pixel values as floats; imshow wants floats in 0..1.
plt.imshow(img/255)

In [None]:
predict_caption(img)

In [None]:
# imshow clips float images to [0, 1], so rescale the 0..255 pixels the same
# way the preview cell above does.
# NOTE(review): preprocess_input inside predict_caption may have overwritten img
# in place; if the picture looks wrong, reload it before plotting — confirm.
plt.imshow(img/255)

In [None]:
ls -l --b=M  ./Models/model5.h5 | cut -d " " -f5