# Image Captioning on Flickr8k (ResNet50 features + LSTM decoder)

In [None]:
import pandas as pd

# Read the caption file one line per row. Using read_csv with delimiter='\n'
# is rejected by current pandas releases, and its CSV quote handling can merge
# lines whose captions contain a double quote, so the file is read directly.
with open("../input/flickr8k-text/flickr8k.token.txt", "r", encoding="utf-8") as f:
    # Single unnamed column (label 0), blank lines skipped, so df[0] keeps working.
    df = pd.DataFrame([line for line in f.read().splitlines() if line.strip()])
df.head()

In [None]:
df.shape
df[0][0]

In [None]:
# Map each image id (file name without extension / caption index) to the list
# of its captions, in file order.
desc = {}
for line in df[0]:
    image_field, caption = line.split('\t')
    image_id = image_field.split(".")[0]
    desc.setdefault(image_id, []).append(caption)

In [None]:
type(desc)

In [None]:
desc["1000268201_693b08cb0e"]


# Text Cleaning

In [None]:
import re
def clean_text(sentence):
    """Normalise a caption for vocabulary building.

    The text is lower-cased, every run of characters other than a-z is
    replaced by a single space, and tokens of length 1 are discarded.

    Returns the remaining tokens joined by single spaces.
    """
    lowered = sentence.lower()
    tokens = re.sub("[^a-z]+", " ", lowered).split()
    return " ".join(token for token in tokens if len(token) > 1)
    
    
    


In [None]:
clean_text("A cat is sitting on house # 4")

In [None]:
dir(desc.items())

In [None]:
# Clean every caption in place; slice assignment keeps the same list objects
# that `desc` already references.
for caption_list in desc.values():
    caption_list[:] = [clean_text(caption) for caption in caption_list]

In [None]:
desc["1000268201_693b08cb0e"]

# Creating Vocab

In [None]:
import json

# Persist the cleaned captions as real JSON. Writing str(desc) produced a
# Python repr that could only be read back through a fragile quote-replacement.
with open("desc.txt","w") as f:
    json.dump(desc, f)

In [None]:
import ast
import json

# Load the cleaned captions back from disk.
with open("desc.txt","r") as f:
    raw_descriptions = f.read()

try:
    descriptions = json.loads(raw_descriptions)
except json.JSONDecodeError:
    # desc.txt written as str(dict) is a Python literal, not JSON. Parse it as
    # a literal instead of swapping quote characters, which corrupts any text
    # that itself contains a quote.
    descriptions = ast.literal_eval(raw_descriptions)

In [None]:
print(type(descriptions))

In [None]:
# dir(descriptions)

In [None]:
# Collect every distinct word across all captions. Plain loops replace the
# list comprehension that was only being used for its side effects.
vocab = set()
for captions in descriptions.values():
    for sentence in captions:
        vocab.update(sentence.split())

print("Vocab Size: %d"%len(vocab))

In [None]:
# Flatten all captions into one list of word occurrences (duplicates kept),
# built directly instead of appending from inside a throwaway comprehension.
total_words = [
    word
    for captions in descriptions.values()
    for caption in captions
    for word in caption.split()
]

print("Total Words : %d"%len(total_words))
    

In [None]:
import collections

counter= collections.Counter(total_words)
freq_count = dict(counter)
print(len(freq_count.keys()))

In [None]:
# Keep only words seen more than 10 times, most frequent first. The sort is
# stable, so words with equal counts stay in their freq_count order.
frequent_pairs = [(word, count) for word, count in freq_count.items() if count > 10]
sorted_freq_count = sorted(frequent_pairs, key=lambda pair: pair[1], reverse=True)

In [None]:
sorted_freq_count

In [None]:
# From here on `total_words` is the retained vocabulary (one entry per
# frequent word), no longer the list of every word occurrence.
total_words = [word for word, _count in sorted_freq_count]
print(len(total_words))

# Prepare train and test data

In [None]:
train_data_file= None
with open("../input/flickr8k-text/flickr_8k.trainImages.txt","r") as f:
    train_data_file=f.read()
    

In [None]:
test_data_file=None
with open("../input/flickr8k-text/flickr_8k.testImages.txt","r") as f:
    test_data_file=f.read()

In [None]:
print(train_data_file)

In [None]:
# Image ids are the file names without extension. splitlines() plus the blank
# filter keeps the last id even when the file has no trailing newline, which
# split("\n")[:-1] would silently drop.
train=[row.split(".")[0] for row in train_data_file.splitlines() if row.strip()]

In [None]:
# Image ids are the file names without extension. splitlines() plus the blank
# filter keeps the last id even when the file has no trailing newline, which
# split("\n")[:-1] would silently drop.
test=[row.split(".")[0] for row in test_data_file.splitlines() if row.strip()]

In [None]:
train[:10]

In [None]:
# Wrap every training caption in explicit start/end marker tokens so the
# decoder has a token to begin from and one that signals the end.
train_descriptions = {
    img_id: ["startseq " + cap + " endseq" for cap in descriptions[img_id]]
    for img_id in train
}

# Image Preprocessing

In [None]:
import keras
from keras.applications.resnet50 import ResNet50,preprocess_input

In [None]:
model= ResNet50(weights="imagenet",input_shape=(224,224,3))


In [None]:
dir(model)

In [None]:
model.summary()

In [None]:
from keras.models import Model,load_model
model_new= Model(model.input,model.layers[-2].output)

In [None]:
import keras
from keras.applications.resnet50 import preprocess_input
from keras.preprocessing import image
import numpy as np
def preprocess_img(img):
    """Load an image and prepare it as a one-element batch for ResNet50.

    img: whatever `image.load_img` accepts (the notebook passes file paths).

    The image is loaded at a target size of 224x224, converted to an array,
    given a leading batch axis and passed through ResNet50's
    `preprocess_input`. Returns the preprocessed array (presumably
    (1, 224, 224, 3) for colour images).
    """
    pil_image = image.load_img(img, target_size=(224, 224))
    pixels = image.img_to_array(pil_image)
    batch = np.expand_dims(pixels, axis=0)
    return preprocess_input(batch)

In [None]:

# pyplot is otherwise only imported in a later cell, so on a fresh kernel this
# cell raised NameError on `plt`; import it where it is first used.
import matplotlib.pyplot as plt

IMG_PATH="../input/flickr8k/Images/"
img= preprocess_img(IMG_PATH+"1000268201_693b08cb0e.jpg")
plt.imshow(img[0])
plt.show()
print(img)


In [None]:
IMG_PATH="../input/flickr8k/Images/"
import cv2
import matplotlib.pyplot as plt

img=cv2.imread(IMG_PATH+"1000268201_693b08cb0e.jpg")
img=cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
plt.imshow(img)
plt.axis("off")
plt.show()
print(img)

# Images to features

In [None]:
def encode_image(img):
    """Encode one image into a flat feature vector.

    img: passed unchanged to `preprocess_img` (the notebook passes file paths).

    The preprocessed batch is run through `model_new` and the prediction is
    reshaped to a vector of length 2048, which is returned.
    """
    batch = preprocess_img(img)
    features = model_new.predict(batch)
    return features.reshape((2048,))
            


In [None]:
encode_image(IMG_PATH+"1000268201_693b08cb0e.jpg")

In [None]:
from time import time
# Encode every training image once and keep the vectors keyed by image id.
start = time()
encoding_train = {}

for ix, img_id in enumerate(train):
    encoding_train[img_id] = encode_image(IMG_PATH + "/" + img_id + ".jpg")

    # Report progress every 100 images.
    if ix % 100 == 0:
        print("Encoding in progress Time Step %d"%ix)

end_t = time()
print("Total time taken : ",end_t-start)


In [None]:
import pickle
with open("encoded_train_features.pkl","wb") as f:
    pickle.dump(encoding_train,f)

In [None]:
# Encode every test image once and keep the vectors keyed by image id.
start = time()
encoding_test = {}

for ix, img_id in enumerate(test):
    encoding_test[img_id] = encode_image(IMG_PATH + "/" + img_id + ".jpg")

    # Report progress every 100 images.
    if ix % 100 == 0:
        print("Encoding in progress Time Step %d"%ix)

end_t = time()
print("Total time taken(test) : ",end_t-start)

In [None]:
with open("encoded_test_features.pkl","wb") as f:
    pickle.dump(encoding_test,f)

In [None]:
word_to_idx = {}
idx_to_word = {}

# Word ids start at 1; id 0 is never assigned to a word (the data generator
# pads sequences with 0).
for idx, word in enumerate(total_words, start=1):
    word_to_idx[word] = idx
    idx_to_word[idx] = word
    

In [None]:
word_to_idx["dog"]

In [None]:
idx_to_word[6]

In [None]:
print(len(idx_to_word))

In [None]:
# Give the sequence markers the next free ids. The ids used to be hard-coded
# as 1846/1847, which is only right when the vocabulary holds exactly 1845
# words; a different frequency threshold or dataset would overwrite a word or
# leave a gap. The membership check makes re-running this cell harmless.
for token in ('startseq', 'endseq'):
    if token not in word_to_idx:
        next_idx = len(word_to_idx) + 1
        idx_to_word[next_idx] = token
        word_to_idx[token] = next_idx

# +1 because id 0 is not assigned to any word.
vocab_size= len(word_to_idx)+ 1
print("VOCAB SIZE : ",vocab_size)




In [None]:
# Longest training caption, counted in whitespace-separated tokens
# (including the startseq/endseq markers); 0 when there are no captions.
max_len = max(
    (len(cap.split()) for caps in train_descriptions.values() for cap in caps),
    default=0,
)

print(max_len)

# Custom Data Loader

In [None]:
from keras.utils import to_categorical 
from keras.preprocessing.sequence import pad_sequences
def data_generator(train_descriptions,encoding_train,word_to_idx,max_len,batch_size):
    """Endlessly yield training batches built from `batch_size` images each.

    Parameters
    ----------
    train_descriptions : dict mapping image id -> list of caption strings.
    encoding_train     : dict mapping image id -> image feature vector.
    word_to_idx        : dict mapping word -> integer id; words missing from
                         it are skipped when a caption is encoded.
    max_len            : length every partial caption is post-padded to (with 0).
    batch_size         : number of images (not samples) per yielded batch.

    Yields
    ------
    ([photos, partial_captions], next_words): for each caption, every prefix
    seq[0:i] is paired with the following word seq[i]; the target is one-hot
    encoded with `vocab_size` classes, read from the notebook's global scope.

    The batch check now runs once per image, after all of its captions were
    added. It used to sit inside the per-caption loop, so a batch was cut
    after the first caption of its last image and the rest spilled into the
    next batch; an image with no captions could also stop batches from ever
    being emitted again.
    """
    x1, x2, y = [], [], []
    n = 0  # images accumulated in the current batch

    while True:
        for key, desc_list in train_descriptions.items():
            n += 1
            photo = encoding_train[key]

            for desc in desc_list:
                seq = [word_to_idx[word] for word in desc.split() if word in word_to_idx]

                # One sample per position: prefix -> next word.
                for i in range(1, len(seq)):
                    xi = pad_sequences([seq[0:i]], maxlen=max_len, value=0, padding='post')[0]
                    yi = to_categorical([seq[i]], num_classes=vocab_size)[0]

                    x1.append(photo)
                    x2.append(xi)
                    y.append(yi)

            if n == batch_size:
                yield([np.array(x1),np.array(x2)],np.array(y))
                x1, x2, y = [], [], []
                n = 0
                    
            
            

# Word embeddings using GloVe 6B (50-dimensional)

In [None]:
f = open("../input/glove6b50d/glove.6B.50d.txt",encoding='utf8')

In [None]:
# Parse the GloVe file opened in the previous cell: each line is a word
# followed by its vector components.
embedding_index = {}

try:
    for line in f:
        values = line.split()
        if not values:
            continue  # skip blank lines instead of failing on values[0]
        embedding_index[values[0]] = np.array(values[1:], dtype='float')
finally:
    # The handle was never closed before. Closing it also means an accidental
    # re-run fails loudly instead of silently building an empty index from an
    # exhausted file.
    f.close()

In [None]:
embedding_index['apple']


In [None]:
def get_embedding_matrix():
    """Build the embedding matrix for the notebook vocabulary.

    Reads the globals `vocab_size`, `word_to_idx` and `embedding_index`.
    Returns a (vocab_size, 50) array whose row `idx` holds the vector for the
    word with that id; rows for words absent from `embedding_index` (and any
    id not present in `word_to_idx`) stay all zeros.
    """
    emb_dim = 50
    matrix = np.zeros((vocab_size, emb_dim))

    for word, idx in word_to_idx.items():
        vector = embedding_index.get(word)
        if vector is None:
            continue  # no pretrained vector: leave the zero row
        matrix[idx] = vector

    return matrix

In [None]:
embedding_matrix= get_embedding_matrix()


# Model Implementation

In [None]:
# `add` is imported from keras.layers: the keras.layers.merge module is not
# available in recent Keras releases, while keras.layers.add is.
from keras.layers import Input,Dense,Dropout,Embedding,LSTM,add

# Image branch: 2048-d feature vector -> dropout -> 256-d dense projection.
input_img_features = Input(shape=(2048,))
inp_img1 = Dropout(0.3)(input_img_features)
inp_img2= Dense(256,activation='relu')(inp_img1)

In [None]:
input_captions= Input(shape=(max_len,))
inp_cap1 = Embedding(input_dim=vocab_size,output_dim=50,mask_zero=True)(input_captions)
inp_cap2= Dropout(0.3)(inp_cap1)
inp_cap3= LSTM(256)(inp_cap2)

In [None]:
from keras.applications.resnet50 import decode_predictions
decoder1= add([inp_img2,inp_cap3])
decoder2= Dense(256,activation='relu')(decoder1)
outputs= Dense(vocab_size,activation='softmax')(decoder2)

model = Model(inputs=[input_img_features,input_captions],outputs=outputs)
model.summary()

In [None]:
# Find the embedding layer by type rather than by position: model.layers[2]
# only points at it as long as Keras happens to order the layers that way.
embedding_layer = next(layer for layer in model.layers if isinstance(layer, Embedding))

# Load the pretrained vectors and freeze them so training does not alter them.
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False

In [None]:
model.compile(loss='categorical_crossentropy',optimizer='adam')

# Model Training

In [None]:
epochs=20
batch_size=3
steps=len(train_descriptions)//batch_size

In [None]:
def train():
    """Fit the captioning model for `epochs` passes, saving after each one.

    Reads the globals `epochs`, `steps`, `batch_size`, `model`,
    `train_descriptions`, `encoding_train`, `word_to_idx` and `max_len`.
    A fresh generator is created for every pass, and the model is saved to
    ./model_weights/model_<epoch>.h5 afterwards.

    NOTE: defining this function rebinds the notebook-level name `train`,
    which earlier cells use for the list of training image ids; those cells
    must have run before this one.
    """
    import os

    # model.save does not create missing directories, so make sure the
    # checkpoint folder exists before the first save.
    os.makedirs('./model_weights', exist_ok=True)

    for i in range(epochs):
        generator = data_generator(train_descriptions,encoding_train,word_to_idx,max_len,batch_size)
        model.fit(generator,epochs=1,steps_per_epoch=steps,verbose=1)
        model.save('./model_weights/model_'+str(i)+'.h5')
        
    

In [None]:
train()

In [None]:
# Keep the loaded checkpoint: the return value used to be discarded, so this
# cell had no effect on the `model` used for prediction below.
model = load_model('./model_weights/model_9.h5')

In [None]:
def predict_caption(photo):
    """Greedily decode a caption for one encoded image.

    photo: image feature input for `model.predict` (the notebook passes a
           vector reshaped to (1, 2048)).

    Starting from 'startseq', the most probable next word is appended until
    'endseq' is predicted or `max_len` words were generated. Reads the globals
    `model`, `word_to_idx`, `idx_to_word` and `max_len`.

    Returns the generated words joined by spaces, without the markers.
    """
    in_text = "startseq"
    words = []

    for _ in range(max_len):
        sequence = [word_to_idx[w] for w in in_text.split() if w in word_to_idx]
        sequence = pad_sequences([sequence],maxlen=max_len,padding='post')

        ypred = model.predict([photo,sequence])
        # Greedy sampling: always take the word with the highest probability.
        word = idx_to_word.get(int(ypred.argmax()))

        # Stop on the end marker, or on an id with no word (e.g. padding id 0),
        # which used to raise KeyError.
        if word is None or word == 'endseq':
            break

        words.append(word)
        in_text += ' ' + word

    # Words are collected directly, so the last real word is no longer dropped
    # when decoding stops at max_len without producing 'endseq'.
    return ' '.join(words)
        
        
        
    

In [None]:
# Pick some random images and see Results


# Build the list of test image ids once instead of on every iteration.
all_img_names = list(encoding_test.keys())

for sample_no in range(15):
    # Sample over however many test images were encoded; the bound used to be
    # a hard-coded 1000.
    idx = np.random.randint(0, len(all_img_names))
    img_name = all_img_names[idx]
    photo_2048 = encoding_test[img_name].reshape((1,2048))

    # Separate name for the image so it does not overwrite the loop counter.
    test_image = plt.imread(IMG_PATH+img_name+".jpg")
    caption = predict_caption(photo_2048)
    print(caption)
    plt.imshow(test_image)
    plt.axis('off')
    plt.show()
    
    
    