In [189]:
from PIL import Image
import requests as rq
from io import BytesIO
import pandas as pd
import numpy as np
# from keras.applications.resnet50 import ResNet50
from keras.preprocessing import image
# from keras.applications.resnet50 import preprocess_input, decode_predictions
from keras.preprocessing.image import img_to_array, load_img
from keras.applications.densenet import DenseNet121
from keras.layers import (LSTM, Embedding, Input, BatchNormalization, 
                          Dense, RepeatVector, Concatenate, 
                          TimeDistributed, Dropout)
from keras.models import Sequential, Model
from keras.optimizers import Adam
# from keras.applications.inception_v3 import preprocess_input, InceptionV3
from keras.preprocessing.image import img_to_array, load_img
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
from keras.applications.vgg16 import VGG16, preprocess_input
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers.merge import add
from random import randint

In [None]:
# Pre-process images
# Use VGG16 as a fixed feature extractor: load the full classifier and cut it
# just before the final classification layer, so every image is described by
# the activations of the last hidden fully-connected layer (the 4096-d vector
# the captioning model's image input expects).
cnn_model = VGG16()
# Select the penultimate layer explicitly instead of calling `layers.pop()`.
# On current Keras releases `layers` is a property that returns a fresh list,
# so popping from it does not change the model and `layers[-1]` would still
# be the 1000-way softmax. `layers[-2]` picks the same layer on every version.
cnn_model = Model(inputs=cnn_model.inputs, outputs=cnn_model.layers[-2].output)
num_images = 500

data = pd.read_csv("masterdata.csv")
images = []
features = []
image_dim = 224
for url in data.photo[:num_images]:
    # Fail fast (and with a clear error) on a hung or dead URL. Silently
    # skipping an image here would shift `features` out of step with the
    # captions, which are matched to it purely by position.
    response = rq.get(url, timeout=30)
    response.raise_for_status()
    # Force three channels: grayscale, palette and RGBA images would otherwise
    # produce an array whose channel count VGG16 rejects.
    img = Image.open(BytesIO(response.content)).convert('RGB').resize((image_dim, image_dim))
    x = image.img_to_array(img)
    # Add the leading batch axis expected by `predict`: (1, height, width, channels).
    x = x.reshape((1, x.shape[0], x.shape[1], x.shape[2]))
    x = preprocess_input(x)
    f = cnn_model.predict(x)
    features.append(f)
    
    

In [None]:
# Get captions
# One caption per processed image, with embedded newlines turned into spaces
# so that every caption is a single line of text.
captions = [text.replace('\n', ' ') for text in data.caption[:num_images]]

In [None]:
# Tokenize captions
# Character-level, case-preserving tokenizer; only tabs and newlines are filtered out.
tokenizer = Tokenizer(char_level=True, lower=False, filters='\t\n')
tokenizer.fit_on_texts(captions)

# Reserve two extra ids directly after the fitted vocabulary to mark the
# beginning and the end of a caption.
start = len(tokenizer.word_index) + 1
stop = start + 1
# One past the largest id in use, i.e. the size needed for one-hot targets.
vocab_size = stop + 1

# Wrap every encoded caption as [start, <character ids...>, stop].
encoded_captions = [
    [start] + char_ids + [stop]
    for char_ids in tokenizer.texts_to_sequences(captions)
]


In [None]:
# Expand every caption into (image features, caption prefix) -> next-token
# training samples, one sample per token to be predicted.
max_cap = max(len(c) for c in encoded_captions)
X1 = []  # image feature batch, repeated for every prefix of that image's caption
X2 = []  # caption prefix, padded to max_cap
y = []   # one-hot encoding of the token that follows the prefix

for i, c in enumerate(encoded_captions):
    # Start at 1 so that every prefix contains at least the start token.
    # j == 0 would yield an empty (all-padding) prefix whose target is the
    # start token itself; generation always seeds the sequence with `start`,
    # so that sample teaches nothing useful and only adds a degenerate input.
    for j in range(1, len(c)):
        in_seq, out_seq = c[:j], c[j]
        in_seq = pad_sequences([in_seq], max_cap)[0]
        out_seq = to_categorical(out_seq, num_classes=vocab_size)
        X1.append(features[i])
        X2.append(in_seq)
        y.append(out_seq)
# Each entry of `features` is a single-image batch of shape (1, n_features);
# drop that singleton axis so X1 becomes a 2-D (n_samples, n_features) array.
X1 = np.reshape(X1, (np.shape(X1)[0], np.shape(X1)[2]))

In [None]:
# Size of the output layer / embedding table: one past the largest token id.
vocab_size = stop + 1

# feature extractor model: compress the 4096-d image features to 256 units
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# sequence model: embed the padded caption prefix and summarise it with an LSTM
inputs2 = Input(shape=(max_cap,))
# mask_zero=True makes the downstream LSTM skip the zero padding
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# decoder model: merge both 256-d branches by element-wise addition and
# predict a distribution over the next token
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# tie it together [image, seq] [word]
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])

# summarize model
# `summary()` prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

In [None]:
# Hold out the last fifth of the samples for validation; the first four
# fifths (in their existing order) are used for training.
amt_data = (len(X1) * 4) // 5

X1train, X1test = np.array(X1[:amt_data]), np.array(X1[amt_data:])
X2train, X2test = np.array(X2[:amt_data]), np.array(X2[amt_data:])
ytrain, ytest = np.array(y[:amt_data]), np.array(y[amt_data:])

# Train on [image features, caption prefix] -> next token, reporting
# validation metrics on the held-out samples after every epoch.
model.fit(
    [X1train, X2train],
    ytrain,
    epochs=20,
    verbose=1,
    validation_data=([X1test, X2test], ytest),
)





In [None]:
def to_letter(yhat, word_index=None):
    """Map a token id back to the character it encodes.

    Args:
        yhat: Integer token id (a plain int or a NumPy integer).
        word_index: Optional ``{token: id}`` mapping to search. Defaults to
            the notebook-level ``tokenizer.word_index``.

    Returns:
        The token whose id equals ``yhat``, or ``''`` when no token has that
        id. Returning an empty string instead of an implicit ``None`` keeps
        the result safe to concatenate onto a caption string.
    """
    if word_index is None:
        word_index = tokenizer.word_index
    for k, v in word_index.items():
        if v == yhat:
            return k
    return ''

In [None]:
# Greedy character-by-character decoding for three randomly chosen images:
# print the reference caption, then the caption produced by the model.
for sample_no in range(3):
    # randint() includes BOTH endpoints, so the upper bound has to be the last
    # valid index; bound it by what was actually loaded rather than by the
    # requested image count.
    i = randint(0, min(len(captions), len(features)) - 1)
    print(captions[i])

    f = np.array(features[i])
    pred_encoded = [start]
    pred_capt = ''

    # At most 100 decoding steps, so the loop always terminates.
    for step in range(100):
        pred_pad = np.array(pad_sequences([pred_encoded], max_cap))
        # Cast to a plain int so the comparisons below are ordinary value
        # comparisons (identity checks with `is` never match NumPy integers).
        yhat = int(np.argmax(model.predict([f, pred_pad])))
        if yhat == start or yhat == stop:
            break
        letter = to_letter(yhat)
        if not letter:
            # No character is mapped to this id (e.g. the padding id 0), so
            # there is nothing meaningful to append; stop decoding.
            break
        pred_encoded.append(yhat)
        pred_capt += letter
    print(pred_capt)
    print()
        
        
        