In [1]:
import tensorflow as tf
import keras
import pandas as pd
import numpy as np

from keras.models import Model
from keras.layers import Dense
from keras.utils import plot_model

from keras.preprocessing import image
from keras.applications.vgg16 import VGG16

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Activation, LSTM, CuDNNGRU, CuDNNLSTM
from keras.layers import Dropout, Conv1D, MaxPooling1D, GlobalMaxPool1D
from keras.layers import Bidirectional, InputLayer

from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Image directories for the training and validation splits.
trainImgs, valImgs = "COCO/segTrain2017/", "COCO/segVal2017/"

# Caption annotation CSVs for the same two splits (read with pd.read_csv below).
trainCaps, valCaps = (
    "COCO/annotations/captions_train2017.csv",
    "COCO/annotations/captions_val2017.csv",
)

# Output locations; presumably for model checkpoints and plots
# (not referenced in the cells visible here — confirm against later cells).
ckPtDir, plotDir = "log/imgCaption", "log/plot"

In [3]:
# Load the training captions; the first CSV column becomes the row index.
train = pd.read_csv(filepath_or_buffer=trainCaps, index_col=0)
train

Unnamed: 0,image_id,id,caption
0,391895,770337,A man with a red helmet on a small moped on a ...
1,391895,771687,Man riding a motor bike on a dirt road on the ...
2,391895,772707,A man riding on the back of a motorcycle.
3,391895,776154,A dirt path with a young person on a motor bik...
4,391895,781998,A man in a red shirt and a red hat is on a mot...
...,...,...,...
591748,475546,283,The patrons enjoy their beverages at the bar.
591749,475546,5620,People having a drink in a basement bar.
591750,475546,5716,A group of friends enjoys a drink while sittin...
591751,475546,8002,Group of people drinking wine at a public loca...


In [4]:
# Load the validation captions; the first CSV column becomes the row index.
val = pd.read_csv(filepath_or_buffer=valCaps, index_col=0)
val

Unnamed: 0,image_id,id,caption
0,397133,370509,A man is in a kitchen making pizzas.
1,397133,370584,Man in apron standing on front of oven with pa...
2,397133,372252,A baker is working in the kitchen rolling dough.
3,397133,372765,A person standing by a stove in a kitchen.
4,397133,375891,A table with pies being made and a person stan...
...,...,...,...
25009,15335,505078,A group of people sitting at a table with food.
25010,15335,505954,"A man, woman, and boy are sitting at a table."
25011,15335,505978,"A man, woman and child eating together at a re..."
25012,15335,507940,A boy sitting between a man and a woman.


## CNN

In [5]:
# Build VGG16 without loading any saved weights (weights=None) and with the
# top classifier sized via classes=2048.
# NOTE(review): weights=None means the network starts from random initialisation —
# confirm that training from scratch (rather than using pretrained weights) is intended.
base_model = VGG16(classes=2048, weights=None)

# Wrap the same input and output tensors in a functional Model and print its layers.
vgg16 = Model(outputs=base_model.output, inputs=base_model.input)
vgg16.summary()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
___________________________________________

## RNN

In [6]:
# Hyperparameters for the text model defined in the next cell.
MAX_FEATURES = 20000  # Embedding input_dim (number of distinct token ids)
MAX_LEN = 200         # Embedding input_length; also reused as the width of the last Dense layer
EMB_SIZE = 128        # Embedding output dimension
BATCH_SIZE = 512      # not referenced in the cells visible here; presumably the training batch size

In [7]:
# Text model: embedding -> bidirectional CuDNN LSTM (per-step outputs) ->
# max-pool over the time axis -> dropout -> two dense layers.
# NOTE(review): the final layer has MAX_LEN sigmoid units, so its width is tied to the
# input sequence length — confirm this is the intended target size.
model = Sequential([
    Embedding(MAX_FEATURES, EMB_SIZE, input_length=MAX_LEN),
    Bidirectional(CuDNNLSTM(64, return_sequences=True)),
    GlobalMaxPool1D(),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dense(MAX_LEN, activation='sigmoid'),
])

In [8]:
# Compile with standalone Keras' own loss identifier instead of a tf.keras loss object.
# `model` is built from the standalone `keras` package; a tf.keras.losses instance is a
# different class hierarchy, so standalone Keras may not recognise it as binary
# cross-entropy when it resolves metrics=['accuracy'], and can fall back to categorical
# accuracy for this multi-unit sigmoid output. The string form keeps loss and metric
# consistent (binary cross-entropy + binary accuracy).
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 128)          2560000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 128)          99328     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_2 (Dense)              (None, 200)               13000     
Total params: 2,680,584
Tr