In [1]:
import pandas as pd 
import numpy as np 
import os
import pickle

In [3]:
from tensorflow.keras.applications.vgg16 import VGG16,preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical,plot_model
from tensorflow.keras.layers import Input,Dense,LSTM,Embedding,Dropout,add

In [4]:
model = VGG16()
model = Model(inputs = model.inputs,outputs = model.layers[-2].output)
print(model.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [5]:
from tqdm.notebook import tqdm

In [6]:
BASE_DIR = '/Users/91887/imageClassification/archive (1)'
WORKING_DIR = '/Users/91887/imageClassification'

In [None]:
features = {}
directory = os.path.join(BASE_DIR,'Images')

for img_name in tqdm(os.listdir(directory)):
    #load the image from file 
    img_path = directory+'/'+img_name
    image = load_img(img_path,target_size = (224,224))
    #image pixel to numpy array 
    image = img_to_array(image)
    #reshapre data for model
    image =  image.reshape((1,image.shape[0],image.shape[1],image.shape[2]))
    #preprocess image for vgg
    image = preprocess_input(image)
    #Extract the Features
    feature = model.predict(image,verbose=0)
    #get Image ID
    image_id = img_name.split('.')[0]
    #store the Features
    features[image_id] = feature

In [None]:
# Store Features in pickle 

pickle.dump(features,open(os.path.join(WORKING_DIR,'features.pkl'),'wb'))

In [7]:
# load Features from pickle 

with open(os.path.join(WORKING_DIR,'features.pkl'),'rb') as f:
    features = pickle.load(f)

In [8]:
with open(os.path.join(BASE_DIR,'captions.txt'),'r') as f:
    next(f)
    captions_doc = f.read()

In [9]:
# Create mapping of images to captions

# Creating Dict name mapping 

mapping = {}

#process lines 

for line in tqdm(captions_doc.split('\n')):
    #Split the line by comma
    tokens = line.split(',')
    if len(line)<2:
        continue
    image_id, caption = tokens[0], tokens[1:]
    #remove exe from image_id
    image_id = image_id.split('.')[0]
    #convert caption list into the string 
    caption =" ".join(caption)
    #Create a list 
    if image_id not in mapping:
        mapping[image_id] = []
    mapping[image_id].append(caption)

  0%|          | 0/40456 [00:00<?, ?it/s]

In [10]:
def load_descriptions(doc):
    # Create mapping of images to captions

    # Creating Dict name mapping 

    mapping = {}

    #process lines 

    for line in tqdm(captions_doc.split('\n')):
        #Split the line by comma
        tokens = line.split(',')
        if len(line)<2:
            continue
        image_id, caption = tokens[0], tokens[1:]
        #remove exe from image_id
        image_id = image_id.split('.')[0]
        #convert caption list into the string 
        caption =" ".join(caption)
        #Create a list 
        if image_id not in mapping:
            mapping[image_id] = []
        mapping[image_id].append(caption)
    return mapping

In [11]:
descriptions = load_descriptions(captions_doc)

  0%|          | 0/40456 [00:00<?, ?it/s]

In [12]:
import string

In [13]:
def clean(descriptions):
    # prepare translation table for removing punctuation
    table = str.maketrans('', '', string.punctuation)
    for key,captions in descriptions.items():
        for i in range(len(captions)):
            caption = captions[i]
            #Preprocessing steps
            caption = caption.lower()
            caption = caption.replace('[^A-Za-z]', '')  #Delete special character and digit
            caption = caption.replace('\s+',' ')
            # add Start and End tags to the caption
            caption = 'startseq ' + " ".join([word for word in caption.split() if len(word)>1]) + ' endseq'
            captions[i] = caption

In [14]:
clean(descriptions)

In [15]:
# convert the loaded descriptions into a vocabulary of words
def to_vocabulary(descriptions):
    all_captions = []
    for key in descriptions:
        for caption in descriptions[key]:
            all_captions.append(caption)
    return all_captions

In [16]:
vocabulary = to_vocabulary(descriptions)
print('Vocabulary Size: %d' % len(vocabulary))

Vocabulary Size: 40455


In [17]:
pickle.dump(vocabulary,open(os.path.join(WORKING_DIR,'tokenizer.pkl'),'wb'))

In [18]:
vocabulary[:10]

['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
 'startseq girl going into wooden building endseq',
 'startseq little girl climbing into wooden playhouse endseq',
 'startseq little girl climbing the stairs to her playhouse endseq',
 'startseq little girl in pink dress going into wooden cabin endseq',
 'startseq black dog and spotted dog are fighting endseq',
 'startseq black dog and tri-colored dog playing with each other on the road endseq',
 'startseq black dog and white dog with brown spots are staring at each other in the street endseq',
 'startseq two dogs of different breeds looking at each other on the road endseq',
 'startseq two dogs on pavement moving toward each other endseq']

In [19]:
# save descriptions to file, one per line
def save_descriptions(descriptions, filename):
        lines = list()
        for key, desc_list in descriptions.items():
            for desc in desc_list:
                lines.append(key + ' ' + desc)
        data = '\n'.join(lines)
        file = open(filename, 'w')
        file.write(data)
        file.close()

In [20]:
save_descriptions(descriptions, 'descriptions.txt')

In [21]:
image_ids = list(descriptions.keys())
split = int(len(image_ids)*0.80)
split

6472

In [22]:
train = image_ids[:split]
test = image_ids[split:]

In [23]:
from pickle import load

In [24]:
# load doc into memory
def load_doc(filename):
        # open the file as read only
        file = open(filename, 'r')
        # read all text
        text = file.read()
        # close the file
        file.close()
        return text

In [25]:
# load a pre-defined list of photo identifiers
def load_set(filename):
        doc = load_doc(filename)
        dataset = list()
        # process line by line
        for line in doc.split('\n'):
            # skip empty lines
            if len(line) < 1:
                continue
            # get the image identifier
            identifier = line.split('.')[0]
            dataset.append(identifier)
        return set(dataset)

In [26]:
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
        # load document
        doc = load_doc(filename)
        descriptions = dict()
        for line in doc.split('\n'):
            # split line by white space
            tokens = line.split()
            # split id from description
            image_id, image_desc = tokens[0], tokens[1:]
            # skip images not in the set
            if image_id in dataset:
                # create list
                if image_id not in descriptions:
                    descriptions[image_id] = list()
                # wrap description in tokens
                desc = ' '.join(image_desc)
                # store
                descriptions[image_id].append(desc)
        return descriptions

In [27]:
# load photo features
def load_photo_features(filename, dataset):
        # load all features
        all_features = load(open(filename, 'rb'))
        # filter features
        features = {k: all_features[k] for k in dataset}
        return features

In [28]:
print('Dataset: %d' % len(train))

Dataset: 6472


In [29]:
# descriptions
train_descriptions = load_clean_descriptions('descriptions.txt', train)

In [30]:
print('Descriptions: train=%d' % len(train_descriptions))

Descriptions: train=6472


In [31]:
train_features = load_photo_features('features.pkl', train)
print('Photos: train=%d' % len(train_features))

Photos: train=6472


In [32]:
train_descriptions

{'1000268201_693b08cb0e': ['startseq child in pink dress is climbing up set of stairs in an entry way endseq',
  'startseq girl going into wooden building endseq',
  'startseq little girl climbing into wooden playhouse endseq',
  'startseq little girl climbing the stairs to her playhouse endseq',
  'startseq little girl in pink dress going into wooden cabin endseq'],
 '1001773457_577c3a7d70': ['startseq black dog and spotted dog are fighting endseq',
  'startseq black dog and tri-colored dog playing with each other on the road endseq',
  'startseq black dog and white dog with brown spots are staring at each other in the street endseq',
  'startseq two dogs of different breeds looking at each other on the road endseq',
  'startseq two dogs on pavement moving toward each other endseq'],
 '1002674143_1b742ab4b8': ['startseq little girl covered in paint sits in front of painted rainbow with her hands in bowl endseq',
  'startseq little girl is sitting in front of large painted rainbow ends

In [33]:
# load doc into memory
def load_doc(filename):
        # open the file as read only
        file = open(filename, 'r')
        # read all text
        text = file.read()
        # close the file
        file.close()
        return text

In [34]:
# load a pre-defined list of photo identifiers
def load_set(filename):
        doc = load_doc(filename)
        dataset = list()
        # process line by line
        for line in doc.split('\n'):
            # skip empty lines
            if len(line) < 1:
                continue
            # get the image identifier
            identifier = line.split('.')[0]
            dataset.append(identifier)
        return set(dataset)

In [35]:
# load clean descriptions into memory
def load_clean_descriptions(filename, dataset):
        # load document
        doc = load_doc(filename)
        descriptions = dict()
        for line in doc.split('\n'):
            # split line by white space
            tokens = line.split()
            # split id from description
            image_id, image_desc = tokens[0], tokens[1:]
            # skip images not in the set
            if image_id in dataset:
                # create list
                if image_id not in descriptions:
                    descriptions[image_id] = list()
                # wrap description in tokens
                desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
                # store
                descriptions[image_id].append(desc)
        return descriptions

In [36]:
# load photo features
def load_photo_features(filename, dataset):
        # load all features
        all_features = load(open(filename, 'rb'))
        # filter features
        features = {k: all_features[k] for k in dataset}
        return features

In [37]:
# covert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
        all_desc = list()
        for key in descriptions.keys():
            [all_desc.append(d) for d in descriptions[key]]
        return all_desc

In [38]:
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
        lines = to_lines(descriptions)
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(lines)
        return tokenizer


In [39]:
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 7748


In [40]:
def max_length(descriptions):
        lines = to_lines(descriptions)
        return max(len(d.split()) for d in lines)


In [41]:
def create_sequences(tokenizer, max_length, desc_list, photo):
        X1, X2, y = list(), list(), list()
        # walk through each description for the image
        for desc in desc_list:
            # encode the sequence
            seq = tokenizer.texts_to_sequences([desc])[0]
            # split one sequence into multiple X,y pairs
            for i in range(1, len(seq)):
                # split into input and output pair
                in_seq, out_seq = seq[:i], seq[i]
                # pad input sequence
                in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
                # encode output sequence
                out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
                # store
                X1.append(photo)
                X2.append(in_seq)
                y.append(out_seq)
        return np.array(X1), np.array(X2), np.array(y)

In [42]:
def define_model(vocab_size, max_length):
        # feature extractor model
        inputs1 = Input(shape=(4096,))
        fe1 = Dropout(0.5)(inputs1)
        fe2 = Dense(256, activation='relu')(fe1)
        # sequence model
        inputs2 = Input(shape=(max_length,))
        se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
        se2 = Dropout(0.5)(se1)
        se3 = LSTM(256)(se2)
        # decoder model
        decoder1 = add([fe2, se3])
        decoder2 = Dense(256, activation='relu')(decoder1)
        outputs = Dense(vocab_size, activation='softmax')(decoder2)
        # tie it together [image, seq] [word]
        model = Model(inputs=[inputs1, inputs2], outputs=outputs)
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        # summarize model
        print(model.summary())
        return model

In [43]:
def data_generator(descriptions, photos, tokenizer, max_length):
        # loop for ever over images
        while 1:
            for key, desc_list in descriptions.items():
                # retrieve the photo feature
                photo = photos[key][0]
                in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
                yield [[in_img, in_seq], out_word]

In [44]:
tokenizer = create_tokenizer(train_descriptions)

In [45]:
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
# determine the maximum sequence length
max_length = max_length(train_descriptions)
print('Description Length: %d' % max_length)

Vocabulary Size: 7748
Description Length: 35


In [None]:
# train the model
model = define_model(vocab_size, max_length)
# train the model, run epochs manually and save after each epoch
epochs = 25
steps = len(train_descriptions)
for i in range(epochs):
        # create the data generator
        generator = data_generator(train_descriptions, train_features, tokenizer, max_length)
        # fit for one epoch
        model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)
        model.save('model_(1)' + str(i) + '.h5')

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 35)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 4096)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 35, 256)      1983488     ['input_3[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 4096)         0           ['input_2[0][0]']                
                                                                                            

  model.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1)


 814/6472 [==>...........................] - ETA: 2:53:02 - loss: 3.4186

In [44]:
print('Dataset: %d' % len(test))
# descriptions
test_descriptions = load_clean_descriptions('descriptions.txt', test)
print('Descriptions: test=%d' % len(test_descriptions))
# photo features
test_features = load_photo_features('features.pkl', test)
print('Photos: test=%d' % len(test_features))

Dataset: 1619
Descriptions: test=1619
Photos: test=1619


In [46]:
from tensorflow.keras.models import load_model

In [47]:
filename = 'model_9.h5'
model = load_model(filename)

In [59]:
# map an integer to a word
def word_for_id(integer, tokenizer):
        for word, index in tokenizer.word_index.items():
            if index == integer:
                return word
        return None

In [77]:
# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length):
        # seed the generation process
        in_text = 'startseq'
        # iterate over the whole length of the sequence
        for i in range(max_length):
            # integer encode input sequence
            sequence = tokenizer.texts_to_sequences([in_text])[0]
            # pad input
            sequence = pad_sequences([sequence], maxlen=max_length)
            # predict next word
            yhat = model.predict([photo,sequence], verbose=0)
            # convert probability to integer
            yhat = np.argmax(yhat)
            # map integer to word
            word = word_for_id(yhat, tokenizer)
            # stop if we cannot map the word
            if word is None:
                break
            # append as input for generating the next word
            in_text += ' ' + word
            # stop if we predict the end of the sequence
            if word == 'endseq':
                break
        return in_text


In [61]:
def evaluate_model(model, descriptions, photos, tokenizer, max_length):
        actual, predicted = list(), list()
        # step over the whole set
        for key, desc_list in descriptions.items():
            # generate description
            yhat = generate_desc(model, tokenizer, photos[key], max_length)
            # store actual and predicted
            references = [d.split() for d in desc_list]
            actual.append(references)
            predicted.append(yhat.split())
        # calculate BLEU score
        print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
        print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
        print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
        print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))


In [62]:
import numpy as np

In [64]:
from nltk.translate.bleu_score import corpus_bleu


In [66]:
evaluate_model(model, test_descriptions, test_features, tokenizer, max_length)

BLEU-1: 0.482821
BLEU-2: 0.236160
BLEU-3: 0.155306
BLEU-4: 0.066607


In [79]:
def extract_features(filename):
        model = VGG16()
        model = Model(inputs = model.inputs,outputs = model.layers[-2].output)
        # load the photo
        image = load_img(filename, target_size=(224, 224))
        # convert the image pixels to a numpy array
        image = img_to_array(image)
        # reshape data for the model
        image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
        # prepare the image for the VGG model
        image = preprocess_input(image)
        # get features
        feature = model.predict(image, verbose=0)
        return feature
    
# load the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(vocabulary)
# pre-define the max sequence length (from training)
max_length = 35
# load the model
model = load_model('model_9.h5')
# load and prepare the photograph
photo = extract_features('sample.jpg')
# generate description
description = generate_desc(model, tokenizer, photo, max_length)
print(description)

Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x000001D263931F70>
Traceback (most recent call last):
  File "C:\Users\91887\anaconda3\lib\weakref.py", line 371, in remove
    self = selfref()
KeyboardInterrupt: 


startseq man in red and white is runs stands in the men endseq


In [81]:
# Remove the startseq and endseq
query = description
stopwords = ['startseq','endseq']
querywords = query.split()

resultwords  = [word for word in querywords if word.lower() not in stopwords]
result = ' '.join(resultwords)

print(result)

man in red and white is runs stands in the men


In [83]:
photo=extract_features('sample2.jpg')
description = generate_desc(model, tokenizer, photo, max_length)
print(description)

startseq man in black shirt is ball on the run endseq


In [84]:
# Remove the startseq and endseq
query = description
stopwords = ['startseq','endseq']
querywords = query.split()

resultwords  = [word for word in querywords if word.lower() not in stopwords]
result = ' '.join(resultwords)

print(result)

man in black shirt is ball on the run


In [85]:
def predict_captions(image_name):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(vocabulary)
    # pre-define the max sequence length (from training)
    max_length = 35
    # load the model
    model = load_model('model_9.h5')
    # load and prepare the photograph
    photo = extract_features(image_name)
    # generate description
    description = generate_desc(model, tokenizer, photo, max_length)
    query = description
    stopwords = ['startseq','endseq']
    querywords = query.split()
    resultwords  = [word for word in querywords if word.lower() not in stopwords]
    result = ' '.join(resultwords)
    return result

'man in black shirt is ball on the run'