In [11]:
import tensorflow as tf
import keras
from keras.models import load_model
from keras.layers import Dense
from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras import Model,layers
from tensorflow.keras.optimizers import RMSprop, Adam
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score, confusion_matrix
from nltk.translate.bleu_score import sentence_bleu
import shutil

# Common data handling libraries
import os
import string
import pandas as pd
import numpy as np
import glob
import pickle
# Gensim for LDA
import gensim
# NLTK for text processing
import nltk
nltk.download('stopwords')
# spacy for Lemmatization
import spacy
# Visualization
import matplotlib.pyplot as plt

In [31]:
# Dataset selector. 'flickr8k' reads captions/images from the Kaggle input
# directory; 'coco' prepares a working directory that the COCO loading cell
# fills with downloaded images.
data_name = 'flickr8k'

if data_name == 'flickr8k':
    caption_path = "/kaggle/input/flickr8k/flickr8k/flickr8k/Flickr8k.token.txt"
    img_path = '/kaggle/input/flickr8k/flickr8k-images/flickr8k-images/'
elif data_name == 'coco':
    # exist_ok=True keeps the cell re-runnable (a plain mkdir fails the second time)
    os.makedirs('/kaggle/working/coco_datasets', exist_ok=True)
else:
    # Fail fast: every later cell depends on a valid dataset choice
    raise ValueError("Choose appropriate dataname choices = ['flickr8k', 'coco']")

In [13]:
if data_name=='coco':
    # !wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip -P /kaggle/working
    # !unzip /kaggle/working/annotations_trainval2017.zip
    !pip install pycocotools
    from pycocotools.coco import COCO
    import numpy as np
    import skimage.io as io
    import matplotlib.pyplot as plt
    import pylab
    # initialize COCO api for instance annotations
    coco_caps=COCO('annotations/captions_train2017.json')
    img_captions=[]
    img_url=[]
    img_ids=[]
    i=0
    import json
    # Opening JSON file
    f = open('/kaggle/working/annotations/captions_train2017.json')
    dd = json.load(f)
    for dicti in dd['images']:
        annIds = coco_caps.getAnnIds(imgIds=dicti['id'])
        anns = coco_caps.loadAnns(annIds)
        captions = []
        for d in anns:
            captions.append(d['caption'])
        img_captions.append(' '.join(captions))
        img_url.append(dicti['coco_url'])
        img_ids.append('/kaggle/working/coco_datasets/'+str(dicti['id'])+'.jpg')
        I = io.imread(dicti['coco_url'])
        io.imsave('/kaggle/working/coco_datasets/'+str(dicti['id'])+'.jpg', I)
        if i==8000: break
        if i%500==0: print(i)
        i+=1
    
    img_cap_df = pd.DataFrame({})
    img_cap_df['image_id'] = img_ids #img_url[:8000]
    img_cap_df['caption'] = img_captions#[:8001]

In [32]:
if data_name=='flickr8k':
    # Read the whole caption file ("<image>#<n>\t<caption>" per line)
    with open(caption_path, 'r') as file:
        data = file.read()

    image_id_list, caption_list = [], []

    # All image paths present on disk, normalised to forward slashes
    img_file_names_list = glob.glob(img_path + '*.jpg')
    img_file_names_list = [img_file.replace('\\', '/') for img_file in img_file_names_list]
    # Set copy for O(1) membership tests inside the loop below
    available_images = set(img_file_names_list)

    for line in data.split('\n'):
        # skip empty lines
        if len(line) < 2:
            continue
        # split image id and caption on the first tab only, so a tab inside
        # the caption text does not break the unpacking
        image_id, caption = line.split('\t', 1)
        # drop the '#<n>' caption-index suffix from the image id
        image_id = image_id.split('#')[0]
        # turn the bare file name into a full path
        image_id = os.path.join(img_path, image_id)

        # keep only captions whose image actually exists on disk
        if image_id in available_images:
            image_id_list.append(image_id)
            caption_list.append(caption)

    img_cap_df = pd.DataFrame({'image_id':image_id_list, 'caption':caption_list})
    # Group the captions by image_id to form a single sentence for each image
    img_cap_df = img_cap_df.groupby('image_id')['caption'].apply(lambda x : ' '.join(x)).reset_index(name='caption')

In [34]:
# Preview the first rows of the image/caption table
img_cap_df.head()

In [35]:
# Show every column and never wrap wide frames across several lines
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
# None disables truncation of long cell values. The former sentinel -1 is
# deprecated and rejected by recent pandas releases.
pd.set_option('display.max_colwidth', None)
img_cap_df.head()

## Data Cleaning

In [36]:
# Cache for the NLTK stop-word list so the corpus file is read only once
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset (loaded on first use)."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def clean_text(data, stop_words=None):
    """Tokenise a caption string and drop noise tokens.

    Steps, in order: split on whitespace, lower-case, strip ASCII punctuation,
    keep purely alphabetic tokens, remove stop words.

    Args:
        data: caption text.
        stop_words: optional iterable of words to remove. Defaults to the
            NLTK English stop-word list.

    Returns:
        List of cleaned tokens in their original order.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    else:
        stop_words = frozenset(stop_words)

    # translation table that deletes every punctuation character
    table = str.maketrans('', '', string.punctuation)
    tokens = (word.lower().translate(table) for word in data.split())

    # isalpha() also discards tokens that became empty or contain digits
    return [token for token in tokens if token.isalpha() and token not in stop_words]


# One cleaned token list per image caption, in DataFrame row order
data_caption = [clean_text(caption) for caption in img_cap_df['caption']]
print(data_caption[:2])

In [37]:
# Lemmatize the cleaned captions with spaCy (parser and NER are switched off)
nlp = spacy.load(r"en_core_web_sm", disable=['parser', 'ner'])

# Only these part-of-speech classes are kept
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']

data_caption_lemmatized = []
for caption_tokens in data_caption:
    # re-join the tokens so spaCy can tag the caption as one text
    parsed_caption = nlp(str(' '.join(caption_tokens)))
    lemmas = [token.lemma_ for token in parsed_caption if token.pos_ in allowed_postags]
    data_caption_lemmatized.append(lemmas)

img_cap_df['caption_lemmatized'] = data_caption_lemmatized
img_cap_df.head()

In [38]:
# Shuffle the rows reproducibly. sort_index() first restores the original row
# order, so re-running this cell yields the same permutation instead of
# shuffling an already shuffled frame.
img_cap_df = img_cap_df.sort_index().sample(frac=1, random_state=42)

# Train / valid / test split: the last 2000 rows are held out,
# 1000 for validation and 1000 for testing.
train_df = img_cap_df[:-2000]
valid_df = img_cap_df[-2000:-1000]
test_df = img_cap_df[-1000:]
print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)

In [39]:
# Vocabulary: maps every lemma seen in the training captions to an integer id
id2word = gensim.corpora.Dictionary(train_df['caption_lemmatized'])
# Bag-of-words (token id, count) representation of each training caption
corpus = [id2word.doc2bow(lemmas) for lemmas in train_df['caption_lemmatized']]
# First document as raw (id, count) pairs ...
print(corpus[:1])
# ... and with the ids resolved back to words
print([[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]])

In [40]:
# Find the optimal number of topics by training one LDA model per candidate
# topic count and scoring each with c_v coherence.
START = 10
LIMIT = 100
STEP = 5
topic_range = range(START, LIMIT, STEP)

coherence_values = []
model_list = []
for num_topics in topic_range:
    print(num_topics)
    # random_state fixes the model initialisation so runs are comparable
    # (NOTE(review): multi-worker training may still not be bit-for-bit deterministic)
    model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics,
                                       random_state=42)
    model_list.append(model)
    coherencemodel = gensim.models.coherencemodel.CoherenceModel(model=model, texts=train_df['caption_lemmatized'],
                                                                 dictionary=id2word, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())

max_coherence_val = 0
optimal_model = None
optimal_num_topics = None

# Print the coherence scores and remember the best-scoring model
for i, (m, cv) in enumerate(zip(topic_range, coherence_values)):
    if max_coherence_val < round(cv, 4):
        optimal_model = model_list[i]
        optimal_num_topics = m
        max_coherence_val = round(cv, 4)

    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

# Plot coherence against the actual topic counts (not the list index)
fig, ax = plt.subplots()
ax.plot(list(topic_range), coherence_values, label="coherence_values")
ax.set_xlabel("Number of Topics")
ax.set_ylabel("Coherence score")
# label= above gives the legend one proper entry; passing the bare string to
# legend() would iterate over its characters
ax.legend(loc='best')
plt.show()

In [41]:
# Print the keywords of the topics of the selected model.
# NOTE(review): print_topics() is called with its defaults, so it may not
# list every topic of the model - confirm against the gensim docs.
for topic in optimal_model.print_topics():
    print(topic)
    
# Topic distribution of every training document under the selected model
doc_lda = optimal_model[corpus]
print('Optimal Number of Topics :', optimal_num_topics)

# Compute Perplexity
# NOTE(review): gensim documents log_perplexity() as a per-word likelihood
# bound rather than the perplexity itself - confirm how to read this number.
print('\nPerplexity: ', optimal_model.log_perplexity(corpus))  # quality measure of the model; see the note above for interpretation

# Compute Coherence Score
# Note: texts here is data_caption_lemmatized (all captions), whereas the
# dictionary and the coherence scores of the search cell use train_df only.
coherence_model_lda = gensim.models.coherencemodel.CoherenceModel(model=optimal_model, texts=data_caption_lemmatized, 
                                                                  dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [42]:
def predictTopics(corpus, optimal_model):
    """Return the dominant topic of every document in a bag-of-words corpus.

    Args:
        corpus: list of bag-of-words documents.
        optimal_model: topic model supporting ``model[corpus]`` (one list of
            ``(topic_id, probability)`` pairs per document) and
            ``model.show_topic(topic_id)`` (``(word, weight)`` pairs).

    Returns:
        DataFrame with columns ``Dominant_Topic`` (int), ``Perc_Contribution``
        (probability rounded to 4 decimals) and ``Topic_Keywords``
        (comma-separated topic words). Documents for which the model returns
        no topic are skipped, so the frame can be shorter than ``corpus``.
        An empty corpus yields an empty frame with the three columns.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    records = []

    for topic_dist in optimal_model[corpus]:
        topic_dist = list(topic_dist)
        if len(topic_dist) == 0:
            continue
        # max() returns the first pair with the highest probability, which is
        # the same pair a stable descending sort would put first
        topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in optimal_model.show_topic(int(topic_num)))
        records.append((int(topic_num), round(float(prop_topic), 4), topic_keywords))

    # Build the frame once; growing it row by row is quadratic and relied on
    # DataFrame.append, which no longer exists in pandas 2.x
    return pd.DataFrame(records, columns=columns)

In [43]:
def get_model(num_classes, model_type='vgg16'):
    """Build a topic classifier on top of a frozen ImageNet VGG network.

    Args:
        num_classes: size of the softmax output (number of topics).
        model_type: backbone to use, 'vgg16' or 'vgg19'.

    Returns:
        A compiled tf.keras model (Adam optimizer with default settings,
        categorical cross-entropy loss).

    Raises:
        ValueError: if ``model_type`` is not a supported backbone.
    """
    supported_models = ('vgg16', 'vgg19')
    if model_type not in supported_models:
        raise ValueError("Unsupported model_type {!r}; choose one of {}".format(model_type, supported_models))

    if model_type == 'vgg16':
        backbone_cls = tf.keras.applications.VGG16
    else:
        backbone_cls = tf.keras.applications.VGG19
    pretrained_model = backbone_cls(weights='imagenet', include_top=True, input_shape=(224,224,3))

    # freeze all pretrained layers
    for layer in pretrained_model.layers:
        layer.trainable = False

    # Branch off the layer *before* the ImageNet softmax. Calling
    # `model.layers.pop()` does not detach the last layer of a tf.keras model,
    # so layers[-1].output would still be the 1000-way class probabilities.
    features = pretrained_model.layers[-2].output

    output_model = tf.keras.layers.Dense(2056, activation='tanh')(features)
    output_model = tf.keras.layers.Dropout(0.5)(output_model)
    output_model = tf.keras.layers.Dense(1024, activation='tanh')(output_model)
    output_model = tf.keras.layers.Dropout(0.5)(output_model)
    output_model = tf.keras.layers.Dense(num_classes, activation='softmax')(output_model)

    caption_model = tf.keras.Model(pretrained_model.input, output_model)
    # 'adam' keeps the optimizer defaults that the model was always compiled with
    caption_model.compile(optimizer='adam', loss='categorical_crossentropy')
    return caption_model

In [44]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras Sequence that loads images from disk and yields (images, labels) batches.

    Args:
        images_paths: array of image file paths.
        labels: array of labels, aligned with ``images_paths``.
        model_type: selects the preprocessing function ('vgg16', 'vgg19' or
            'resnet50'); any other value leaves the pixel array unprocessed.
        image_dimensions: target (height, width, channels); only height and
            width are used when loading.
        batch_size: number of samples per batch.
        shuffle: whether to reshuffle the sample order after every epoch.
    """
    def __init__(self, images_paths, labels, model_type='vgg16', image_dimensions=(224, 224, 3), batch_size=64, shuffle=False):
        # Let the Keras base class initialise its own state
        super().__init__()
        self.labels       = labels              # array of labels
        self.images_paths = images_paths        # array of image paths
        self.image_dim = image_dimensions
        self.batch_size   = batch_size          # batch size
        self.shuffle      = shuffle             # shuffle bool
        self.model_type = model_type

        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch; the last batch may be smaller."""
        # ceil instead of floor so the trailing partial batch is not dropped
        return int(np.ceil(len(self.images_paths) / self.batch_size))

    def on_epoch_end(self):
        """Rebuild the index order, shuffling it when ``shuffle`` is set."""
        self.indexes = np.arange(len(self.images_paths))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __getitem__(self, index):
        """Load and return batch number ``index`` as (images, labels)."""
        # indices of the samples that belong to this batch
        indexes = self.indexes[index * self.batch_size : (index + 1) * self.batch_size]

        labels = np.array([self.labels[k] for k in indexes])
        images = np.array([self.preprocessImageForVGG16(self.images_paths[k], self.model_type) for k in indexes])

        return images, labels

    def preprocessImageForVGG16(self, filename, model_type):
        """Load one image, resize it and apply the backbone-specific preprocessing."""
        # load and resize the image
        image = tf.keras.preprocessing.image.load_img(filename, target_size=(self.image_dim[0], self.image_dim[1]))
        # convert the image pixels to a numpy array
        image = tf.keras.preprocessing.image.img_to_array(image)
        # apply the preprocessing that matches the pretrained network
        if model_type=='vgg16':
            image = tf.keras.applications.vgg16.preprocess_input(image)
        elif model_type=='vgg19':
            image = tf.keras.applications.vgg19.preprocess_input(image)
        elif model_type=='resnet50':
            image = tf.keras.applications.resnet50.preprocess_input(image)
        return image

In [45]:
# Training configuration shared by the cells below
img_rows, img_cols = 224, 224 # Resolution of inputs
channel = 3                   # number of colour channels
batch_size = 64               # batch size passed to the train/valid DataGenerator
nb_epoch = 100                # upper bound on epochs passed to training

In [46]:
# Train one image->topic classifier per (LDA model, CNN backbone) pair, score
# it with BLEU between predicted and ground-truth topic keywords, and keep
# track of the best combination.
model_type = ['vgg16', 'vgg19']
best_bleu_score=-1
best_results_df=[]
optimal_LDA_model=[]
best_pretrained_model=[]
best_topic=0
best_pretrained_model_name=''
topic_model_bleu_score_dict={}

results_root = '/kaggle/working/results'
checkpoint_path = '/kaggle/working/topic_predictor_model.hdf5'
# exist_ok=True keeps the cell re-runnable
os.makedirs(results_root, exist_ok=True)


def _with_dominant_topic(split_df, lda_model):
    """Return a copy of split_df (index reset) with the predictTopics columns appended."""
    bows = [id2word.doc2bow(text) for text in split_df['caption_lemmatized']]
    topics_df = predictTopics(bows, lda_model)
    return pd.concat([split_df.reset_index(drop=True), topics_df], axis=1)


topic_choice = (np.arange(10)*10)[3:4]
print("Topic choice: ", topic_choice)
for i, topic in enumerate(topic_range):
    # Uncomment to restrict the run to the topic counts in topic_choice:
    # if topic not in topic_choice: continue
    print("Topic: ", topic)

    # Fresh output directory per topic count. The default mode is used on
    # purpose: a directory needs the execute bit to be writable into.
    d_path = results_root + '/' + str(topic)
    shutil.rmtree(d_path, ignore_errors=True)
    os.makedirs(d_path)

    topic_model_bleu_score_dict[topic]={m:0 for m in model_type}
    LDA_model = model_list[i]

    # Label every split with the dominant topic of its caption
    train1_df = _with_dominant_topic(train_df, LDA_model)
    valid1_df = _with_dominant_topic(valid_df, LDA_model)
    test1_df = _with_dominant_topic(test_df, LDA_model)

    X_train = train1_df['image_id'].values
    Y_train1 = train1_df['Dominant_Topic'].values
    Y_train = tf.keras.utils.to_categorical(Y_train1, num_classes=topic)

    X_valid = valid1_df['image_id'].values
    Y_valid1 = valid1_df['Dominant_Topic'].values
    Y_valid = tf.keras.utils.to_categorical(Y_valid1, num_classes=topic)

    X_test = test1_df['image_id'].values
    true_topics = test1_df['Dominant_Topic'].values.astype(int)
    Y_test = tf.keras.utils.to_categorical(true_topics, num_classes=topic)

    # Keyword list of every topic, computed once instead of once per test image
    topic_words = [list(dict(LDA_model.show_topic(t)).keys()) for t in range(topic)]

    for mt in model_type:
        print("Model: ", mt)
        model = get_model(topic, mt)

        # prepare data generators (batch_size=1 for test keeps one prediction per image)
        train_data = DataGenerator(X_train, Y_train, model_type=mt, batch_size=batch_size, shuffle=True)
        valid_data = DataGenerator(X_valid, Y_valid, model_type=mt, batch_size=batch_size, shuffle=False)
        test_data = DataGenerator(X_test, Y_test, model_type=mt, batch_size=1, shuffle=False)

        # Drop any checkpoint left over from a previous model so that stale
        # weights can never be loaded below
        if os.path.exists(checkpoint_path):
            os.remove(checkpoint_path)

        # reduces learning rate if no improvement is seen
        learning_rate_reduction = keras.callbacks.ReduceLROnPlateau(monitor='val_loss',patience=2,verbose=1,factor=0.5,min_lr=0.0000001)
        # stop training if no improvement is seen
        early_stop = keras.callbacks.EarlyStopping(monitor="val_loss",mode="min",patience=5)
        # saves the best model weights to file
        checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                     monitor='val_loss',verbose=1,save_best_only=True,mode='min',save_weights_only=True)
        # Model.fit accepts Sequence objects directly and infers the number of
        # steps from len(); fit_generator is deprecated
        history = model.fit(train_data,
                            validation_data=valid_data,
                            epochs=nb_epoch,
                            callbacks=[learning_rate_reduction, early_stop, checkpoint],
                            verbose=2,
                            )

        # Evaluate with the best weights seen during training, not with the
        # weights of the last (possibly worse) epoch
        if os.path.exists(checkpoint_path):
            model.load_weights(checkpoint_path)

        # plot training history
        fig, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.plot(history.history['loss'], label="TrainLoss")
        ax.plot(history.history['val_loss'], label="ValLoss")
        ax.set_xlabel("Epoch")
        ax.set_ylabel("Loss")
        ax.set_title("Topics: " + str(topic) + ", model: " + mt)
        ax.legend(loc='best', shadow=True)
        fig.savefig(d_path+"/Topic_"+str(topic)+"_model_"+mt+"_fig.png")
        plt.show()
        # release the figure; dozens are created over the whole sweep
        plt.close(fig)

        # predict on test data
        pred_caption_topics_prob = model.predict(test_data)
        pred_caption_topics = np.argmax(pred_caption_topics_prob, axis=1)

        pred_words = [topic_words[t] for t in pred_caption_topics]
        ground_truth_words = [topic_words[t] for t in true_topics]

        results_df = pd.DataFrame({ 'image_id':X_test, 'pred_topics':pred_caption_topics, 'ground_truth': true_topics, 'pred_topics_words': pred_words, 'ground_truth_words': ground_truth_words})
        results_df.to_csv(d_path+"/Topic_"+str(topic)+"_model_"+mt+"_df.csv")

        # BLEU: the ground-truth keywords are the reference, the predicted
        # keywords are the hypothesis
        references = ground_truth_words
        hypothesis = pred_words
        bleu_score = [sentence_bleu([ref], hyp) for ref, hyp in zip(references, hypothesis)]
        mean_bleu_score = np.mean(bleu_score)
        topic_model_bleu_score_dict[topic][mt] = mean_bleu_score
        print("BLEU={:4.3f}".format(mean_bleu_score))
        if mean_bleu_score > best_bleu_score:
            best_bleu_score = mean_bleu_score
            best_results_df = results_df
            optimal_LDA_model=LDA_model
            best_pretrained_model=model
            best_pretrained_model_name=mt
            best_topic=topic

In [None]:
# Summary of the best (topic count, backbone) combination found by the sweep
print(f"best_bleu_score:  {best_bleu_score}")
print(f"best_pretrained_model_name:  {best_pretrained_model_name}")
print(f"best_no_topic:  {best_topic}")
best_results_df.head()

In [None]:
# Mean BLEU score per topic count and backbone: {topic: {model_type: score}}
topic_model_bleu_score_dict

In [None]:
# NOTE(review): no visible cell writes into this directory - the word clouds
# are saved under results/<n>/topics_cloud, and the only other reference is a
# commented-out make_archive call. Confirm whether it is still needed.
!mkdir /kaggle/working/topics_cloud

In [None]:
!pip install wordcloud
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
for i, optimal_lda_model in enumerate(model_list):
    mode = 0o666
    if topic_range[i]%10!=0: continue
    d_path = '/kaggle/working/results/'+str(topic_range[i])+'/topics_cloud/'
    shutil.rmtree(d_path, ignore_errors=True)
    os.mkdir(d_path, mode)
    for t in range(optimal_lda_model.num_topics):
        plt.figure()
        plt.imshow(WordCloud().fit_words(dict(optimal_lda_model.show_topic(t))))
        plt.axis("off")
        plt.title("Topic #" + str(t))
#         plt.show()
        plt.savefig('/kaggle/working/results/'+str(topic_range[i])+'/topics_cloud'+'/t'+str(t)+'.png')

In [None]:
import shutil
# Pack the whole results directory into results.zip (created in the current
# working directory) so it can be downloaded in one piece.
# shutil.make_archive('vgg16_vgg_19_topics', 'zip', '/kaggle/working/topics_cloud')
shutil.make_archive('results', 'zip', '/kaggle/working/results')

In [106]:
from PIL import Image
# Open one Flickr8k image by its hard-coded path and display it as the cell output
img = Image.open('/kaggle/input/flickr8k/flickr8k-images/flickr8k-images/3375991133_87d7c40925.jpg')
img

In [None]:
!pip install wordcloud
from wordcloud import WordCloud

In [108]:
# Show the sample image next to the word cloud of one LDA topic.
# model_list[11] is the model trained with topic_range[11] topics; 54 is the
# topic id inside that model.
# NOTE(review): both indices are hard-coded - confirm they match the image.
fig, ax = plt.subplots(1, 2, figsize=(10, 5))
ax[0].imshow(img)
ax[1].imshow(WordCloud().fit_words(dict(model_list[11].show_topic(54))))
# ax.legend(loc='best', shadow=True)
# fig.savefig(d_path+"/Topic_"+str(topic)+"_model_"+mt+"_fig.png")