In [139]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from pprint import pprint
from collections import Counter

In [140]:
stopwords = stopwords.words('english')
stopwords.extend(['from', 'city', 'tall', 'building', 'image', 'capture', 'street', 'white', 'car', 'black', 'sky', 'table', 'moped', 'red', 'two', 'house', 'water', 'boat', 'stand', 'scence', 'heart'])

In [141]:
main_data = pd.read_csv('results/post_survey.csv')
caption_a = main_data['caption_A'].tolist()
caption_b = main_data['caption_B'].tolist()

In [142]:
# Remove punctuation
caption_a = [simple_preprocess(str(doc), deacc=True) for doc in caption_a]
caption_b = [simple_preprocess(str(doc), deacc=True) for doc in caption_b]

# Remove stopwords
caption_a = [[word for word in simple_preprocess(str(doc)) if word not in stopwords] for doc in caption_a]
caption_b = [[word for word in simple_preprocess(str(doc)) if word not in stopwords] for doc in caption_b]

# Lemmatize
lemmatizer = WordNetLemmatizer()
caption_a = [[lemmatizer.lemmatize(word) for word in simple_preprocess(str(doc))] for doc in caption_a]
caption_b = [[lemmatizer.lemmatize(word) for word in simple_preprocess(str(doc))] for doc in caption_b]

# Remove specific words that are not useful
extend_list = ['capture', 'scene', 'urban']
caption_a = [[word for word in simple_preprocess(str(doc)) if word not in extend_list] for doc in caption_a]
caption_b = [[word for word in simple_preprocess(str(doc)) if word not in extend_list] for doc in caption_b]

In [143]:
# Get the top 10 most common words in the captions - caption_a and caption_b
def get_top_words(caption):
    words = [word for doc in caption for word in doc]
    word_freq = Counter(words)
    top_words = word_freq.most_common(10)
    return top_words

top_words_a = get_top_words(caption_a)
top_words_b = get_top_words(caption_b)

In [144]:
print('Top 10 words in caption A:', top_words_a)
print('Top 10 words in caption B:', top_words_b)

Top 10 words in caption A: [('serene', 24), ('bustling', 23), ('vibrant', 17), ('adorned', 11), ('glow', 11), ('cozy', 10), ('moment', 10), ('nestled', 10), ('amidst', 9), ('bathed', 9)]
Top 10 words in caption B: [('serene', 27), ('vibrant', 21), ('bustling', 20), ('dominating', 19), ('large', 16), ('color', 13), ('man', 11), ('standing', 10), ('building', 10), ('woman', 10)]


In [145]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

caption_a = list(sent_to_words(caption_a))
caption_b = list(sent_to_words(caption_b))

print(caption_a[:1])
print(caption_b[:1])

[['cozy', 'kitchen', 'refrigerator', 'standing', 'amidst', 'backdrop', 'wooden', 'cabinet', 'shelf', 'adorned', 'various', 'item', 'touch', 'personal', 'flair']]
[['cozy', 'kitchen', 'dominating', 'center', 'frame', 'large', 'refrigerator', 'standing', 'sleek', 'right', 'wooden', 'cabinet', 'glass', 'door', 'showcase', 'array', 'dish', 'glassware']]


In [146]:
# Create Dictionary
caption_a_id2word = corpora.Dictionary(caption_a)
caption_b_id2word = corpora.Dictionary(caption_b)

# Create Corpus
caption_a_texts = caption_a
caption_b_texts = caption_b

# Term Document Frequency
caption_a_corpus = [caption_a_id2word.doc2bow(text) for text in caption_a_texts]
caption_b_corpus = [caption_b_id2word.doc2bow(text) for text in caption_b_texts]

# View
print(caption_a_corpus[:1])
print(caption_b_corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1)]]
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]]


In [147]:
# Build LDA model
caption_a_lda_model = LdaModel(corpus=caption_a_corpus,
                               id2word=caption_a_id2word,
                               num_topics=5,
                               random_state=100,
                               update_every=1,
                               chunksize=100,
                               passes=10,
                               alpha='auto',
                               per_word_topics=True)

caption_b_lda_model = LdaModel(corpus=caption_b_corpus,
                                id2word=caption_b_id2word,
                                num_topics=5,
                                random_state=100,
                                update_every=1,
                                chunksize=100,
                                passes=10,
                                alpha='auto',
                                per_word_topics=True)

In [148]:
# Print the Keyword in the 10 topics
pprint(caption_a_lda_model.print_topics())

[(0,
  '0.019*"bustling" + 0.016*"serene" + 0.013*"vibrant" + 0.013*"color" + '
  '0.013*"boat" + 0.013*"market" + 0.010*"harbor" + 0.010*"mix" + '
  '0.010*"silver" + 0.010*"side"'),
 (1,
  '0.015*"serene" + 0.015*"glow" + 0.012*"bustling" + 0.010*"townhouses" + '
  '0.010*"row" + 0.010*"setting" + 0.010*"warm" + 0.010*"nestled" + '
  '0.010*"bathed" + 0.010*"pink"'),
 (2,
  '0.013*"moment" + 0.013*"modern" + 0.011*"parked" + 0.011*"serene" + '
  '0.011*"painted" + 0.011*"blend" + 0.011*"architecture" + 0.011*"bustling" + '
  '0.009*"window" + 0.009*"large"'),
 (3,
  '0.012*"serene" + 0.010*"adorned" + 0.010*"man" + 0.010*"room" + '
  '0.008*"bustling" + 0.008*"home" + 0.008*"bathed" + 0.008*"greenery" + '
  '0.008*"amidst" + 0.008*"vibrant"'),
 (4,
  '0.012*"vibrant" + 0.012*"bustling" + 0.011*"coffee" + 0.009*"stand" + '
  '0.009*"amidst" + 0.009*"lush" + 0.009*"soft" + 0.009*"cozy" + '
  '0.009*"serene" + 0.009*"lamp"')]


In [149]:
pprint(caption_b_lda_model.print_topics())

[(0,
  '0.015*"serene" + 0.013*"boat" + 0.013*"cozy" + 0.013*"wooden" + '
  '0.010*"across" + 0.010*"kitchen" + 0.010*"array" + 0.007*"vibrant" + '
  '0.007*"tree" + 0.007*"color"'),
 (1,
  '0.019*"serene" + 0.017*"large" + 0.013*"dominating" + 0.012*"color" + '
  '0.012*"boat" + 0.011*"bustling" + 0.010*"vibrant" + 0.010*"window" + '
  '0.009*"facade" + 0.008*"frame"'),
 (2,
  '0.017*"serene" + 0.013*"inviting" + 0.013*"lush" + 0.013*"amidst" + '
  '0.013*"nestled" + 0.012*"stand" + 0.012*"backdrop" + 0.010*"seated" + '
  '0.010*"highway" + 0.010*"desk"'),
 (3,
  '0.020*"vibrant" + 0.015*"standing" + 0.015*"dominating" + 0.015*"bustling" '
  '+ 0.013*"man" + 0.012*"architecture" + 0.011*"building" + 0.010*"feature" + '
  '0.010*"blue" + 0.010*"large"'),
 (4,
  '0.013*"serene" + 0.010*"bustling" + 0.010*"range" + 0.010*"building" + '
  '0.010*"stone" + 0.009*"dominating" + 0.007*"reflecting" + 0.007*"steel" + '
  '0.007*"kitchen" + 0.007*"light"')]


In [150]:
# Print the top 10 words in each topic
def get_top_words_lda(lda_model):
    top_words = []
    for i in range(5):
        top_words.append(lda_model.show_topic(i, 1))
    return top_words

top_words_a_lda = get_top_words_lda(caption_a_lda_model)
top_words_b_lda = get_top_words_lda(caption_b_lda_model)

print('Top words in caption A LDA:', top_words_a_lda)
print('Top words in caption B LDA:', top_words_b_lda)

Top 10 words in caption A LDA: [[('bustling', 0.01917184)], [('serene', 0.014575318)], [('moment', 0.01328289)], [('serene', 0.012236543)], [('vibrant', 0.011524313)]]
Top 10 words in caption B LDA: [[('serene', 0.015270692)], [('serene', 0.019419868)], [('serene', 0.017491214)], [('vibrant', 0.020404482)], [('serene', 0.012934917)]]


In [151]:
# Compute Perplexity
print('\nPerplexity: ', caption_a_lda_model.log_perplexity(caption_a_corpus))  # a measure of how good the model is. lower the better.
print('\nPerplexity: ', caption_b_lda_model.log_perplexity(caption_b_corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -6.678012996944656

Perplexity:  -6.583239955791799


In [152]:
# Compute Coherence Score
caption_a_coherence_model_lda = CoherenceModel(model=caption_a_lda_model, texts=caption_a, dictionary=caption_a_id2word, coherence='c_v')
caption_a_coherence_lda = caption_a_coherence_model_lda.get_coherence()
print('Coherence Score for caption A: ', caption_a_coherence_lda)

caption_b_coherence_model_lda = CoherenceModel(model=caption_b_lda_model, texts=caption_b, dictionary=caption_b_id2word, coherence='c_v')
caption_b_coherence_lda = caption_b_coherence_model_lda.get_coherence()
print('Coherence Score for caption B: ', caption_b_coherence_lda)

Coherence Score for caption A:  0.38956967028974


Coherence Score for caption B:  0.4131604442484684


In [98]:
# Visualize the topics
import pyLDAvis.gensim
import pickle
import pyLDAvis
import os

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('results/ldavis_prepared_'+str(5))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(caption_a_lda_model, caption_a_corpus, caption_a_id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
    
pyLDAvis.save_html(LDAvis_prepared, 'results/ldavis_prepared_'+ str(5) +'.html')