In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models import LdaModel
from gensim.utils import simple_preprocess
from pprint import pprint
from collections import Counter

In [83]:
stopwords = stopwords.words('english')
stopwords.extend(['from', 'city', 'tall', 'building', 'image', 'capture', 'street', 'white', 'car', 'black', 'sky', 'table', 'moped', 'red', 'two', 'house', 'water', 'boat'])

In [84]:
main_data = pd.read_csv('results/post_survey.csv')
caption_a = main_data['caption_A'].tolist()
caption_b = main_data['caption_B'].tolist()

In [85]:
# Remove punctuation
caption_a = [simple_preprocess(str(doc), deacc=True) for doc in caption_a]
caption_b = [simple_preprocess(str(doc), deacc=True) for doc in caption_b]

# Remove stopwords
caption_a = [[word for word in simple_preprocess(str(doc)) if word not in stopwords] for doc in caption_a]
caption_b = [[word for word in simple_preprocess(str(doc)) if word not in stopwords] for doc in caption_b]

# Lemmatize
lemmatizer = WordNetLemmatizer()
caption_a = [[lemmatizer.lemmatize(word) for word in simple_preprocess(str(doc))] for doc in caption_a]
caption_b = [[lemmatizer.lemmatize(word) for word in simple_preprocess(str(doc))] for doc in caption_b]

In [86]:
# Get the top 10 most common words in the captions - caption_a and caption_b
def get_top_words(caption):
    words = [word for doc in caption for word in doc]
    word_freq = Counter(words)
    top_words = word_freq.most_common(10)
    return top_words

top_words_a = get_top_words(caption_a)
top_words_b = get_top_words(caption_b)

In [87]:
print('Top 10 words in caption A:', top_words_a)
print('Top 10 words in caption B:', top_words_b)

Top 10 words in caption A: [('capture', 32), ('scene', 29), ('heart', 25), ('serene', 24), ('bustling', 23), ('vibrant', 17), ('stand', 14), ('urban', 12), ('adorned', 11), ('glow', 11)]
Top 10 words in caption B: [('capture', 54), ('scene', 48), ('serene', 27), ('vibrant', 21), ('bustling', 20), ('dominating', 19), ('heart', 18), ('large', 16), ('color', 13), ('stand', 12)]


In [88]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

caption_a = list(sent_to_words(caption_a))
caption_b = list(sent_to_words(caption_b))

print(caption_a[:1])
print(caption_b[:1])

[['cozy', 'kitchen', 'scene', 'refrigerator', 'standing', 'amidst', 'backdrop', 'wooden', 'cabinet', 'shelf', 'adorned', 'various', 'item', 'touch', 'personal', 'flair']]
[['capture', 'cozy', 'kitchen', 'scene', 'dominating', 'center', 'frame', 'large', 'refrigerator', 'standing', 'sleek', 'right', 'wooden', 'cabinet', 'glass', 'door', 'showcase', 'array', 'dish', 'glassware']]


In [89]:
# Create Dictionary
caption_a_id2word = corpora.Dictionary(caption_a)
caption_b_id2word = corpora.Dictionary(caption_b)

# Create Corpus
caption_a_texts = caption_a
caption_b_texts = caption_b

# Term Document Frequency
caption_a_corpus = [caption_a_id2word.doc2bow(text) for text in caption_a_texts]
caption_b_corpus = [caption_b_id2word.doc2bow(text) for text in caption_b_texts]

# View
print(caption_a_corpus[:1])
print(caption_b_corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)]]
[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]]


In [90]:
# Build LDA model
caption_a_lda_model = LdaModel(corpus=caption_a_corpus,
                               id2word=caption_a_id2word,
                               num_topics=5,
                               random_state=100,
                               update_every=1,
                               chunksize=100,
                               passes=10,
                               alpha='auto',
                               per_word_topics=True)

caption_b_lda_model = LdaModel(corpus=caption_b_corpus,
                                id2word=caption_b_id2word,
                                num_topics=5,
                                random_state=100,
                                update_every=1,
                                chunksize=100,
                                passes=10,
                                alpha='auto',
                                per_word_topics=True)

In [91]:
# Print the Keyword in the 10 topics
pprint(caption_a_lda_model.print_topics())

[(0,
  '0.013*"stand" + 0.013*"heart" + 0.011*"serene" + 0.010*"cozy" + '
  '0.010*"coffee" + 0.010*"bench" + 0.007*"capture" + 0.007*"woman" + '
  '0.007*"bustling" + 0.007*"blue"'),
 (1,
  '0.019*"scene" + 0.019*"capture" + 0.018*"bustling" + 0.016*"vibrant" + '
  '0.016*"heart" + 0.010*"serene" + 0.010*"urban" + 0.009*"color" + '
  '0.008*"boat" + 0.008*"stand"'),
 (2,
  '0.016*"bustling" + 0.016*"heart" + 0.013*"monitor" + 0.013*"woman" + '
  '0.010*"desk" + 0.010*"scene" + 0.010*"computer" + 0.007*"amidst" + '
  '0.007*"home" + 0.007*"sidewalk"'),
 (3,
  '0.018*"cozy" + 0.011*"capture" + 0.011*"heart" + 0.011*"set" + 0.011*"glow" '
  '+ 0.011*"bathed" + 0.011*"scene" + 0.011*"kitchen" + 0.011*"wooden" + '
  '0.008*"vibrant"'),
 (4,
  '0.024*"capture" + 0.024*"serene" + 0.016*"scene" + 0.014*"nestled" + '
  '0.010*"glow" + 0.010*"lush" + 0.008*"row" + 0.008*"soft" + 0.008*"green" + '
  '0.008*"roof"')]


In [92]:
pprint(caption_b_lda_model.print_topics())

[(0,
  '0.026*"capture" + 0.023*"scene" + 0.019*"serene" + 0.014*"boat" + '
  '0.013*"painted" + 0.012*"backdrop" + 0.011*"stand" + 0.011*"standing" + '
  '0.011*"shade" + 0.011*"amidst"'),
 (1,
  '0.028*"capture" + 0.026*"scene" + 0.016*"vibrant" + 0.013*"bustling" + '
  '0.010*"dominating" + 0.009*"tree" + 0.009*"large" + 0.008*"standing" + '
  '0.008*"heart" + 0.008*"man"'),
 (2,
  '0.025*"heart" + 0.014*"seated" + 0.014*"woman" + 0.014*"bustling" + '
  '0.009*"vibrant" + 0.009*"cozy" + 0.009*"man" + 0.009*"attention" + '
  '0.009*"kitchen" + 0.009*"truck"'),
 (3,
  '0.036*"capture" + 0.030*"scene" + 0.022*"serene" + 0.016*"color" + '
  '0.014*"dominating" + 0.013*"facade" + 0.013*"architecture" + 0.013*"modern" '
  '+ 0.011*"large" + 0.011*"reflecting"'),
 (4,
  '0.030*"capture" + 0.025*"scene" + 0.015*"serene" + 0.014*"dominating" + '
  '0.013*"wall" + 0.012*"green" + 0.012*"soft" + 0.012*"light" + 0.010*"stone" '
  '+ 0.010*"vibrant"')]


In [94]:
# Print the top 10 words in each topic
def get_top_words_lda(lda_model):
    top_words = []
    for i in range(5):
        top_words.append(lda_model.show_topic(i, 1))
    return top_words

top_words_a_lda = get_top_words_lda(caption_a_lda_model)
top_words_b_lda = get_top_words_lda(caption_b_lda_model)

print('Top 10 words in caption A LDA:', top_words_a_lda)
print('Top 10 words in caption B LDA:', top_words_b_lda)

Top 10 words in caption A LDA: [[('stand', 0.013256938)], [('scene', 0.019245142)], [('bustling', 0.016315464)], [('cozy', 0.018314531)], [('capture', 0.023877779)]]
Top 10 words in caption B LDA: [[('capture', 0.025909618)], [('capture', 0.0284813)], [('heart', 0.024514847)], [('capture', 0.035603095)], [('capture', 0.029718285)]]


In [78]:
# Compute Perplexity
print('\nPerplexity: ', caption_a_lda_model.log_perplexity(caption_a_corpus))  # a measure of how good the model is. lower the better.
print('\nPerplexity: ', caption_b_lda_model.log_perplexity(caption_b_corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -6.593530161224401

Perplexity:  -6.43324910353426


In [79]:
# Compute Coherence Score
caption_a_coherence_model_lda = CoherenceModel(model=caption_a_lda_model, texts=caption_a, dictionary=caption_a_id2word, coherence='c_v')
caption_a_coherence_lda = caption_a_coherence_model_lda.get_coherence()
print('Coherence Score for caption A: ', caption_a_coherence_lda)

caption_b_coherence_model_lda = CoherenceModel(model=caption_b_lda_model, texts=caption_b, dictionary=caption_b_id2word, coherence='c_v')
caption_b_coherence_lda = caption_b_coherence_model_lda.get_coherence()
print('Coherence Score for caption B: ', caption_b_coherence_lda)

Coherence Score for caption A:  0.32544308491555546
Coherence Score for caption B:  0.29999396279139234


In [98]:
# Visualize the topics
import pyLDAvis.gensim
import pickle
import pyLDAvis
import os

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('results/ldavis_prepared_'+str(5))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(caption_a_lda_model, caption_a_corpus, caption_a_id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
    
pyLDAvis.save_html(LDAvis_prepared, 'results/ldavis_prepared_'+ str(5) +'.html')

In [96]:
! pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m942.1 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1
