In [1]:
from functions import *
import json
from langid.langid import LanguageIdentifier, model
from collections import Counter
import operator
import random
import os
import langid
import seaborn as sns
import matplotlib.colors as mcolors
#https://towardsdatascience.com/visualizing-topic-models-with-scatterpies-and-t-sne-f21f228f7b02
#https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
#https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/#6.-What-is-the-Dominant-topic-and-its-percentage-contribution-in-each-document

In [13]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
#import pyLDAvis
#import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + stopwords.words('french')

In [14]:
#base_path = "/media/ruben/Data Drive/react-data/protest/{}".format("carlo")
base_path = "D:/react-data/protest/{}".format("selection2")

In [15]:
# Functions
def sent_to_words(sentences):
        for sentence in sentences:
            yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def TopicTerms(topic):
    tm = dict(lda_model.show_topics(formatted=False,num_words=10))
    ttt = [i[0] for i in tm[topic]]
    return " | ".join(ttt)

In [16]:
# Import Data

dt = dict()
id2photo = dict()

for photo in [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]:
    photo_folder = os.path.join(base_path, photo)
    num_iterations = [fol for fol in os.listdir(photo_folder) if os.path.isdir(os.path.join(photo_folder,fol)) and "source" not in fol and "context" not in fol]
    num_iterations = len(num_iterations)

    start_iter = 1
    range_iter = [str(i) for i in list(range(1,num_iterations + 1))]

    folder_base = os.path.join(base_path,photo,photo)
    
    if photo not in id2photo.keys():
        id2photo.update({photo:list()})
    
    for iteration in range_iter:
        fn = os.path.join(folder_base + "_" +str(iteration),"txt", "parsed_text.json")
        
        with open(fn) as fp:
            pages = json.load(fp)
            
        for identifier,sentences in pages.items():

            id2photo[photo].append(identifier)
            sentences = [s.replace("\n","").lower() for s in sentences]
            sentences = [re.sub(' +', ' ', s) for s in sentences]
            
            language = langid.classify(" ".join(sentences))[0]
            if language not in dt.keys():
                dt.update({language:dict()})
            
            dt[language].update({identifier:" ".join(sentences)})

In [19]:
def Coherence(num_topics):
    sentences_ids = [[k,v] for k,v in dt['en'].items()]
    sentences = [i[1] for i in sentences_ids]
    sentences = list(sent_to_words(sentences))
    
    sentences = remove_stopwords(sentences)
    id2word = corpora.Dictionary(sentences)
    corpus = [id2word.doc2bow(text) for text in sentences]
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=num_topics, 
                                               random_state=100,
                                               update_every=1,
                                               chunksize=50,
                                               passes=10,
                                               alpha='auto',
                                               per_word_topics=True) #minimum_probability=0.0
    # Compute Perplexity
    print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.
    perp = lda_model.log_perplexity(corpus)
    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=sentences, dictionary=id2word, coherence='c_v')
    coh = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coh)
    return [perp,coh]

In [21]:
d = dict()
for i in range(3,12):
    d.update({i:Coherence(i)})


Perplexity:  -8.640459906826967

Coherence Score:  0.4693401156172364

Perplexity:  -8.600929167548133

Coherence Score:  0.4452442826691174

Perplexity:  -8.579109908394148

Coherence Score:  0.4268836794738028

Perplexity:  -8.584628587966929

Coherence Score:  0.3815249853234279

Perplexity:  -8.53972758393893

Coherence Score:  0.49834176339030495

Perplexity:  -8.55614595186874

Coherence Score:  0.5305474187840149

Perplexity:  -8.556326939224281

Coherence Score:  0.47679782322850744

Perplexity:  -8.54605829647355

Coherence Score:  0.46253402854450715

Perplexity:  -8.547995041370287

Coherence Score:  0.5090746538366271


In [22]:
pd.DataFrame(list(d.items()))

Unnamed: 0,0,1
0,3,
1,4,
2,5,
3,6,
4,7,
5,8,
6,9,
7,10,
8,11,


In [23]:
d

{3: None,
 4: None,
 5: None,
 6: None,
 7: None,
 8: None,
 9: None,
 10: None,
 11: None}