In [3]:
col_names = [
    'dv', 
    'fv',
    'dv-fv'
]

In [4]:
import configparser
import warnings
from pymongo import MongoClient

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

config = configparser.ConfigParser()
config.read('config.ini')
host = config['DEFAULT']['IP']
port = config['DEFAULT']['MongoDB-Port']

client = MongoClient(host, int(port))
db = client['Media-cloud']

# Topic moddeling for Facebook

In [5]:
records = []
for col_name in col_names:
    col = db[col_name]
    records.extend(list(col.find({"$text": {"$search": 'Facebook'}})))

In [6]:
import spacy
spacy.load('en')
from spacy.lang.en import English

parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if not token.orth_.isalpha():
            continue
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [7]:
from nltk.corpus import wordnet as wn


def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [8]:
from nltk.corpus import stopwords

en_stop = set(stopwords.words('english'))
en_stop.update([
    'family', 
    'domestic',
    'violence',
    'one', 
    'get',
    'would',
    'could',
    'go',
    'get',
    'take',
    'want',
    'make',
    'give'
])

In [9]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if get_lemma(token) not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [10]:
text_data = []
for record in records:
    text = record['text']
    tokens = prepare_text_for_lda(text)
    text_data.append(tokens)

In [28]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

import pickle
pickle.dump(corpus, open('output/corpus.pkl', 'wb'))
dictionary.save('output/dictionary.gensim')

In [36]:
import gensim

NUM_TOPICS = 3
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('output/model.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.012*"woman" + 0.007*"people" + 0.005*"police" + 0.005*"facebook"')
(1, '0.014*"woman" + 0.005*"people" + 0.005*"child" + 0.004*"australia"')
(2, '0.011*"police" + 0.008*"people" + 0.007*"child" + 0.005*"government"')


In [37]:
dictionary = gensim.corpora.Dictionary.load('output/dictionary.gensim')
corpus = pickle.load(open('output/corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('output/model.gensim')

import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)