#### Notebook purpose
This notebook aims to summarize the sample of sports and politics articles collected on http://eventregistry.org into a finite number of main "topics" using the Latent Dirichlet Allocation (LDA) model. Due to time restrictions we did not fine-tune either the models or the number of topics, but this could have been done using likelihood or perplexity criteria.

#### Load data and set up parameters used in the rest of the notebook

In [211]:
import phq_utils.utils_lda as utils

In [212]:
import pandas as pd

In [None]:
# Root folders, relative to this notebook.
# data_path holds the input CSVs and every intermediate artifact
# (dictionaries, bag-of-words corpora, topic-weight CSVs).
data_path = '../../data/'
# Must be spelled `models_path`: that is the name the LDA training, topic
# inspection and projection cells read when saving/loading models. The former
# spelling `modelspath` left those cells failing with a NameError on a fresh kernel.
models_path = '../../models/'

In [213]:
# Load the two article samples, one CSV per category, from the data folder.
# Later cells read the 'body' and 'title' columns of these frames.
sports_csv = f'{data_path}sports_10k.csv'
politics_csv = f'{data_path}politics_10k.csv'
sports_df = pd.read_csv(filepath_or_buffer=sports_csv)
politics_df = pd.read_csv(filepath_or_buffer=politics_csv)

In [214]:
from gensim.corpora import Dictionary
from gensim.corpora.mmcorpus import MmCorpus

In [215]:
# Number of articles per category; used to derive min_wordcount and to label
# the model / topic-weight output files.
n_docs = 10_000
# Number of topics fitted by every LDA model.
n_topics = 20
# Passed as `no_above` to Dictionary.filter_extremes.
max_freq = 0.4
# Passed as `no_below` to Dictionary.filter_extremes: 1% of n_docs.
min_wordcount = int(n_docs * 0.01)

#### Build dictionary and bag of words for sports articles contents

In [216]:
%time docs = [utils.process_text(row['body']) for index, row in sports_df.iterrows()]

CPU times: user 2min 42s, sys: 259 ms, total: 2min 42s
Wall time: 2min 42s


In [217]:
# Vocabulary for the sports bodies: built from the processed docs, pruned with
# filter_extremes (no_below=min_wordcount, no_above=max_freq), then saved so the
# LDA cells can reload it.
sports_body_dict_file = f'{data_path}sports_body.dict'
dictionary = Dictionary(documents=docs)
dictionary.filter_extremes(no_above=max_freq, no_below=min_wordcount)
dictionary.save(sports_body_dict_file)

In [218]:
# Convert each processed sports body to a bag-of-words over the pruned
# vocabulary and serialize the resulting corpus to disk.
sports_body_bow_file = f'{data_path}sports_body.bow'
bows = list(map(dictionary.doc2bow, docs))
MmCorpus.serialize(fname=sports_body_bow_file, corpus=bows)

#### Build dictionary and bag of words for politics articles contents

In [219]:
%time docs = [utils.process_text(row['body']) for index, row in politics_df.iterrows()]

CPU times: user 3min 55s, sys: 327 ms, total: 3min 55s
Wall time: 3min 55s


In [220]:
# Vocabulary for the politics bodies: built from the processed docs, pruned with
# filter_extremes (no_below=min_wordcount, no_above=max_freq), then saved so the
# LDA cells can reload it.
politics_body_dict_file = f'{data_path}politics_body.dict'
dictionary = Dictionary(documents=docs)
dictionary.filter_extremes(no_above=max_freq, no_below=min_wordcount)
dictionary.save(politics_body_dict_file)

In [221]:
# Convert each processed politics body to a bag-of-words over the pruned
# vocabulary and serialize the resulting corpus to disk.
politics_body_bow_file = f'{data_path}politics_body.bow'
bows = list(map(dictionary.doc2bow, docs))
MmCorpus.serialize(fname=politics_body_bow_file, corpus=bows)

#### Build dictionary and bag of words for sports articles titles

In [229]:
%time docs = [utils.process_text(row['title'] if not pd.isna(row['title']) else '') for index, row in sports_df.iterrows()]

CPU times: user 3.68 s, sys: 24.2 ms, total: 3.71 s
Wall time: 3.71 s


In [230]:
# Vocabulary for the sports titles: built from the processed docs, pruned with
# filter_extremes (no_below=min_wordcount, no_above=max_freq), then saved so the
# LDA cells can reload it.
sports_title_dict_file = f'{data_path}sports_title.dict'
dictionary = Dictionary(documents=docs)
dictionary.filter_extremes(no_above=max_freq, no_below=min_wordcount)
dictionary.save(sports_title_dict_file)

In [231]:
# Convert each processed sports title to a bag-of-words over the pruned
# vocabulary and serialize the resulting corpus to disk.
sports_title_bow_file = f'{data_path}sports_title.bow'
bows = list(map(dictionary.doc2bow, docs))
MmCorpus.serialize(fname=sports_title_bow_file, corpus=bows)

#### Build dictionary and bag of words for politics articles titles

In [232]:
%time docs = [utils.process_text(row['title'] if not pd.isna(row['title']) else '') for index, row in politics_df.iterrows()]

CPU times: user 3.85 s, sys: 6.24 ms, total: 3.85 s
Wall time: 3.85 s


In [233]:
# Vocabulary for the politics titles: built from the processed docs, pruned with
# filter_extremes (no_below=min_wordcount, no_above=max_freq), then saved so the
# LDA cells can reload it.
politics_title_dict_file = f'{data_path}politics_title.dict'
dictionary = Dictionary(documents=docs)
dictionary.filter_extremes(no_above=max_freq, no_below=min_wordcount)
dictionary.save(politics_title_dict_file)

In [234]:
# Convert each processed politics title to a bag-of-words over the pruned
# vocabulary and serialize the resulting corpus to disk.
politics_title_bow_file = f'{data_path}politics_title.bow'
bows = list(map(dictionary.doc2bow, docs))
MmCorpus.serialize(fname=politics_title_bow_file, corpus=bows)

#### LDA modeling for politics and sports articles on body and titles

In [206]:
from gensim.models import ldamodel

In [235]:
%%time
for category in ['sports', 'politics']:
    for item in ['body', 'title']:
        dictionary = Dictionary.load(f'{data_path}{category}_{item}.dict')
        corpus = MmCorpus(f'{data_path}{category}_{item}.bow')
        model = ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics, random_state=42)
        model.save(f'{models_path}{category}_{item}_{n_docs}_docs_{n_topics}_topics.model')

CPU times: user 1min 41s, sys: 16 s, total: 1min 57s
Wall time: 23.6 s


#### Look at the 10 most important tokens per topic to get an idea of what each topic represents

In [236]:
import gensim

In [258]:
# Maps '<category>_<item>' to a list with one label per topic; each label is the
# topic's three highest-ranked tokens joined by ' \n'. The projection cell uses
# these labels as column names.
topic_tokens_top3 = {}
for category in ('sports', 'politics'):
    for item in ('body', 'title'):
        dictionary = Dictionary.load(f'{data_path}{category}_{item}.dict')
        model = gensim.models.LdaModel.load(f'{models_path}{category}_{item}_{n_docs}_docs_{n_topics}_topics.model')
        print(f'Topics for {category} articles {item}:')
        topic_labels = []
        for ix in range(model.num_topics):
            # get_topic_terms gives (token_id, weight) pairs; translate the ids
            # back to token strings through the dictionary.
            topic_tokens_list = [
                dictionary[token_id]
                for token_id, token_weight in model.get_topic_terms(ix, 10)
            ]
            print(f'topic {ix} ~ ({", ".join(topic_tokens_list)})')
            topic_labels.append(' \n'.join(topic_tokens_list[:3]))
        topic_tokens_top3[f'{category}_{item}'] = topic_labels
        print()

Topics for sports articles body:
topic 0 ~ (title, stpierre, ufc, champion, mma, championship, state, fight, athlete, usa)
topic 1 ~ (trail, bike, brand, area, photo, running, event, mountain, health, park)
topic 2 ~ (event, golf, club, olympic, tour, championship, people, best, olympics, surfing)
topic 3 ~ (player, football, coach, play, basketball, baseball, even, thing, really, good)
topic 4 ~ (league, player, season, club, goal, side, champion, fan, win, point)
topic 5 ~ (second, cup, win, round, three, irish, race, player, winner, got)
topic 6 ~ (car, speed, engine, fury, system, model, power, drive, feature, driving)
topic 7 ~ (cricket, rugby, player, cup, india, england, play, icc, good, season)
topic 8 ~ (tennis, player, know, win, say, think, career, even, going, still)
topic 9 ~ (woman, hockey, girl, cricket, player, state, school, coach, national, men)
topic 10 ~ (live, ireland, watch, match, bbc, england, sky, wale, six, saturday)
topic 11 ~ (rugby, india, player, cup, coun

#### Project each article in the corresponding topic space and save coordinates

In [259]:
import numpy as np

In [261]:
%%time
for category in ['sports', 'politics']:
    for item in ['body', 'title']:
        corpus = MmCorpus(f'{data_path}{category}_{item}.bow')
        model = gensim.models.LdaModel.load(f'{models_path}{category}_{item}_{n_docs}_docs_{n_topics}_topics.model')
        corpus_rep_in_topic_space = []
        for bow in corpus:
            z = np.zeros(n_topics)
            for topic, weight in model[bow]:
                z[topic] = weight
            corpus_rep_in_topic_space.append(z.tolist())
        df = pd.DataFrame(corpus_rep_in_topic_space)
        df.columns = topic_tokens_top3[f'{category}_{item}']
        df.to_csv(f'{data_path}{category}_{item}_{n_docs}_docs_{n_topics}_topics_lda_weights.csv', index=False)

CPU times: user 1min 23s, sys: 11.7 s, total: 1min 35s
Wall time: 19.3 s
