# LDA
This notebook tries something a little different from my earlier attempts: topic modelling with LDA (Latent Dirichlet Allocation) using the `gensim` package, to see what interesting things we can find out. Maybe this could also be a back-door way into predicting topics.

In [1]:
import pandas as pd

# Load the Irish Times headlines CSV (path is relative to the working directory).
df = pd.read_csv('data/irishtimes-date-text.csv')
# Preview the first rows. The 'Unnamed: 0' column in the output suggests the
# file was written with its index included — NOTE(review): consider
# index_col=0 if that column is not needed downstream.
df.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,19960102,business,Smurfit's share price in retreat despite recor...
1,19960102,business,Jamont plans £5m investment to update plant
2,19960102,business,Management is blamed for most company failures
3,19960102,business,Forte expected to announce a special dividend ...
4,19960102,business,Accountancy firm adopts name change


In [2]:
# Discard every row that has a missing value in any column (modifies df in place).
df.dropna(axis=0, how='any', inplace=True)
# Split the category on '.' and keep only the first piece as a coarse label.
df['simple_category'] = df['headline_category'].str.split('.').str[0]
# Lower-case the headlines before they are tokenised.
df['headline_text'] = df['headline_text'].str.lower()

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Optional: point NLTK at a non-default corpora directory. Left disabled; the
# path below is specific to one machine.
# corpora_path = '/media/sean/634d9e6c-0d5c-4a60-aa43-9d19a276b10a/nltk_data'
# nltk.data.path.append(corpora_path)

# Look up the two corpora used below. Presumably this is here so the cell
# fails early when a corpus has not been downloaded — TODO confirm.
nltk.data.find('corpora/wordnet')
nltk.data.find('corpora/stopwords')
# English stop words, held in a set for the membership test in clean_headlines.
stops = set(stopwords.words('english'))
# r'\w+' keeps runs of word characters, so punctuation is dropped when tokenising.
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

def clean_headlines(headline):
    """Tokenise a headline and return its lemmatised, stop-word-free tokens.

    The headline is split with the notebook-level ``tokenizer``. Tokens that
    appear in ``stops`` are skipped (an exact match on the raw token, checked
    before lemmatisation); every remaining token is passed through
    ``lemmatizer.lemmatize``.

    Args:
        headline: The headline text to process.

    Returns:
        A list of lemmatised tokens, in their original order.
    """
    cleaned = []
    for word in tokenizer.tokenize(headline):
        if word in stops:
            continue
        cleaned.append(lemmatizer.lemmatize(word))
    return cleaned

# Replace each headline string with its list of cleaned tokens.
# NOTE: this overwrites 'headline_text' in place, so the cell is not safely
# re-runnable — a second run would pass lists (not strings) to clean_headlines.
df['headline_text'] = df['headline_text'].apply(clean_headlines)
df.head()

Unnamed: 0,publish_date,headline_category,headline_text,simple_category
0,19960102,business,"[smurfit, share, price, retreat, despite, reco...",business
1,19960102,business,"[jamont, plan, 5m, investment, update, plant]",business
2,19960102,business,"[management, blamed, company, failure]",business
3,19960102,business,"[forte, expected, announce, special, dividend,...",business
4,19960102,business,"[accountancy, firm, adopts, name, change]",business


In [6]:
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore

# Build the inputs gensim needs from the tokenised headlines: a token <-> id
# dictionary and one bag-of-words vector per headline.
texts = df['headline_text'].tolist()
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(txt) for txt in texts]

# Fit a 6-topic LDA model.
# - random_state seeds the model so a Restart & Run All gives comparable
#   topics. With multiple worker processes gensim documents that results can
#   still vary slightly due to OS scheduling.
# - workers=None lets gensim pick the worker count from the machine's cores
#   instead of a hard-coded 16, which oversubscribes smaller machines.
lda = LdaMulticore(
    corpus,
    id2word=dictionary,
    num_topics=6,
    workers=None,
    random_state=42,
)

unable to import 'smart_open.gcs', disabling that module


In [7]:
# Show the 5 highest-weighted words for each topic of the first model.
lda.print_topics(num_words=5)

[(0,
  '0.008*"one" + 0.007*"say" + 0.006*"get" + 0.005*"irish" + 0.005*"health"'),
 (1, '0.007*"000" + 0.007*"short" + 0.007*"1" + 0.006*"pay" + 0.006*"job"'),
 (2, '0.019*"man" + 0.010*"year" + 0.009*"dublin" + 0.007*"go" + 0.007*"two"'),
 (3,
  '0.010*"time" + 0.010*"irish" + 0.009*"back" + 0.008*"first" + 0.008*"right"'),
 (4,
  '0.012*"new" + 0.008*"irish" + 0.008*"ireland" + 0.008*"say" + 0.008*"review"'),
 (5, '0.009*"u" + 0.007*"ireland" + 0.006*"say" + 0.006*"eu" + 0.006*"new"')]

In [13]:
import pyLDAvis as ldavis

# pyLDAvis 3.x renamed its gensim adapter from `pyLDAvis.gensim` to
# `pyLDAvis.gensim_models`. Try the new name first and fall back to the old
# one so this cell runs on either version.
try:
    import pyLDAvis.gensim_models as ldgensim
except ImportError:
    import pyLDAvis.gensim as ldgensim

# Prepare and render the interactive view of the first model.
# sort_topics=False asks pyLDAvis to keep the model's own topic order rather
# than re-sorting topics, so the view lines up with the print_topics() output.
lda_display = ldgensim.prepare(lda, corpus, dictionary, sort_topics=False)
ldavis.display(lda_display)

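Given the prevalence of 'Ireland' words across the topics ('irish', 'ireland', 'dublin', 'irishman'), I'm going to treat these as stop words and remove them from the headlines before running the analysis again. The code below also removes the token 'u', which shows up as the top term in topic 5 above; it is presumably a lemmatisation artefact (e.g. of 'us') rather than a meaningful word.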

In [21]:
import multiprocessing

def remove_ireland_words(headline_lst):
    """Return the tokens of a headline with the Ireland-specific words removed.

    The tokens 'irish', 'ireland', 'dublin' and 'irishman' are dropped, as is
    the token 'u' (presumably a lemmatisation artefact — verify). Matching is
    exact and case-sensitive; the order of the remaining tokens is preserved
    and the input list is not modified.

    Args:
        headline_lst: Iterable of token strings for one headline.

    Returns:
        A new list containing only the tokens that were not filtered out.
    """
    unwanted = ('irish', 'ireland', 'dublin', 'irishman', 'u')
    kept = []
    for token in headline_lst:
        if token in unwanted:
            continue
        kept.append(token)
    return kept

# Strip the Ireland-specific tokens from every (already tokenised) headline.
# NOTE: this overwrites 'headline_text' in place.
df['headline_text'] = df['headline_text'].apply(remove_ireland_words)

# Rebuild the dictionary and bag-of-words corpus from the filtered tokens.
# NOTE: this rebinds `texts`, `dictionary` and `corpus`, so after this cell
# they no longer correspond to the first model (`lda`).
texts = df['headline_text'].tolist()
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(txt) for txt in texts]

# Fit the second 6-topic model on the filtered corpus.
# random_state seeds the model so a Restart & Run All gives comparable topics.
# With multiple worker processes gensim documents that results can still vary
# slightly due to OS scheduling.
lda2 = LdaMulticore(
    corpus,
    id2word=dictionary,
    num_topics=6,
    workers=multiprocessing.cpu_count(),
    random_state=42,
)

In [22]:
# Show the 5 highest-weighted words for each topic of the second model.
lda2.print_topics(num_words=5)

[(0,
  '0.009*"short" + 0.006*"life" + 0.006*"man" + 0.005*"game" + 0.005*"trial"'),
 (1, '0.011*"new" + 0.007*"uk" + 0.006*"job" + 0.006*"court" + 0.005*"get"'),
 (2,
  '0.015*"man" + 0.011*"year" + 0.008*"000" + 0.008*"child" + 0.008*"world"'),
 (3,
  '0.008*"new" + 0.007*"report" + 0.007*"market" + 0.006*"say" + 0.006*"go"'),
 (4,
  '0.011*"time" + 0.006*"take" + 0.005*"road" + 0.005*"back" + 0.005*"week"'),
 (5,
  '0.011*"say" + 0.006*"review" + 0.005*"eu" + 0.005*"price" + 0.005*"talk"')]

In [23]:
# Prepare and render the interactive pyLDAvis view of the second model (lda2),
# using the corpus and dictionary rebuilt after the Ireland words were removed.
# sort_topics=False is passed so that — per the pyLDAvis docs — topics keep the
# model's own order and line up with the print_topics() output.
lda_display2 = ldgensim.prepare(lda2, corpus, dictionary, sort_topics=False)
ldavis.display(lda_display2)