In [1]:
# Topic modeling is a type of statistical modeling for discovering the abstract “topics” that occur in a collection of documents. Latent Dirichlet Allocation (LDA) is an example of topic model and is used to classify text in a document to a particular topic. 
# It builds a topic per document model and words per topic model, modeled as Dirichlet distributions.
import numpy as np
import pandas as pd
import nltk
import gensim

In [2]:
data = pd.read_csv('news-data.csv', error_bad_lines=False);
data_text = data[['content']]
data_text['index'] = data_text.index
documents = data_text
documents = data_text[:6000]

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
print("Number of articles:", len(documents))
print(documents[:5])

Number of articles: 6000
                                             content  index
0        And never more so than in Showtime’s new...      0
1        AlphaGo’s victory isn’t a defeat for hum...      1
2        Super Deluxe built a weird internet empi...      2
3        Steven Yang quit his job at Google in th...      3
4        Ahead of Black Panther’s 2018 theatrical...      4


In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
#Functions that perform lemmatization and stemming as part of preprocessing
def lemmatize_stemming(text):
    return SnowballStemmer("english").stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [6]:
#Selects the first article to compare before and after preprocessing
doc_sample = documents[documents['index'] == 0].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words[:20])
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample)[:20])

original document: 
['', '', '', '', '', '', 'And', 'never', 'more', 'so', 'than', 'in', 'Showtime’s', 'new', 'series', 'revival', 'Some', 'spoilers', 'ahead', 'through']


 tokenized and lemmatized document: 
['showtim', 'seri', 'reviv', 'spoiler', 'ahead', 'episod', 'season', 'twin', 'peak', 'showtim', 'bring', 'david', 'lynch', 'groundbreak', 'seri', 'twin', 'peak', 'fulfil', 'propheci', 'process']


In [7]:
#All the preprocessed articles
processed_docs = documents['content'].map(preprocess)
processed_docs[:10]

0    [showtim, seri, reviv, spoiler, ahead, episod,...
1    [alphago, victori, defeat, human, opportun, lo...
2    [super, delux, build, weird, internet, empir, ...
3    [steven, yang, quit, googl, summer, build, pro...
4    [ahead, black, panther, theatric, releas, marv...
5    [facebook, instant, articl, promis, transform,...
6    [weapon, weapon, year, artist, technolog, enth...
7    [insid, busi, imagin, futur, dubai, world, loo...
8    [legal, threat, disgruntl, client, insid, uber...
9    [genius, quiet, lay, bunch, engin, surviv, med...
Name: content, dtype: object

In [8]:
#Dictionary that contains the number of times a word appears
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break
#Filters words that appears in less than 15 articles
#or in more than 0.5 o the articles
#Keeps first 100 000 most frequent tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

0 abandon
1 abil
2 absenc
3 acknowledg
4 act
5 activ
6 actual
7 adventur
8 age
9 agenc
10 agent


In [9]:
#Create dictionary for each article
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 3),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 8),
 (11, 1),
 (12, 1),
 (13, 2),
 (14, 1),
 (15, 1),
 (16, 2),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 14),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 3),
 (27, 2),
 (28, 2),
 (29, 2),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 2),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 2),
 (41, 2),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 3),
 (46, 1),
 (47, 2),
 (48, 1),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 2),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 2),
 (57, 1),
 (58, 1),
 (59, 1),
 (60, 6),
 (61, 2),
 (62, 5),
 (63, 1),
 (64, 1),
 (65, 1),
 (66, 3),
 (67, 1),
 (68, 1),
 (69, 2),
 (70, 1),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 6),
 (76, 1),
 (77, 2),
 (78, 1),
 (79, 1),
 (80, 1),
 (81, 3),
 (82, 3),
 (83, 1),
 (84, 2),
 (85, 32),
 (86, 1),
 (87, 1),
 (88, 2),
 (89, 1),
 (90, 7),
 (91, 1),
 (92, 3),
 (93, 2),
 (94, 1),
 (95, 2),
 (96, 2),
 (97, 3),
 (98, 1),
 (99, 1),
 (100, 1

In [10]:
#Bag of words for first article?
bow_doc_1 = bow_corpus[0]
for i in range(len(bow_doc_1)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_1[i][0], 
                                               dictionary[bow_doc_1[i][0]], 
bow_doc_1[i][0]))

Word 0 ("abandon") appears 0 time.
Word 1 ("abil") appears 1 time.
Word 2 ("absenc") appears 2 time.
Word 3 ("acknowledg") appears 3 time.
Word 4 ("act") appears 4 time.
Word 5 ("activ") appears 5 time.
Word 6 ("actual") appears 6 time.
Word 7 ("adventur") appears 7 time.
Word 8 ("age") appears 8 time.
Word 9 ("agenc") appears 9 time.
Word 10 ("agent") appears 10 time.
Word 11 ("ahead") appears 11 time.
Word 12 ("alarm") appears 12 time.
Word 13 ("alien") appears 13 time.
Word 14 ("ambiti") appears 14 time.
Word 15 ("angri") appears 15 time.
Word 16 ("anymor") appears 16 time.
Word 17 ("apart") appears 17 time.
Word 18 ("appear") appears 18 time.
Word 19 ("appreci") appears 19 time.
Word 20 ("aspect") appears 20 time.
Word 21 ("attent") appears 21 time.
Word 22 ("audienc") appears 22 time.
Word 23 ("avatar") appears 23 time.
Word 24 ("away") appears 24 time.
Word 25 ("awkward") appears 25 time.
Word 26 ("baffl") appears 26 time.
Word 27 ("basi") appears 27 time.
Word 28 ("beat") appear

In [11]:
#Creates tf-idf model 
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.016998787472690183),
 (1, 0.011938920302744968),
 (2, 0.017899655963463254),
 (3, 0.013068614393794567),
 (4, 0.012922320600284023),
 (5, 0.025828599631591044),
 (6, 0.009072356706132927),
 (7, 0.018254609799108045),
 (8, 0.0164640476635656),
 (9, 0.010214029208323082),
 (10, 0.09784641978660945),
 (11, 0.011447790366833122),
 (12, 0.018003387492813567),
 (13, 0.036006774985627134),
 (14, 0.017171685550249685),
 (15, 0.0180738209759145),
 (16, 0.03593711645937826),
 (17, 0.012827233967582615),
 (18, 0.006905123507381933),
 (19, 0.016970572712410995),
 (20, 0.018038473800795093),
 (21, 0.011700924008309526),
 (22, 0.17860005250544556),
 (23, 0.024065284998395456),
 (24, 0.00709863401074585),
 (25, 0.019143562067200606),
 (26, 0.08545337969324686),
 (27, 0.03371871782472538),
 (28, 0.021132702679937384),
 (29, 0.012120792849488698),
 (30, 0.02276056471433239),
 (31, 0.007723399366035063),
 (32, 0.010844981195764502),
 (33, 0.02040869312244433),
 (34, 0.021147342169059096),
 (35, 0

In [22]:
#Training LDA model
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=30, id2word=dictionary, passes=2, workers=2)

In [23]:
# For each topic, explore the words that occur in that topic and its relative weight
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.007*"angel" + 0.007*"california" + 0.006*"citi" + 0.005*"presid" + 0.004*"break" + 0.004*"polic" + 0.004*"trump" + 0.004*"chang" + 0.004*"report" + 0.003*"getti"
Topic: 1 
Words: 0.007*"citi" + 0.006*"angel" + 0.004*"polic" + 0.004*"game" + 0.004*"california" + 0.004*"peopl" + 0.004*"break" + 0.004*"getti" + 0.003*"train" + 0.003*"uber"
Topic: 2 
Words: 0.004*"peopl" + 0.003*"human" + 0.003*"angel" + 0.003*"start" + 0.003*"work" + 0.003*"report" + 0.003*"area" + 0.003*"california" + 0.003*"offici" + 0.003*"home"
Topic: 3 
Words: 0.011*"student" + 0.010*"presid" + 0.009*"school" + 0.009*"california" + 0.008*"trump" + 0.007*"break" + 0.006*"angel" + 0.006*"getti" + 0.005*"member" + 0.005*"program"
Topic: 4 
Words: 0.015*"water" + 0.008*"round" + 0.007*"land" + 0.007*"punch" + 0.006*"look" + 0.005*"fight" + 0.005*"california" + 0.005*"get" + 0.004*"takedown" + 0.004*"right"
Topic: 5 
Words: 0.009*"season" + 0.009*"play" + 0.006*"game" + 0.006*"angel" + 0.005*"team" + 0.

In [24]:
#LDA using TF-IDF
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=30, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.002*"game" + 0.002*"appl" + 0.002*"compani" + 0.001*"team" + 0.001*"dodger" + 0.001*"polic" + 0.001*"california" + 0.001*"state" + 0.001*"chef" + 0.001*"amprsquo"
Topic: 1 Word: 0.002*"point" + 0.002*"game" + 0.002*"amprsquo" + 0.002*"polic" + 0.002*"score" + 0.002*"california" + 0.001*"trump" + 0.001*"clipper" + 0.001*"rain" + 0.001*"divis"
Topic: 2 Word: 0.003*"california" + 0.002*"immigr" + 0.002*"dodger" + 0.002*"trump" + 0.002*"getti" + 0.002*"ktla" + 0.002*"presid" + 0.002*"credit" + 0.002*"polic" + 0.002*"southern"
Topic: 3 Word: 0.003*"california" + 0.003*"getti" + 0.003*"ktla" + 0.003*"polic" + 0.003*"credit" + 0.003*"presid" + 0.002*"wave" + 0.002*"heat" + 0.002*"lapd" + 0.002*"trump"
Topic: 4 Word: 0.002*"laker" + 0.002*"favorit" + 0.002*"robot" + 0.002*"game" + 0.002*"justin" + 0.002*"season" + 0.002*"golf" + 0.002*"appl" + 0.002*"point" + 0.001*"inning"
Topic: 5 Word: 0.003*"alford" + 0.003*"ucla" + 0.003*"bruin" + 0.003*"laker" + 0.003*"game" + 0.002*"sco

In [25]:
#Getting the category of the first article

# print(processed_docs[0]) #This is the article
for index, score in sorted(lda_model[bow_corpus[0]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))
# The topic with the highest score/probability is the topic most likely associated
# with it


Score: 0.45125436782836914	 
Topic: 0.011*"play" + 0.008*"game" + 0.007*"round" + 0.007*"hole" + 0.005*"player" + 0.005*"shoot" + 0.005*"team" + 0.005*"dodger" + 0.005*"duck" + 0.005*"final"

Score: 0.14750249683856964	 
Topic: 0.006*"work" + 0.005*"film" + 0.004*"includ" + 0.004*"home" + 0.004*"citi" + 0.004*"play" + 0.004*"live" + 0.003*"take" + 0.003*"million" + 0.003*"look"

Score: 0.1388128399848938	 
Topic: 0.006*"state" + 0.004*"work" + 0.004*"peopl" + 0.004*"million" + 0.004*"compani" + 0.004*"california" + 0.003*"build" + 0.003*"want" + 0.003*"offici" + 0.003*"world"

Score: 0.12325345724821091	 
Topic: 0.006*"second" + 0.005*"peopl" + 0.005*"open" + 0.004*"shoot" + 0.004*"play" + 0.004*"want" + 0.004*"point" + 0.004*"leav" + 0.003*"think" + 0.003*"work"

Score: 0.07749078422784805	 
Topic: 0.009*"amprsquo" + 0.007*"team" + 0.005*"compani" + 0.005*"want" + 0.005*"work" + 0.005*"peopl" + 0.004*"think" + 0.004*"organ" + 0.003*"ampmdash" + 0.003*"thing"

Score: 0.036733228713274

In [26]:
#Getting the topic of the first article with the LDA TF-IDF model
for index, score in sorted(lda_model_tfidf[bow_corpus[0]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8168314099311829	 
Topic: 0.003*"game" + 0.002*"team" + 0.002*"dodger" + 0.002*"laker" + 0.002*"season" + 0.002*"compani" + 0.001*"play" + 0.001*"california" + 0.001*"film" + 0.001*"state"

Score: 0.07523132860660553	 
Topic: 0.002*"point" + 0.002*"game" + 0.002*"amprsquo" + 0.002*"polic" + 0.002*"score" + 0.002*"california" + 0.001*"trump" + 0.001*"clipper" + 0.001*"rain" + 0.001*"divis"

Score: 0.04065573588013649	 
Topic: 0.006*"clipper" + 0.005*"game" + 0.005*"ram" + 0.004*"season" + 0.004*"charger" + 0.004*"team" + 0.003*"klein" + 0.003*"point" + 0.003*"minicamp" + 0.003*"offseason"

Score: 0.022257493808865547	 
Topic: 0.005*"trump" + 0.005*"california" + 0.005*"getti" + 0.004*"ktla" + 0.004*"presid" + 0.004*"credit" + 0.003*"heat" + 0.003*"polic" + 0.003*"wave" + 0.003*"lapd"

Score: 0.02047869749367237	 
Topic: 0.003*"devic" + 0.003*"trump" + 0.002*"amprsquo" + 0.002*"california" + 0.002*"window" + 0.002*"polic" + 0.002*"microsoft" + 0.002*"presid" + 0.002*"phone" + 0