In [88]:
# Topic modeling is a type of statistical modeling for discovering the abstract “topics” that occur in a collection of documents. Latent Dirichlet Allocation (LDA) is an example of topic model and is used to classify text in a document to a particular topic. 
# It builds a topic per document model and words per topic model, modeled as Dirichlet distributions.
import numpy as np
import pandas as pd
import nltk
import gensim

In [89]:
data = pd.read_csv('news-data.csv', error_bad_lines=False);
data_text = data[['content']]
data_text['index'] = data_text.index
documents = data_text
# documents = data_text[:1000]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [90]:
print("Number of articles:", len(documents))
print(documents[:5])

Number of articles: 1000
                                             content  index
0        And never more so than in Showtime’s new...      0
1        AlphaGo’s victory isn’t a defeat for hum...      1
2        Super Deluxe built a weird internet empi...      2
3        Steven Yang quit his job at Google in th...      3
4        Ahead of Black Panther’s 2018 theatrical...      4


In [91]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\owner\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [92]:
#Functions that perform lemmatization and stemming as part of preprocessing
def lemmatize_stemming(text):
    return SnowballStemmer("english").stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [93]:
#Selects the first article to compare before and after preprocessing
doc_sample = documents[documents['index'] == 0].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words[:20])
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample)[:20])

original document: 
['', '', '', '', '', '', 'And', 'never', 'more', 'so', 'than', 'in', 'Showtime’s', 'new', 'series', 'revival', 'Some', 'spoilers', 'ahead', 'through']


 tokenized and lemmatized document: 
['showtim', 'seri', 'reviv', 'spoiler', 'ahead', 'episod', 'season', 'twin', 'peak', 'showtim', 'bring', 'david', 'lynch', 'groundbreak', 'seri', 'twin', 'peak', 'fulfil', 'propheci', 'process']


In [94]:
#All the preprocessed articles
processed_docs = documents['content'].map(preprocess)
processed_docs[:10]

0    [showtim, seri, reviv, spoiler, ahead, episod,...
1    [alphago, victori, defeat, human, opportun, lo...
2    [super, delux, build, weird, internet, empir, ...
3    [steven, yang, quit, googl, summer, build, pro...
4    [ahead, black, panther, theatric, releas, marv...
5    [facebook, instant, articl, promis, transform,...
6    [weapon, weapon, year, artist, technolog, enth...
7    [insid, busi, imagin, futur, dubai, world, loo...
8    [legal, threat, disgruntl, client, insid, uber...
9    [genius, quiet, lay, bunch, engin, surviv, med...
Name: content, dtype: object

In [95]:
#Dictionary that contains the number of times a word appears
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break
#Filters words that appears in less than 15 articles
#or in more than 0.5 o the articles
#Keeps first 100 000 most frequent tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

0 abandon
1 abil
2 absenc
3 acknowledg
4 act
5 activ
6 actual
7 adventur
8 age
9 agenc
10 agent


In [96]:
#Create dictionary for each article
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]

[(1, 3),
 (6, 7),
 (14, 1),
 (27, 1),
 (30, 2),
 (38, 2),
 (46, 1),
 (68, 1),
 (70, 2),
 (81, 1),
 (82, 1),
 (88, 1),
 (91, 2),
 (94, 2),
 (104, 1),
 (111, 1),
 (114, 2),
 (119, 1),
 (120, 2),
 (121, 2),
 (122, 1),
 (123, 1),
 (129, 1),
 (134, 1),
 (145, 1),
 (147, 1),
 (149, 1),
 (152, 1),
 (155, 3),
 (158, 4),
 (160, 1),
 (165, 13),
 (167, 3),
 (171, 1),
 (172, 2),
 (180, 5),
 (181, 8),
 (183, 15),
 (192, 2),
 (194, 3),
 (204, 1),
 (206, 4),
 (211, 3),
 (216, 1),
 (219, 7),
 (226, 1),
 (227, 1),
 (228, 1),
 (229, 2),
 (237, 3),
 (240, 1),
 (241, 2),
 (243, 2),
 (244, 14),
 (245, 2),
 (246, 1),
 (247, 4),
 (248, 5),
 (250, 4),
 (252, 2),
 (253, 1),
 (256, 1),
 (264, 1),
 (272, 1),
 (278, 2),
 (279, 1),
 (280, 1),
 (282, 3),
 (283, 2),
 (285, 1),
 (297, 1),
 (300, 1),
 (302, 1),
 (303, 1),
 (309, 2),
 (323, 1),
 (328, 1),
 (332, 1),
 (353, 2),
 (373, 1),
 (383, 2),
 (384, 2),
 (385, 1),
 (386, 2),
 (387, 3),
 (388, 1),
 (389, 1),
 (390, 1),
 (391, 1),
 (392, 1),
 (393, 1),
 (394, 1),
 

In [98]:
#Bag of words for first article?
bow_doc_1 = bow_corpus[0]
for i in range(len(bow_doc_1)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_1[i][0], 
                                               dictionary[bow_doc_1[i][0]], 
bow_doc_1[i][0]))

Word 0 ("abandon") appears 1 time.
Word 1 ("abil") appears 1 time.
Word 2 ("absenc") appears 1 time.
Word 3 ("acknowledg") appears 1 time.
Word 4 ("act") appears 1 time.
Word 5 ("activ") appears 3 time.
Word 6 ("actual") appears 1 time.
Word 7 ("adventur") appears 1 time.
Word 8 ("age") appears 1 time.
Word 9 ("agenc") appears 1 time.
Word 10 ("agent") appears 8 time.
Word 11 ("ahead") appears 1 time.
Word 12 ("alarm") appears 1 time.
Word 13 ("alien") appears 2 time.
Word 14 ("ambiti") appears 1 time.
Word 15 ("angri") appears 1 time.
Word 16 ("anymor") appears 2 time.
Word 17 ("apart") appears 1 time.
Word 18 ("appear") appears 1 time.
Word 19 ("appreci") appears 1 time.
Word 20 ("aspect") appears 1 time.
Word 21 ("attent") appears 1 time.
Word 22 ("audienc") appears 14 time.
Word 23 ("avatar") appears 1 time.
Word 24 ("away") appears 1 time.
Word 25 ("awkward") appears 1 time.
Word 26 ("basi") appears 2 time.
Word 27 ("beat") appears 2 time.
Word 28 ("begin") appears 2 time.
Word 29

In [99]:
#Creates tf-idf model 
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.016274810763693602),
 (1, 0.009752602121677546),
 (2, 0.021403971932895206),
 (3, 0.014654855658424818),
 (4, 0.015229934549595809),
 (5, 0.025318550747722168),
 (6, 0.004650247572742471),
 (7, 0.017181388166074656),
 (8, 0.017059615567358837),
 (9, 0.011303701108645176),
 (10, 0.13190344406374296),
 (11, 0.012702182891273724),
 (12, 0.019615668480175343),
 (13, 0.03512727671074695),
 (14, 0.014351194449319924),
 (15, 0.02066415057152873),
 (16, 0.034119231134717674),
 (17, 0.012279850493429119),
 (18, 0.007549267394552347),
 (19, 0.017974228402921635),
 (20, 0.015059446322379032),
 (21, 0.01054792074380748),
 (22, 0.14978166073903681),
 (23, 0.021674290432295726),
 (24, 0.0061257040994261314),
 (25, 0.018899779899900764),
 (26, 0.029626328868496594),
 (27, 0.02900188984458125),
 (28, 0.011320774867018935),
 (29, 0.022258033015939926),
 (30, 0.007549267394552347),
 (31, 0.008568070947792553),
 (32, 0.01981008948848401),
 (33, 0.023311185400599756),
 (34, 0.016709577272328753),
 

In [100]:
#Training LDA model
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [101]:
# For each topic, explore the words that occur in that topic and its relative weight
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.006*"research" + 0.006*"experi" + 0.004*"human" + 0.004*"test" + 0.004*"design" + 0.004*"studi" + 0.003*"data" + 0.003*"build" + 0.003*"differ" + 0.003*"tell"
Topic: 1 
Words: 0.013*"game" + 0.004*"life" + 0.004*"build" + 0.004*"video" + 0.004*"live" + 0.004*"play" + 0.003*"start" + 0.003*"differ" + 0.003*"develop" + 0.003*"amprsquo"
Topic: 2 
Words: 0.007*"amprsquo" + 0.004*"build" + 0.004*"game" + 0.003*"tell" + 0.003*"facebook" + 0.003*"product" + 0.003*"user" + 0.003*"busi" + 0.003*"case" + 0.003*"servic"
Topic: 3 
Words: 0.005*"twitter" + 0.005*"user" + 0.004*"develop" + 0.004*"technolog" + 0.004*"build" + 0.004*"facebook" + 0.004*"product" + 0.004*"chang" + 0.003*"amprsquo" + 0.003*"app"
Topic: 4 
Words: 0.014*"game" + 0.009*"window" + 0.008*"microsoft" + 0.005*"ampmdash" + 0.005*"amprsquo" + 0.005*"googl" + 0.004*"devic" + 0.004*"experi" + 0.004*"phone" + 0.004*"user"
Topic: 5 
Words: 0.008*"amprsquo" + 0.006*"googl" + 0.005*"phone" + 0.004*"servic" + 0.004*"g

In [102]:
#LDA using TF-IDF
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.007*"amprsquo" + 0.005*"game" + 0.004*"window" + 0.004*"microsoft" + 0.003*"ampmdash" + 0.003*"amazon" + 0.003*"amprsquot" + 0.002*"twitter" + 0.002*"amprdquo" + 0.002*"attack"
Topic: 1 Word: 0.007*"microsoft" + 0.004*"xbox" + 0.004*"amprsquo" + 0.004*"appl" + 0.003*"window" + 0.002*"game" + 0.002*"youtub" + 0.002*"oculus" + 0.002*"robot" + 0.002*"consol"
Topic: 2 Word: 0.003*"game" + 0.003*"facebook" + 0.003*"googl" + 0.002*"amprsquo" + 0.002*"photo" + 0.002*"appl" + 0.002*"ampmdash" + 0.002*"user" + 0.002*"phone" + 0.002*"bitcoin"
Topic: 3 Word: 0.006*"amprsquo" + 0.003*"cell" + 0.003*"game" + 0.003*"patient" + 0.003*"ampmdash" + 0.002*"drone" + 0.002*"drug" + 0.002*"amprsquot" + 0.002*"research" + 0.002*"treatment"
Topic: 4 Word: 0.003*"amprsquo" + 0.002*"planet" + 0.002*"genom" + 0.002*"studi" + 0.002*"game" + 0.002*"ampmdash" + 0.002*"data" + 0.002*"water" + 0.002*"facebook" + 0.002*"speci"
Topic: 5 Word: 0.003*"game" + 0.002*"amphellip" + 0.002*"ampmdash" + 0.002

In [110]:
#Getting the category of the first article

# print(processed_docs[0]) #This is the article
for index, score in sorted(lda_model[bow_corpus[0]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))
# The topic with the highest score/probability is the topic most likely associated
# with it


Score: 0.9987696409225464	 
Topic: 0.013*"game" + 0.004*"life" + 0.004*"build" + 0.004*"video" + 0.004*"live" + 0.004*"play" + 0.003*"start" + 0.003*"differ" + 0.003*"develop" + 0.003*"amprsquo"


In [112]:
#Getting the topic of the first article with the LDA TF-IDF model
for index, score in sorted(lda_model_tfidf[bow_corpus[0]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8024715185165405	 
Topic: 0.003*"game" + 0.003*"facebook" + 0.003*"googl" + 0.002*"amprsquo" + 0.002*"photo" + 0.002*"appl" + 0.002*"ampmdash" + 0.002*"user" + 0.002*"phone" + 0.002*"bitcoin"

Score: 0.12606246769428253	 
Topic: 0.003*"game" + 0.002*"amphellip" + 0.002*"ampmdash" + 0.002*"infect" + 0.002*"film" + 0.002*"genet" + 0.002*"drone" + 0.002*"read" + 0.002*"drug" + 0.002*"amprsquo"

Score: 0.03714098781347275	 
Topic: 0.003*"amprsquo" + 0.002*"planet" + 0.002*"genom" + 0.002*"studi" + 0.002*"game" + 0.002*"ampmdash" + 0.002*"data" + 0.002*"water" + 0.002*"facebook" + 0.002*"speci"

Score: 0.028198009356856346	 
Topic: 0.007*"amprsquo" + 0.005*"game" + 0.004*"window" + 0.004*"microsoft" + 0.003*"ampmdash" + 0.003*"amazon" + 0.003*"amprsquot" + 0.002*"twitter" + 0.002*"amprdquo" + 0.002*"attack"
