In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
data = pd.read_csv('news.csv', error_bad_lines=False);
data_text = data[['content']]
data_text['index'] = data_text.index
documents = data_text

In [2]:
print(len(documents))
print(documents[:9])

50000
                                             content  index
0  WASHINGTON  —   Congressional Republicans have...      0
1  After the bullet shells get counted, the blood...      1
2  When Walt Disney’s “Bambi” opened in 1942, cri...      2
3  Death may be the great equalizer, but it isn’t...      3
4  SEOUL, South Korea  —   North Korea’s leader, ...      4
5  LONDON  —   Queen Elizabeth II, who has been b...      5
6  BEIJING  —   President Tsai   of Taiwan sharpl...      6
7  Danny Cahill stood, slightly dazed, in a blizz...      7
8  Just how   is Hillary Kerr, the    founder of ...      8


In [3]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk import PorterStemmer
PorterStemmer().stem('complications')
import numpy as np
np.random.seed(2018)
import pandas as pd
stemmer = SnowballStemmer("english")
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\subhg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [5]:
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['July', '1939.', 'The', 'world', 'teetered', 'on', 'the', 'brink', 'of', 'war', 'as', 'Hitler', 'menaced', 'Poland.', 'The', '11', 'millionth', 'visitor', 'passed', 'through', 'the', 'turnstiles', 'of', 'the', 'New', 'York', 'World’s', 'Fair.', 'Baseball', 'fans', 'still', 'reeled', 'after', 'Lou', 'Gehrig’s', '“luckiest', 'man”', 'speech', 'at', 'Yankee', 'Stadium.', 'But', 'many', 'Americans', 'could', 'think', 'only', 'of', 'Donn', 'Fendler,', 'a', '', '', '', 'boy', 'lost', 'on', 'Mount', 'Katahdin', 'in', 'Maine,', 'the', 'object', 'of', 'a', 'frantic', 'search', 'and', 'rescue', 'operation', 'that', 'dragged', 'on', 'for', 'nine', 'days,', 'monopolizing', 'the', 'radio', 'airwaves', 'and', 'newspaper', 'headlines.', 'Thousands', 'of', 'mothers', 'sent', 'prayers', 'by', 'Western', 'Union', 'to', 'the', 'boy’s', 'mother.', 'Boy', 'Scouts', 'joined', 'the', 'search,', 'along', 'with', 'workers', 'from', 'the', 'Millinocket', 'paper', 'mill.', 'The', 'New', 'Yor

['juli', 'world', 'teeter', 'brink', 'hitler', 'menac', 'poland', 'millionth', 'visitor', 'pass', 'turnstil', 'york', 'world', 'fair', 'basebal', 'fan', 'reel', 'gehrig', 'luckiest', 'speech', 'yanke', 'stadium', 'american', 'think', 'donn', 'fendler', 'lose', 'mount', 'katahdin', 'main', 'object', 'frantic', 'search', 'rescu', 'oper', 'drag', 'day', 'monopol', 'radio', 'airwav', 'newspap', 'headlin', 'thousand', 'mother', 'send', 'prayer', 'western', 'union', 'mother', 'scout', 'join', 'search', 'worker', 'millinocket', 'paper', 'york', 'state', 'polic', 'dispatch', 'best', 'bloodhound', 'heart', 'sink', 'donn', 'footprint', 'disappear', 'edg', 'sheer', 'precipic', 'call', 'saddl', 'slip', 'tri', 'believ', 'faint', 'thread', 'hope', 'despond', 'father', 'donald', 'tell', 'boston', 'globe', 'spirit', 'lift', 'footprint', 'appear', 'near', 'mountain', 'base', 'day', 'later', 'final', 'ninth', 'miracl', 'strip', 'nake', 'day', 'fear', 'battl', 'main', 'wilder', 'slop', 'mount', 'katahdin

In [6]:
processed_docs = documents['content'].map(preprocess)
processed_docs[:10]

0    [washington, congression, republican, fear, co...
1    [bullet, shell, count, blood, dri, votiv, cand...
2    [walt, disney, bambi, open, critic, prais, spa...
3    [death, great, equal, necessarili, evenhand, f...
4    [seoul, south, korea, north, korea, leader, sa...
5    [london, queen, elizabeth, battl, cold, week, ...
6    [beij, presid, tsai, taiwan, sharpli, critic, ...
7    [danni, cahil, stand, slight, daze, blizzard, ...
8    [hillari, kerr, founder, digit, media, compani...
9    [angel, muñiz, famili, apart, bronx, paint, an...
Name: content, dtype: object

In [7]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 access
1 acknowledg
2 administr
3 advoc
4 afford
5 alli
6 american
7 anger
8 annual
9 anticip
10 appeal


In [8]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [9]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(6, 1),
 (22, 1),
 (36, 2),
 (51, 1),
 (74, 2),
 (77, 1),
 (89, 1),
 (101, 1),
 (106, 1),
 (108, 1),
 (127, 4),
 (132, 1),
 (138, 7),
 (145, 4),
 (146, 1),
 (154, 1),
 (184, 2),
 (187, 1),
 (193, 1),
 (196, 1),
 (201, 2),
 (209, 3),
 (213, 3),
 (226, 6),
 (237, 1),
 (243, 1),
 (248, 1),
 (253, 1),
 (269, 1),
 (272, 3),
 (275, 1),
 (288, 1),
 (295, 2),
 (300, 2),
 (316, 1),
 (325, 1),
 (331, 1),
 (338, 3),
 (356, 1),
 (359, 2),
 (362, 1),
 (370, 1),
 (391, 2),
 (400, 1),
 (415, 1),
 (416, 4),
 (420, 1),
 (458, 1),
 (459, 1),
 (462, 1),
 (464, 1),
 (479, 1),
 (480, 1),
 (481, 3),
 (500, 1),
 (507, 1),
 (514, 3),
 (541, 2),
 (544, 3),
 (549, 1),
 (553, 1),
 (564, 1),
 (565, 1),
 (570, 1),
 (597, 1),
 (598, 1),
 (607, 1),
 (616, 2),
 (621, 1),
 (625, 1),
 (630, 1),
 (637, 1),
 (648, 1),
 (653, 1),
 (662, 4),
 (664, 1),
 (670, 1),
 (679, 1),
 (691, 1),
 (702, 1),
 (708, 1),
 (725, 1),
 (729, 1),
 (730, 1),
 (736, 1),
 (754, 1),
 (795, 1),
 (797, 1),
 (806, 1),
 (815, 1),
 (820, 1),
 (829, 

In [10]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 6 ("american") appears 1 time.
Word 22 ("branch") appears 1 time.
Word 36 ("come") appears 2 time.
Word 51 ("continu") appears 1 time.
Word 74 ("donald") appears 2 time.
Word 77 ("eager") appears 1 time.
Word 89 ("fear") appears 1 time.
Word 101 ("hand") appears 1 time.
Word 106 ("hous") appears 1 time.
Word 108 ("illustr") appears 1 time.
Word 127 ("later") appears 4 time.
Word 132 ("leav") appears 1 time.
Word 138 ("lose") appears 7 time.
Word 145 ("mount") appears 4 time.
Word 146 ("nation") appears 1 time.
Word 154 ("pass") appears 1 time.
Word 184 ("quick") appears 2 time.
Word 187 ("receiv") appears 1 time.
Word 193 ("report") appears 1 time.
Word 196 ("requir") appears 1 time.
Word 201 ("right") appears 2 time.
Word 209 ("spend") appears 3 time.
Word 213 ("state") appears 3 time.
Word 226 ("tell") appears 6 time.
Word 237 ("view") appears 1 time.
Word 243 ("white") appears 1 time.
Word 248 ("account") appears 1 time.
Word 253 ("addit") appears 1 time.
Word 269 ("appear") ap

In [11]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.026770545983534913),
 (1, 0.029068893675775774),
 (2, 0.23280474767240217),
 (3, 0.0318152884632238),
 (4, 0.03317955987601938),
 (5, 0.028340257009214845),
 (6, 0.011822358013442068),
 (7, 0.03651313378109811),
 (8, 0.03134841141839632),
 (9, 0.08219950978886159),
 (10, 0.11833316065832819),
 (11, 0.1068948542622624),
 (12, 0.029414800654264807),
 (13, 0.03979115382242141),
 (14, 0.034294964747078195),
 (15, 0.03491551839627701),
 (16, 0.028504937535025775),
 (17, 0.04794320784817166),
 (18, 0.0430859100761559),
 (19, 0.03715714840084949),
 (20, 0.052708727883302366),
 (21, 0.05897435106024678),
 (22, 0.19674943192832461),
 (23, 0.03565688419697066),
 (24, 0.03846151024366051),
 (25, 0.16461063075731858),
 (26, 0.06336353656669406),
 (27, 0.06766757634740564),
 (28, 0.04165525415334854),
 (29, 0.028027620454488433),
 (30, 0.02319742916334767),
 (31, 0.03607625005938099),
 (32, 0.04023225268361473),
 (33, 0.05157570903473344),
 (34, 0.044666235535910065),
 (35, 0.043254521458515

In [12]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [13]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.009*"health" + 0.006*"drug" + 0.005*"case" + 0.005*"medic" + 0.005*"tell" + 0.005*"children" + 0.005*"care" + 0.005*"women" + 0.004*"famili" + 0.004*"know"
Topic: 1 
Words: 0.014*"state" + 0.008*"north" + 0.007*"court" + 0.006*"korea" + 0.005*"report" + 0.005*"offici" + 0.005*"texa" + 0.005*"unit" + 0.004*"prison" + 0.004*"border"
Topic: 2 
Words: 0.017*"trump" + 0.007*"think" + 0.007*"news" + 0.007*"white" + 0.007*"media" + 0.006*"women" + 0.006*"polit" + 0.006*"black" + 0.005*"know" + 0.005*"right"
Topic: 3 
Words: 0.040*"trump" + 0.016*"clinton" + 0.013*"presid" + 0.010*"republican" + 0.009*"campaign" + 0.008*"state" + 0.008*"democrat" + 0.007*"elect" + 0.007*"donald" + 0.007*"obama"
Topic: 4 
Words: 0.017*"polic" + 0.011*"attack" + 0.009*"offic" + 0.008*"kill" + 0.007*"report" + 0.007*"isi" + 0.006*"shoot" + 0.006*"tell" + 0.005*"group" + 0.005*"state"
Topic: 5 
Words: 0.010*"state" + 0.008*"countri" + 0.008*"govern" + 0.006*"american" + 0.005*"polici" + 0.005*"f

In [14]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.007*"trump" + 0.004*"clinton" + 0.003*"milo" + 0.002*"women" + 0.002*"donald" + 0.002*"twitter" + 0.002*"hillari" + 0.002*"presid" + 0.002*"yiannopoulo" + 0.002*"obama"
Topic: 1 Word: 0.007*"trump" + 0.005*"clinton" + 0.003*"comey" + 0.003*"presid" + 0.003*"obama" + 0.003*"russia" + 0.003*"russian" + 0.003*"email" + 0.002*"immigr" + 0.002*"investig"
Topic: 2 Word: 0.005*"polic" + 0.003*"shoot" + 0.002*"offic" + 0.002*"citi" + 0.002*"store" + 0.002*"kill" + 0.001*"counti" + 0.001*"arrest" + 0.001*"uber" + 0.001*"home"
Topic: 3 Word: 0.009*"trump" + 0.007*"clinton" + 0.005*"poll" + 0.004*"vote" + 0.004*"republican" + 0.004*"voter" + 0.004*"cruz" + 0.004*"campaign" + 0.004*"parti" + 0.003*"elect"
Topic: 4 Word: 0.013*"tesla" + 0.006*"musk" + 0.006*"airlin" + 0.006*"flight" + 0.004*"plane" + 0.004*"passeng" + 0.003*"aircraft" + 0.003*"model" + 0.003*"compani" + 0.002*"googl"
Topic: 5 Word: 0.003*"isi" + 0.003*"syria" + 0.003*"islam" + 0.003*"trump" + 0.003*"syrian" + 0.002

In [15]:
processed_docs[4310]

['juli',
 'world',
 'teeter',
 'brink',
 'hitler',
 'menac',
 'poland',
 'millionth',
 'visitor',
 'pass',
 'turnstil',
 'york',
 'world',
 'fair',
 'basebal',
 'fan',
 'reel',
 'gehrig',
 'luckiest',
 'speech',
 'yanke',
 'stadium',
 'american',
 'think',
 'donn',
 'fendler',
 'lose',
 'mount',
 'katahdin',
 'main',
 'object',
 'frantic',
 'search',
 'rescu',
 'oper',
 'drag',
 'day',
 'monopol',
 'radio',
 'airwav',
 'newspap',
 'headlin',
 'thousand',
 'mother',
 'send',
 'prayer',
 'western',
 'union',
 'mother',
 'scout',
 'join',
 'search',
 'worker',
 'millinocket',
 'paper',
 'york',
 'state',
 'polic',
 'dispatch',
 'best',
 'bloodhound',
 'heart',
 'sink',
 'donn',
 'footprint',
 'disappear',
 'edg',
 'sheer',
 'precipic',
 'call',
 'saddl',
 'slip',
 'tri',
 'believ',
 'faint',
 'thread',
 'hope',
 'despond',
 'father',
 'donald',
 'tell',
 'boston',
 'globe',
 'spirit',
 'lift',
 'footprint',
 'appear',
 'near',
 'mountain',
 'base',
 'day',
 'later',
 'final',
 'ninth',
 '

In [16]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.5887055397033691	 
Topic: 0.005*"go" + 0.005*"know" + 0.005*"play" + 0.005*"game" + 0.005*"tell" + 0.004*"come" + 0.004*"film" + 0.004*"star" + 0.004*"love" + 0.004*"think"

Score: 0.3047526180744171	 
Topic: 0.005*"go" + 0.005*"work" + 0.005*"world" + 0.004*"think" + 0.004*"build" + 0.004*"look" + 0.004*"come" + 0.004*"know" + 0.004*"team" + 0.004*"water"

Score: 0.061553653329610825	 
Topic: 0.017*"polic" + 0.011*"attack" + 0.009*"offic" + 0.008*"kill" + 0.007*"report" + 0.007*"isi" + 0.006*"shoot" + 0.006*"tell" + 0.005*"group" + 0.005*"state"

Score: 0.04380832612514496	 
Topic: 0.009*"health" + 0.006*"drug" + 0.005*"case" + 0.005*"medic" + 0.005*"tell" + 0.005*"children" + 0.005*"care" + 0.005*"women" + 0.004*"famili" + 0.004*"know"


In [17]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.8587159514427185	 
Topic: 0.005*"polic" + 0.003*"shoot" + 0.002*"offic" + 0.002*"citi" + 0.002*"store" + 0.002*"kill" + 0.001*"counti" + 0.001*"arrest" + 0.001*"uber" + 0.001*"home"

Score: 0.07600127905607224	 
Topic: 0.005*"appl" + 0.004*"game" + 0.003*"film" + 0.002*"season" + 0.002*"team" + 0.002*"movi" + 0.002*"player" + 0.002*"play" + 0.002*"star" + 0.002*"iphon"

Score: 0.034065697342157364	 
Topic: 0.007*"trump" + 0.004*"clinton" + 0.003*"milo" + 0.002*"women" + 0.002*"donald" + 0.002*"twitter" + 0.002*"hillari" + 0.002*"presid" + 0.002*"yiannopoulo" + 0.002*"obama"

Score: 0.030035046860575676	 
Topic: 0.009*"trump" + 0.007*"clinton" + 0.005*"poll" + 0.004*"vote" + 0.004*"republican" + 0.004*"voter" + 0.004*"cruz" + 0.004*"campaign" + 0.004*"parti" + 0.003*"elect"


In [18]:
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6424863338470459	 Topic: 0.010*"state" + 0.008*"countri" + 0.008*"govern" + 0.006*"american" + 0.005*"polici"
Score: 0.22413262724876404	 Topic: 0.009*"report" + 0.008*"email" + 0.008*"facebook" + 0.007*"student" + 0.007*"news"
Score: 0.016675977036356926	 Topic: 0.017*"compani" + 0.009*"busi" + 0.008*"market" + 0.007*"million" + 0.006*"price"
Score: 0.016672976315021515	 Topic: 0.014*"state" + 0.008*"north" + 0.007*"court" + 0.006*"korea" + 0.005*"report"
Score: 0.016672879457473755	 Topic: 0.040*"trump" + 0.016*"clinton" + 0.013*"presid" + 0.010*"republican" + 0.009*"campaign"
Score: 0.016672739759087563	 Topic: 0.017*"polic" + 0.011*"attack" + 0.009*"offic" + 0.008*"kill" + 0.007*"report"
Score: 0.016672534868121147	 Topic: 0.017*"trump" + 0.007*"think" + 0.007*"news" + 0.007*"white" + 0.007*"media"
Score: 0.0166714396327734	 Topic: 0.009*"health" + 0.006*"drug" + 0.005*"case" + 0.005*"medic" + 0.005*"tell"
Score: 0.016671346500515938	 Topic: 0.005*"go" + 0.005*"work" + 0.0