In [2]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

# 20 News Groups dataset from sklearn open source

In [5]:
#text processing
import re
import string
import nltk
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
import numpy as np


# Create a Data Frame from the Headlines

In [13]:
documents_df = pd.DataFrame(documents, columns=['Headline'])

In [8]:
def initial_clean(text):
    """
    Function to clean text-remove punctuations, lowercase text etc.
    """
    text = re.sub("[^a-zA-Z ]", "", text)
    text = text.lower()  # lower case text
    text = nltk.word_tokenize(text)
    return (text)


stop_words = stopwords.words('english')
stop_words.extend(['news', 'say','use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do','took','time','year',
'done', 'try', 'many', 'some','nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line','even', 'also', 'may', 'take', 'come', 'new','said', 'like','people'])

def remove_stop_words(text):
     return [word for word in text if word not in stop_words]


stemmer = PorterStemmer()
def stem_words(text):
    """
    Function to stem words
    """
    try:
        text = [stemmer.stem(word) for word in text]
        text = [word for word in text if len(word) > 1] # no single letter words
    except IndexError:
        pass

    return text


def apply_all(text):
    """
    This function applies all the functions above into one
    """
    return stem_words(remove_stop_words(initial_clean(text)))


# Headlines DataFrame

In [14]:
documents_df.head(5)

Unnamed: 0,Headline
0,Well i'm not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,Although I realize that principle is not one o...
3,Notwithstanding all the legitimate fuss about ...
4,"Well, I will have to change the scoring on my ..."


In [19]:

# clean reviews and create new column "tokenized"
import time
t1 = time.time()
documents_df['tokenized_headline'] = documents_df['Headline'].apply(apply_all)
t2 = time.time()
print("Time to clean and tokenize", len(documents_df), "Headline:", (t2-t1)/60, "min") #Time to clean and tokenize 3209 reviews: 0.21254388093948365 min

print('\n')
print("Headlines with their respective tokenized versions:" )
print(documents_df.head(5))
 

Time to clean and tokenize 11314 reviews: 1.34641166528066 min


reviews with their respective tokenize version:
                                            Headline  \
0  Well i'm not sure about the story nad it did s...   
1  \n\n\n\n\n\n\nYeah, do you expect people to re...   
2  Although I realize that principle is not one o...   
3  Notwithstanding all the legitimate fuss about ...   
4  Well, I will have to change the scoring on my ...   

                                  tokenized_headline  
0  [well, im, sure, stori, nad, bias, whati, disa...  
1  [yeah, expect, read, faq, etc, actual, accept,...  
2  [although, realiz, principl, one, strongestpoi...  
3  [notwithstand, legitim, fuss, propos, muchof, ...  
4  [well, chang, score, playoff, pool, unfortunat...  


In [22]:
#LDA
import gensim
import pyLDAvis.gensim

#Create a Gensim dictionary from the tokenized data
tokenized = documents_df['tokenized_headline']
#Creating term dictionary of corpus, where each unique term is assigned an index.
dictionary = corpora.Dictionary(tokenized)
#Filter terms which occurs in less than 1 headline and more than 80% of the headlines.
dictionary.filter_extremes(no_below=1, no_above=0.8)
#convert the dictionary to a bag of words corpus
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
print(corpus[:1])
print()
print([[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 4), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 4), (57, 1), (58, 1), (59, 1), (60, 1)]]

[[('act', 1), ('austria', 1), ('away', 1), ('bias', 1), ('blessingreceiv', 1), ('clearli', 1), ('commit', 1), ('daili', 1), ('degre', 1), ('describ', 1), ('disagre', 1), ('europei', 1), ('exist', 1), ('got', 1), ('govern', 1), ('guiltgo', 1), ('holocaust', 1), ('im', 1), ('incid', 1), ('inhuman', 1), ('isra', 1), ('israel', 2), ('isth', 1), ('jew', 1), ('least', 1), ('live', 1), ('look', 1), ('make', 1), ('media', 4), ('might', 1), ('nad', 1), ('occur',

In [23]:
#LDA
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 7, id2word=dictionary, passes=15)
#saving the model
ldamodel.save('model_combined.gensim')
topics = ldamodel.print_topics(num_words=4)
print('\n')
print("Now printing the topics and their composition")
print("This output shows the Topic-Words matrix for the 7 topics created and the 4 words within each topic")
for topic in topics:
   print(topic)



Now printing the topics and their composition
This output shows the Topic-Words matrix for the 7 topics created and the 4 words within each topic
(0, '0.008*"key" + 0.008*"use" + 0.007*"one" + 0.005*"system"')
(1, '0.010*"god" + 0.007*"one" + 0.006*"christian" + 0.005*"armenian"')
(2, '0.017*"entri" + 0.010*"file" + 0.008*"xx" + 0.005*"char"')
(3, '0.012*"game" + 0.011*"drive" + 0.009*"team" + 0.008*"card"')
(4, '0.010*"file" + 0.009*"program" + 0.008*"use" + 0.008*"window"')
(5, '0.020*"car" + 0.011*"bike" + 0.006*"ride" + 0.005*"engin"')
(6, '0.009*"one" + 0.009*"dont" + 0.005*"well" + 0.005*"im"')


In [26]:
#finding the similarity of the first review with topics
print('\n')
print("first headline is:")
print(documents_df.Headline[0])
get_document_topics = ldamodel.get_document_topics(corpus[0])
print('\n')
print("The similarity of this review with the topics and respective similarity score are ")
print(get_document_topics)



first headline is:
Well i'm not sure about the story nad it did seem biased. What
I disagree with is your statement that the U.S. Media is out to
ruin Israels reputation. That is rediculous. The U.S. media is
the most pro-israeli media in the world. Having lived in Europe
I realize that incidences such as the one described in the
letter have occured. The U.S. media as a whole seem to try to
ignore them. The U.S. is subsidizing Israels existance and the
Europeans are not (at least not to the same degree). So I think
that might be a reason they report more clearly on the
atrocities.
	What is a shame is that in Austria, daily reports of
the inhuman acts commited by Israeli soldiers and the blessing
received from the Government makes some of the Holocaust guilt
go away. After all, look how the Jews are treating other races
when they got power. It is unfortunate.



The similarity of this review with the topics and respective similarity score are 
[(1, 0.5368451), (3, 0.014184573), (6, 0.

In [None]:
#visualizing topics
lda_viz = gensim.models.ldamodel.LdaModel.load('model_combined.gensim')
lda_display = pyLDAvis.gensim.prepare(lda_viz, corpus, dictionary, sort_topics=True)
pyLDAvis.show(lda_display)