# 20 News Groups dataset from sklearn open source

In [21]:
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

# Loading gensim and nltk libraries

In [22]:
#text processing
import re
import string
import nltk
from gensim import corpora, models, similarities
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
import numpy as np


# Data Pre-processing
### We will perform the following steps:
* Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
* Words that have fewer than 3 characters are removed.
* All stopwords are removed.
* Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
* Words are stemmed — words are reduced to their root form.

In [23]:
documents_df = pd.DataFrame(documents, columns=['Headline'])

In [24]:
def initial_clean(text):
    """
    Function to clean text-remove punctuations, lowercase text etc.
    """
    text = re.sub("[^a-zA-Z ]", "", text)
    text = text.lower()  # lower case text
    text = nltk.word_tokenize(text)
    return (text)


stop_words = stopwords.words('english')
stop_words.extend(['news', 'say','use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do','took','time','year',
'done', 'try', 'many', 'some','nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line','even', 'also', 'may', 'take', 'come', 'new','said', 'like','people'])

def remove_stop_words(text):
     return [word for word in text if word not in stop_words]


stemmer = PorterStemmer()
def stem_words(text):
    """
    Function to stem words
    """
    try:
        text = [stemmer.stem(word) for word in text]
        text = [word for word in text if len(word) > 1] # no single letter words
    except IndexError:
        pass

    return text


def apply_all(text):
    """
    This function applies all the functions above into one
    """
    return stem_words(remove_stop_words(initial_clean(text)))


# Headlines DataFrame

In [25]:
documents_df.head(5)

Unnamed: 0,Headline
0,Well i'm not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re..."
2,Although I realize that principle is not one o...
3,Notwithstanding all the legitimate fuss about ...
4,"Well, I will have to change the scoring on my ..."


In [26]:

# clean reviews and create new column "tokenized"
import time
t1 = time.time()
documents_df['tokenized_headline'] = documents_df['Headline'].apply(apply_all)
t2 = time.time()
print("Time to clean and tokenize", len(documents_df), "Headline:", (t2-t1)/60, "min") #Time to clean and tokenize 3209 reviews: 0.21254388093948365 min

print('\n')
print("Headlines with their respective tokenized versions:" )
print(documents_df.head(5))
 

Time to clean and tokenize 11314 Headline: 1.3631298224131265 min


Headlines with their respective tokenized versions:
                                            Headline  \
0  Well i'm not sure about the story nad it did s...   
1  \n\n\n\n\n\n\nYeah, do you expect people to re...   
2  Although I realize that principle is not one o...   
3  Notwithstanding all the legitimate fuss about ...   
4  Well, I will have to change the scoring on my ...   

                                  tokenized_headline  
0  [well, im, sure, stori, nad, bias, whati, disa...  
1  [yeah, expect, read, faq, etc, actual, accept,...  
2  [although, realiz, principl, one, strongestpoi...  
3  [notwithstand, legitim, fuss, propos, muchof, ...  
4  [well, chang, score, playoff, pool, unfortunat...  


In [27]:
#LDA
import gensim
import pyLDAvis.gensim

#Create a Gensim dictionary from the tokenized data
tokenized = documents_df['tokenized_headline']
#Creating term dictionary of corpus, where each unique term is assigned an index.
dictionary = corpora.Dictionary(tokenized)
#Filter terms which occurs in less than 1 headline and more than 80% of the headlines.
dictionary.filter_extremes(no_below=1, no_above=0.8)
#convert the dictionary to a bag of words corpus
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
print(corpus[:1])
print()
print([[(dictionary[id], freq) for id, freq in cp] for cp in corpus[:1]])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 2), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 4), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 4), (57, 1), (58, 1), (59, 1), (60, 1)]]

[[('act', 1), ('austria', 1), ('away', 1), ('bias', 1), ('blessingreceiv', 1), ('clearli', 1), ('commit', 1), ('daili', 1), ('degre', 1), ('describ', 1), ('disagre', 1), ('europei', 1), ('exist', 1), ('got', 1), ('govern', 1), ('guiltgo', 1), ('holocaust', 1), ('im', 1), ('incid', 1), ('inhuman', 1), ('isra', 1), ('israel', 2), ('isth', 1), ('jew', 1), ('least', 1), ('live', 1), ('look', 1), ('make', 1), ('media', 4), ('might', 1), ('nad', 1), ('occur',

# LDA with 10 topics and 3 words 

In [None]:
#LDA
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
#saving the model
ldamodel.save('model_10_3.gensim')
topics = ldamodel.print_topics(num_words=3)
print('\n')
print("Now printing the topics and their composition")
print("This output shows the Topic-Words matrix for the 7 topics created and the 4 words within each topic")
for topic in topics:
   print(topic)

# LDA with 5 topics and 5 words

In [None]:
#LDA
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 5, id2word=dictionary, passes=15)
#saving the model
ldamodel.save('model_5_5.gensim')
topics = ldamodel.print_topics(num_words=5)
print('\n')
print("Now printing the topics and their composition")
print("This output shows the Topic-Words matrix for the 7 topics created and the 4 words within each topic")
for topic in topics:
   print(topic)

In [None]:
#finding the similarity of the first review with topics
print('\n')
print("first headline is:")
print(documents_df.Headline[0])
get_document_topics = ldamodel.get_document_topics(corpus[0])
print('\n')
print("The similarity of this review with the topics and respective similarity score are ")
print(get_document_topics)

# visualizing topics for 10 topics and 3 words

In [28]:
lda_viz = gensim.models.ldamodel.LdaModel.load('model_10_3.gensim')
lda_display = pyLDAvis.gensim.prepare(lda_viz, corpus, dictionary, sort_topics=True)
pyLDAvis.enable_notebook()
pyLDAvis.display(lda_display)

# visualizing topics for 5 topics and 5 words

In [29]:
lda_viz = gensim.models.ldamodel.LdaModel.load('model_5_5.gensim')
lda_display = pyLDAvis.gensim.prepare(lda_viz, corpus, dictionary, sort_topics=True)
pyLDAvis.display(lda_display)

# Observations
1. More words are included in each topic when less number of topics are used.
2. Diverse distribution of words is considered in case of less number of topics
3. number of topics and number of words are hyperparameters that needs to be experimented with and adjusted as seems fit.
4. As the number of topics changes, the distribution of words is affected in the same corpus
5. As the nubmer of words increases, the dive