#### Topic Modelling
Randomly collect as much news content as possible from different fields for training (in any form: web scraping, PDF, newspaper, etc.).
#### Use case
Randomly collect another set of content for testing.
Build a topic model to understand the topics in the test data.



In [1]:
import nltk
import re
import numpy as np
import pandas as pd
import glob

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

from nltk.stem import WordNetLemmatizer, SnowballStemmer

In [2]:
# Load the raw training text: each element of docs_text is the full
# content of one source file.
docs_text = []
# The context manager closes the handle even if read() raises.
with open('news.txt', mode='rt', encoding='utf-8') as file:
    docs_text.append(file.read())
print(docs_text)

['Vodafone wins Rs 22,000 cr tax arbitration suit against India; govt liability estimated around Rs 75 cr\nThe Government of India\'s liability will be restricted to about Rs 75 crore -- Rs 30 crore in cost and another Rs 45 crore in tax refund, sources with direct knowledge of the matter said\nBritish telecom giant Vodafone Group plc on Friday won an arbitration against the Indian government over a demand for Rs 22,100 crore in taxes using retrospective legislation.\n\nAn international arbitration tribunal ruled that India\'s demand in past taxes were in breach of fair treatment under a bilateral investment protection pact.\n\n"The award is confidential but Vodafone can confirm that the tribunal has found (it) in Vodafone\'s favour," Vodafone Group said in a statement. "We are studying the lengthy documents and can make no further comment at this time."\n\nIt was not immediately known if the Indian government will abide by the arbitration award.\n\nThe Government of India\'s liability

In [3]:
# Shared normalisers, built once and reused for every token instead of
# being re-instantiated on each call.
stemmer = SnowballStemmer("english")
wordnet_lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (pos='v') and the resulting
    lemma is then stemmed with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        One token (word) to normalise.

    Returns
    -------
    str
        The stemmed form of the verb-lemma of `text`.
    """
    return stemmer.stem(wordnet_lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize `text` and return its normalised tokens as a list.

    Tokens come from gensim's simple_preprocess. A token is kept only if
    it is longer than three characters and is not in gensim's STOPWORDS
    set; each kept token is passed through lemmatize_stemming and
    lower-cased.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word).lower()
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

In [4]:
# Run every raw training document through preprocess(), giving one token
# list per document, then preview the first two results.
processed_docs = [preprocess(raw_doc) for raw_doc in docs_text]
print(processed_docs[:2])

[['vodafon', 'win', 'arbitr', 'suit', 'india', 'govt', 'liabil', 'estim', 'govern', 'india', 'liabil', 'restrict', 'crore', 'crore', 'cost', 'crore', 'refund', 'sourc', 'direct', 'knowledg', 'matter', 'say', 'british', 'telecom', 'giant', 'vodafon', 'group', 'friday', 'arbitr', 'indian', 'govern', 'demand', 'crore', 'tax', 'retrospect', 'legisl', 'intern', 'arbitr', 'tribun', 'rule', 'india', 'demand', 'past', 'tax', 'breach', 'fair', 'treatment', 'bilater', 'invest', 'protect', 'pact', 'award', 'confidenti', 'vodafon', 'confirm', 'tribun', 'vodafon', 'favour', 'vodafon', 'group', 'say', 'statement', 'studi', 'lengthi', 'document', 'comment', 'time', 'immedi', 'know', 'indian', 'govern', 'abid', 'arbitr', 'award', 'govern', 'india', 'liabil', 'restrict', 'crore', 'crore', 'cost', 'crore', 'refund', 'sourc', 'direct', 'knowledg', 'matter', 'say', 'vodafon', 'arbitr', 'tribun', 'challeng', 'india', 'usag', 'legisl', 'give', 'power', 'retrospect', 'deal', 'like', 'vodafon', 'billion', 'ac

In [8]:
# Build the token -> integer id vocabulary from the processed training
# documents; it is used below to create the bag-of-words corpus.
dictionary = corpora.Dictionary(documents=processed_docs)

In [9]:
# Summarise the vocabulary: number of unique tokens plus a short sample.
print(dictionary) 

Dictionary(151 unique tokens: ['abid', 'accru', 'acquisit', 'alleg', 'amend']...)


In [10]:
# Bag-of-words corpus: each processed document is converted by
# dictionary.doc2bow into a list of (token_id, count) pairs.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [11]:
# Show the bag-of-words corpus: one list of (token_id, count) pairs per document.
print(bow_corpus)

[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 6), (7, 1), (8, 1), (9, 1), (10, 2), (11, 2), (12, 2), (13, 1), (14, 1), (15, 1), (16, 2), (17, 2), (18, 1), (19, 1), (20, 4), (21, 1), (22, 1), (23, 5), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 2), (30, 2), (31, 2), (32, 12), (33, 1), (34, 1), (35, 1), (36, 1), (37, 7), (38, 1), (39, 1), (40, 2), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 2), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 2), (60, 1), (61, 1), (62, 5), (63, 1), (64, 2), (65, 1), (66, 3), (67, 1), (68, 1), (69, 3), (70, 1), (71, 10), (72, 6), (73, 3), (74, 2), (75, 2), (76, 1), (77, 2), (78, 1), (79, 1), (80, 3), (81, 1), (82, 5), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 2), (89, 1), (90, 1), (91, 1), (92, 4), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 2), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 

In [13]:
# Fit the LDA topic model on the bag-of-words training corpus.
# random_state is a fixed seed, passed for reproducibility.
# NOTE(review): num_topics=20 is hard-coded; bow_corpus has one entry per
# element of processed_docs, so confirm the corpus is large enough to
# separate that many topics.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=20,
    passes=4,
    update_every=1,
    random_state=100,
)

In [14]:
# List each topic returned by the trained model together with its
# weighted top words, separated by blank lines.
topic_template = "Topic: {} \nWords: {}"
for topic_id, topic_words in lda_model.print_topics():
    print(topic_template.format(topic_id, topic_words))
    print("\n")

Topic: 0 
Words: 0.009*"vodafon" + 0.008*"crore" + 0.008*"india" + 0.008*"demand" + 0.008*"indian" + 0.007*"arbitr" + 0.007*"challeng" + 0.007*"liabil" + 0.007*"compani" + 0.007*"govern"


Topic: 1 
Words: 0.008*"vodafon" + 0.008*"crore" + 0.008*"demand" + 0.007*"india" + 0.007*"arbitr" + 0.007*"say" + 0.007*"liabil" + 0.007*"compani" + 0.007*"indian" + 0.007*"govern"


Topic: 2 
Words: 0.007*"percent" + 0.007*"power" + 0.007*"pact" + 0.007*"parliament" + 0.007*"pass" + 0.007*"past" + 0.007*"pay" + 0.007*"penalti" + 0.007*"origin" + 0.007*"phone"


Topic: 3 
Words: 0.007*"percent" + 0.007*"power" + 0.007*"pact" + 0.007*"parliament" + 0.007*"pass" + 0.007*"past" + 0.007*"pay" + 0.007*"penalti" + 0.007*"origin" + 0.007*"phone"


Topic: 4 
Words: 0.046*"crore" + 0.037*"vodafon" + 0.026*"india" + 0.022*"indian" + 0.021*"demand" + 0.021*"say" + 0.017*"arbitr" + 0.016*"liabil" + 0.015*"tribun" + 0.014*"compani"


Topic: 5 
Words: 0.009*"vodafon" + 0.009*"crore" + 0.008*"india" + 0.007*"arbit

In [15]:
# Load the raw held-out text: each element of test_set is the full
# content of one source file.
test_set = []

# The context manager closes the handle even if read() raises.
with open('news_test.txt', mode='rt', encoding='utf-8') as file:
    test_set.append(file.read())
print(test_set)

['₹20,000 cr retro tax case: Vodafone wins arbitration against India\n2 min read . Updated: 25 Sep 2020, 07:19 PM IST\nEdited By J. Jagannath\nPermanent Court of Arbitration in The Hague holds Indian tax authorities in breach of fair treatment doctrine\n\'Govt to study the arbitration case award in Vodafone International Holding BV,\' says Ministry of Finance\nTopics\nVodafone\nNew Delhi: UK telecom major Vodafone Group Plc on Friday won an international arbitration against India over retrospective tax demand of ₹20,000 crore. The Permanent Court of Arbitration in The Hague ruled that conduct of Income Tax Department is in breach of \'fair and equitable\' treatment.\n\nVodafone was represented at The Hague by DMD Advocates.\n\nThe tribunal ruled that the Indian government\'s imposition of a tax liability on Vodafone is in breach of the investment treaty agreement between India and the Netherlands, Reuters reported while quoting a source.\n\nThe tribunal, in its ruling, said the governm

In [16]:
# Apply the same preprocess() pipeline used for the training documents to
# every test document, then preview the first two token lists.
processed_test = [preprocess(raw_doc) for raw_doc in test_set]
print(processed_test[:2])

[['retro', 'case', 'vodafon', 'win', 'arbitr', 'india', 'read', 'updat', 'edit', 'jagannath', 'perman', 'court', 'arbitr', 'hagu', 'hold', 'indian', 'author', 'breach', 'fair', 'treatment', 'doctrin', 'govt', 'studi', 'arbitr', 'case', 'award', 'vodafon', 'intern', 'hold', 'say', 'ministri', 'financ', 'topic', 'vodafon', 'delhi', 'telecom', 'major', 'vodafon', 'group', 'friday', 'intern', 'arbitr', 'india', 'retrospect', 'demand', 'crore', 'perman', 'court', 'arbitr', 'hagu', 'rule', 'conduct', 'incom', 'depart', 'breach', 'fair', 'equit', 'treatment', 'vodafon', 'repres', 'hagu', 'advoc', 'tribun', 'rule', 'indian', 'govern', 'imposit', 'liabil', 'vodafon', 'breach', 'invest', 'treati', 'agreement', 'india', 'netherland', 'reuter', 'report', 'quot', 'sourc', 'tribun', 'rule', 'say', 'govern', 'ceas', 'seek', 'due', 'vodafon', 'million', 'pound', 'million', 'compani', 'partial', 'compens', 'legal', 'cost', 'sourc', 'say', 'govern', 'studi', 'award', 'aspect', 'care', 'consult', 'counse

In [17]:
# Topic legend: the weighted top words of each topic learned from the
# training corpus.
for idx, topic in lda_model.print_topics():
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

# Apply the trained model to the test documents. The loop above only
# describes the model itself; this part scores each test document against
# those topics. Test tokens are mapped through the training dictionary, so
# tokens that are not in it are ignored by doc2bow.
test_bow_corpus = [dictionary.doc2bow(doc) for doc in processed_test]
for doc_idx, doc_bow in enumerate(test_bow_corpus):
    # get_document_topics returns (topic_id, probability) pairs; list the
    # most probable topic first.
    ranked_topics = sorted(
        lda_model.get_document_topics(doc_bow),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print("Test document: {}".format(doc_idx))
    for topic_id, score in ranked_topics:
        print("Topic: {} \nScore: {:.4f} \nWords: {}".format(
            topic_id, score, lda_model.print_topic(topic_id, 10)))
        print("\n")

Topic: 0 
Words: 0.009*"vodafon" + 0.008*"crore" + 0.008*"india" + 0.008*"demand" + 0.008*"indian" + 0.007*"arbitr" + 0.007*"challeng" + 0.007*"liabil" + 0.007*"compani" + 0.007*"govern"


Topic: 1 
Words: 0.008*"vodafon" + 0.008*"crore" + 0.008*"demand" + 0.007*"india" + 0.007*"arbitr" + 0.007*"say" + 0.007*"liabil" + 0.007*"compani" + 0.007*"indian" + 0.007*"govern"


Topic: 2 
Words: 0.007*"percent" + 0.007*"power" + 0.007*"pact" + 0.007*"parliament" + 0.007*"pass" + 0.007*"past" + 0.007*"pay" + 0.007*"penalti" + 0.007*"origin" + 0.007*"phone"


Topic: 3 
Words: 0.007*"percent" + 0.007*"power" + 0.007*"pact" + 0.007*"parliament" + 0.007*"pass" + 0.007*"past" + 0.007*"pay" + 0.007*"penalti" + 0.007*"origin" + 0.007*"phone"


Topic: 4 
Words: 0.046*"crore" + 0.037*"vodafon" + 0.026*"india" + 0.022*"indian" + 0.021*"demand" + 0.021*"say" + 0.017*"arbitr" + 0.016*"liabil" + 0.015*"tribun" + 0.014*"compani"


Topic: 5 
Words: 0.009*"vodafon" + 0.009*"crore" + 0.008*"india" + 0.007*"arbit