# Topic Modelling of BBC Articles

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import os

import nltk
import gensim
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from string import punctuation
import re
import nltk
from nltk.corpus import stopwords
from gensim.models import LsiModel, LdaModel, CoherenceModel

os.chdir(r'/Users/rasheshkothari/Desktop/Text Analytics/Assignment 3/')

In [4]:
bbc_articles_data = pd.read_csv('BBC-articles.csv')
bbc_articles_data.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


### Text Vectorization and Model Preparation

In [3]:
stopwords = stopwords.words('english')
articles_list = bbc_articles_data['text'].tolist()

# Cleaned the text of punctuations, stopwords and special characters
# Lemmatized the words
def cleaned_text(text):
    text = text.strip(punctuation).lower()
    text = re.sub(r'[!?,.\:;\n\t]+', '', text)
    
    word= nltk.tokenize.word_tokenize(text)
    word = [w for w in word if w.isalpha()]
    word = [w for w in word if w not in stopwords and len(w) > 2]

    wordnet = nltk.stem.WordNetLemmatizer()
    lemmatized_words = [wordnet.lemmatize(w) for w in word]
    return lemmatized_words

##### After normal cleaning of the text corpus

In [4]:
tokenized_words = []
for article in articles_list:
    tokenized_words.append(cleaned_text(article))

    

dictionary1 = Dictionary(tokenized_words)  
print(dictionary1)
dictionary1.token2id 
len(dictionary1.token2id)
dtm1 = [dictionary1.doc2bow(doc) for doc in tokenized_words]

tfidf1 = TfidfModel(dtm1)
tfidf1 = tfidf1[dtm1]

Dictionary(24212 unique tokens: ['abiding', 'according', 'adam', 'added', 'advert']...)


In [5]:
lsi_model1 = LsiModel(tfidf1, id2word = dictionary1, num_topics = 5)

In [6]:
lda_model1 = LdaModel(tfidf1, id2word = dictionary1, num_topics = 5)

##### With term frequency filter, to exclude the top 10% of the most frequent words and words that appear less than 5 times in the documents (drawing from Zipf's Law)

In [7]:
dictionary2 = dictionary1

# Filtered the extreme words
dictionary2.filter_extremes(no_below = 5, no_above = 0.90)
dtm2 = [dictionary2.doc2bow(doc) for doc in tokenized_words]
tfidf2 = TfidfModel(dtm2)
tfidf2 = tfidf2[dtm2]

In [8]:
lsi_model2 = LsiModel(tfidf2, id2word = dictionary2, num_topics = 5)
lda_model2 = LdaModel(tfidf2, id2word = dictionary2, num_topics = 5)

##### With a part of speech filter, to limit your TD-IDF matrix to nouns only

In [9]:
tokenized_nouns = []

# Iterated over a for loop to populate a list of nouns
for i in tokenized_words:
    words_pos = nltk.pos_tag(i)
    list_of_nouns = [w for w, p in words_pos if p == 'NN']
    tokenized_nouns.append(list_of_nouns)

dictionary3 = Dictionary(tokenized_nouns)

dtm3 = [dictionary3.doc2bow(doc) for doc in tokenized_nouns]
tfidf3 = TfidfModel(dtm3)
tfidf3 = tfidf3[dtm3]

In [10]:
lsi_model3 = LsiModel(tfidf3, id2word = dictionary3, num_topics = 5)
lda_model3 = LdaModel(tfidf3, id2word = dictionary3, num_topics = 5)

In [11]:
# Get most frequent keywords for each article

def getMostFrequentKeywords(model, corpus, texts): 
    topic_keywords_list = []
    
    # Got the main topic of each document
    for i, row in enumerate(model[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        
        # Got the frequently used keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  
                try:
                    word_prop = model.show_topic(topic_num)
                    topic_keywords = ", ".join([word for word, prop in word_prop[:5]])
                    topic_keywords_list.append(topic_keywords)
                except:
                    continue
            else:
                break
    return(pd.Series(topic_keywords_list))

# Assigned the keywords for each vectorization and model combination
bbc_articles_data['LSIModelKeywords1'] = getMostFrequentKeywords(model=lsi_model1, corpus=tfidf1, texts=bbc_articles_data.text)
bbc_articles_data['LDAModelKeywords1'] = getMostFrequentKeywords(model=lda_model1, corpus=tfidf1, texts=bbc_articles_data.text)
bbc_articles_data.head()

Unnamed: 0,category,text,LSIModelKeywords1,LDAModelKeywords1
0,tech,tv future in the hands of viewers with home th...,"older, mitsubishi, trial, scooped, affordable","family, position, simple, older, slow"
1,business,worldcom boss left books alone former worldc...,"york, warned, motor, option, vulnerability","family, position, simple, older, slow"
2,sport,tigers wary of farrell gamble leicester say ...,"york, warned, motor, option, vulnerability","pig, level, fourth, budget, something"
3,sport,yeading face newcastle in fa cup premiership s...,"older, trial, york, warned, motor","pig, level, fourth, budget, something"
4,entertainment,ocean s twelve raids box office ocean s twelve...,"budget, something, pig, fourth, plan","pig, level, fourth, budget, something"


In [12]:
bbc_articles_data['LSIModelKeywords2'] = getMostFrequentKeywords(model=lsi_model2, corpus=tfidf2, texts=bbc_articles_data.text)
bbc_articles_data['LDAModelKeywords2'] = getMostFrequentKeywords(model=lda_model2, corpus=tfidf2, texts=bbc_articles_data.text)

In [13]:
bbc_articles_data['LSIModelKeywords3'] = getMostFrequentKeywords(model=lsi_model3, corpus=tfidf3, texts=bbc_articles_data.text)
bbc_articles_data['LDAModelKeywords3'] = getMostFrequentKeywords(model=lda_model3, corpus=tfidf3, texts=bbc_articles_data.text)

In [14]:
# Combined all the keywords
bbc_articles_data['AllKeywords'] = bbc_articles_data['LSIModelKeywords1'] + ', ' + bbc_articles_data['LSIModelKeywords2'] + ', ' + bbc_articles_data['LSIModelKeywords3'] + ', ' + bbc_articles_data['LDAModelKeywords1'].fillna(method='ffill') + ', ' + bbc_articles_data['LDAModelKeywords2'] + ', ' + bbc_articles_data['LDAModelKeywords3']

In [16]:
# Calculated 5 most common keywords across the four groups of keywords
from collections import Counter 
for i in bbc_articles_data.index:
    keywords = bbc_articles_data.loc[i, 'AllKeywords']
    keywords = keywords.split(',')
    most_occur = Counter(keywords).most_common(5) 
    bbc_articles_data.loc[i, 'Top5FreqWords'] = ','.join([word[0] for word in most_occur])

bbc_articles_data[['text', 'Top5FreqWords']].head(3)

Unnamed: 0,text,Top5FreqWords
0,tv future in the hands of viewers with home th...,"phone, mobile, party, blair, game"
1,worldcom boss left books alone former worldc...,"film,york, warned, motor, option"
2,tigers wary of farrell gamble leicester say ...,"mobile, phone, economy, film, game"


In [20]:
bbc_articles_data = bbc_articles_data.drop(['AllKeywords'], axis=1)
bbc_articles_data.head(5)

Unnamed: 0,category,text,LSIModelKeywords1,LDAModelKeywords1,LSIModelKeywords2,LDAModelKeywords2,LSIModelKeywords3,LDAModelKeywords3,Top5FreqWords
0,tech,tv future in the hands of viewers with home th...,"older, mitsubishi, trial, scooped, affordable","family, position, simple, older, slow","mobile, phone, film, award, best","blair, party, phone, game, mobile","election, tax, party, blair, government","phone, technology, music, software, game","phone, mobile, party, blair, game"
1,business,worldcom boss left books alone former worldc...,"york, warned, motor, option, vulnerability","family, position, simple, older, slow","mobile, phone, film, award, best","virus, woodward, player, film, lion","election, tax, party, blair, government","film, search, bank, dollar, price","film,york, warned, motor, option"
2,sport,tigers wary of farrell gamble leicester say ...,"york, warned, motor, option, vulnerability","pig, level, fourth, budget, something","mobile, phone, economy, growth, film","blair, party, phone, game, mobile","film, game, england, award, oscar","blair, sale, tax, party, economy","mobile, phone, economy, film, game"
3,sport,yeading face newcastle in fa cup premiership s...,"older, trial, york, warned, motor","pig, level, fourth, budget, something","mobile, phone, economy, growth, film","blair, party, phone, game, mobile","film, game, england, award, oscar","blog, blair, domain, party, election","mobile, phone, film, game, blair"
4,entertainment,ocean s twelve raids box office ocean s twelve...,"budget, something, pig, fourth, plan","pig, level, fourth, budget, something","film, award, england, best, oscar","blair, party, phone, game, mobile","election, tax, party, blair, government","blair, sale, tax, party, economy","party, blair, something, pig, fourth"


In [21]:
# Exported the populated corpus to a csv
bbc_articles_data.to_csv('BBC_News_Keywords.csv',index=False,encoding='utf-8')

We used 6 different combinations of TF-IDF vectorization techniques using algorithms(LSI and LDA). From the results we can observe that, LDA algorithm with TF-IDF vectorization works better on the dataset as the keywords from the most dominant topic are more relevant and descriptive for each article. 

Below are the 5 most frequent words in each category
LSI with TF-IDF vetorization (normal cleaning): older, mitsubishi, trial, scooped, affordable.
Very few keywords are related to technology.

LDA with TF-IDF Vectorization (normal cleaning): family, position, simple, older, slow.
Few keywords are related to technolgy.

LSI with TF-IDF vectorization (term frequency filter): mobile, phone, film, award, best. Very few keywords are related to technology

LDA with TF-IDF vectorization (term frequency filter): blair, party, phone, game, mobile. Very few keywords are related to technology.

LSI with TF-IDF vectorization (part of speech filter): election, tax, party, blair, government. Very few keywords are related to technology

LDA with TF-IDF vectorization (part of speech filter): phone, technology, music, software, game. Very few keywords are related to technology.

We can clearly observe that LDA with TF-IDF vetorization in normal cleaning method performs the best amogst all the combinations.