In [11]:
# find the number of topics in the given corpus

import os
import gensim
from gensim.models import LsiModel
from gensim import models
from gensim import corpora
from gensim.utils import lemmatize
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import remove_stopwords, stem_text
from gensim.parsing.preprocessing import strip_numeric, strip_short,strip_multiple_whitespaces,strip_non_alphanum,strip_punctuation,strip_tags,preprocess_string
import pandas as pd
from gensim import similarities
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt
from pprint import pprint



#read the data
corpus_dir = 'https://raw.githubusercontent.com/Ramaseshanr/anlp/master/corpus/bbc-text.csv'
df_corpus = pd.read_csv(corpus_dir,names=['category', 'text'])
corpus = df_corpus['text'].values.tolist()
corpus = corpus[1:]
my_filter = [
    lambda x: x.lower(), strip_tags, strip_punctuation,
    strip_multiple_whitespaces, strip_numeric,
    remove_stopwords, strip_short, stem_text
]


def preprocessing(corpus):

    for document in corpus:
        doc = strip_numeric(document)
        doc = remove_stopwords(doc)
        doc = strip_short(doc,3)
        doc = stem_text(doc)
        doc = strip_punctuation(doc)
        strip_tags(doc)
        yield gensim.utils.tokenize(doc, lower=True)


texts = preprocessing(corpus)
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=1, keep_n=25000)

doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in preprocessing(corpus)]
tfidf = models.TfidfModel(doc_term_matrix)
corpus_tfidf = tfidf[doc_term_matrix]

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary)  # initialize an LSI transformation
pprint(lsi.print_topics(num_topics=5, num_words=10)) #num words in each topic


# lot of words in the corpus has very less freq that's why we are unable to find the pattern
# which results into large value of sigma(n).
# the smaller the sigma(n) the better the result

# also there are a lot of words common in the both the topics like sports and film

[(0,
  '0.126*"labour" + 0.107*"elect" + 0.107*"blair" + 0.101*"brown" + '
  '0.100*"game" + 0.100*"tax" + 0.094*"parti" + 0.094*"film" + 0.090*"tori" + '
  '0.087*"peopl"'),
 (1,
  '0.275*"labour" + 0.228*"blair" + 0.226*"elect" + 0.205*"brown" + '
  '0.195*"tori" + 0.193*"tax" + 0.188*"parti" + -0.158*"film" + -0.139*"game" '
  '+ 0.131*"howard"'),
 (2,
  '0.178*"mobil" + -0.167*"film" + -0.140*"award" + -0.136*"best" + '
  '0.134*"phone" + 0.119*"growth" + -0.114*"win" + -0.112*"england" + '
  '0.102*"bn" + 0.097*"bank"'),
 (3,
  '-0.355*"film" + -0.209*"award" + 0.168*"england" + -0.148*"best" + '
  '-0.139*"oscar" + -0.124*"nomin" + -0.115*"music" + -0.114*"actor" + '
  '0.103*"game" + -0.102*"star"'),
 (4,
  '0.223*"mobil" + -0.184*"film" + 0.178*"phone" + -0.147*"growth" + '
  '-0.143*"economi" + -0.142*"bn" + -0.137*"dollar" + -0.137*"bank" + '
  '-0.130*"rate" + 0.117*"game"')]
