In [None]:
import nltk
import spacy
import os
import re
import pandas as pd
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
#from numpy.distutils.system_info import dfftw_info
from gensim.models import CoherenceModel


In [None]:
from nltk.corpus import stopwords
import glob
# NLTK's English stop-word list plus a few extra tokens to filter out.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [None]:
# Glob pattern for the article workbooks: Articlespk holds the article content for
# every month, keyed by article id.
path="./Articlespk/*.xlsx"
# NOTE(review): glob.glob makes no ordering guarantee; rows are later matched to
# topic vectors by position, so confirm the file order is stable between runs.
files = glob.glob(path)

In [None]:
# Load every workbook and stack them into one frame (all articles with article id).
# The per-file frames are collected in a list and concatenated once:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling it
# inside the loop re-copies the accumulated frame on every iteration.
frames = []
for f in files:
    df = pd.read_excel(f)
    print(len(df))  # number of rows contributed by this workbook
    frames.append(df)
if not frames:
    # pd.concat([]) raises an unhelpful "No objects to concatenate"; fail clearly instead.
    raise FileNotFoundError(f"no workbooks matched {path!r}")
all_articles = pd.concat(frames, ignore_index=True, sort=True)
# df_list is the list of all article content (the "english" column)
df_list = list(all_articles["english"])


In [None]:
# Preview the first rows of the combined frame.
all_articles.head()

In [None]:
# Drop the loader temporaries from the kernel namespace.
del df, files, path

In [None]:
def sent_to_words(sentences):
    """Lazily tokenise each item of `sentences`.

    Every item is converted with str() and passed to
    gensim.utils.simple_preprocess with deacc=True (accent marks are stripped
    from the tokens). One token list is yielded per input item.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

In [None]:
# Tokenise every article in df_list. The generator is materialised into a list
# because data_words is consumed more than once further down.
data_words = list(sent_to_words(df_list))

In [None]:
# Spot-check one tokenised article (IndexError if there are fewer than 5001 documents).
data_words[5000]

In [None]:
# Drop the raw article strings now that data_words holds their tokens.
del df_list

In [None]:
%%time
# Connector words handed to Phrases through its `common_terms` argument.
# NOTE(review): `common_terms` is the gensim 3.x spelling (gensim 4 calls it
# `connector_words`) — confirm the pinned gensim version.
common_terms = ["of", "with", "without", "and", "or", "the", "a"]
# Train the bigram detector on the tokenised corpus; a higher threshold yields fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=1, threshold=30, common_terms=common_terms)


In [None]:
# Wrap the trained Phrases model in a Phraser; documents are transformed with bigram_mod[doc].
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [None]:
# Train the trigram detector on the bigram-merged corpus and wrap it in a Phraser.
# These two names were previously only present as commented-out code, yet later
# cells save `trigram_mod`, call it in make_trigrams and `del trigram`, so a fresh
# Restart & Run All failed with NameError.
trigram = gensim.models.Phrases(bigram_mod[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [None]:
# This is the first write into ./lda_model_data; create the directory up front so
# the save does not fail on a clean checkout.
os.makedirs("./lda_model_data", exist_ok=True)
bigram_mod.save("./lda_model_data/bigram_model")

In [None]:
# Persist the trigram Phraser under ./lda_model_data.
trigram_mod.save("./lda_model_data/trigram_model")
# To reload it later:
#gensim.models.phrases.Phraser.load("./lda_model_data/trigram_model")

In [None]:
def remove_stopwords(texts):
    """Return one token list per document with stop words removed.

    Each document is converted with str() and re-tokenised by
    simple_preprocess; tokens found in the module-level `stop_words` are
    dropped.

    Parameters
    ----------
    texts : iterable of documents (anything str() can render)

    Returns
    -------
    list of list of str
    """
    # Build the lookup set once: `stop_words` is a list, so testing membership
    # against it directly costs a linear scan for every single token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]
# Strip stop words from every tokenised article.
data_words_nostops = remove_stopwords(data_words)


In [None]:
# Preview a handful of documents only; evaluating the bare name asks the notebook
# to render the token lists of the entire corpus as cell output.
data_words_nostops[:5]

In [None]:
def make_bigrams(texts):
    """Run every tokenised document through the module-level `bigram_mod` and return the results as a list."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Run every tokenised document through `bigram_mod` and then `trigram_mod`, returning the results as a list."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

In [None]:
# One-time model download:  python3 -m spacy download en_core_web_sm
# Load the model by its full package name: the 'en' shortcut link was removed in
# spaCy 3, whereas the package name resolves on both spaCy 2 and spaCy 3.
# The parser and NER components are disabled for efficiency; lemmatization()
# below only reads token.pos_ and token.lemma_.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise tokenised documents, keeping only selected parts of speech.

    Each document's tokens are joined with single spaces and parsed by the
    module-level `nlp` pipeline; the lemma of every parsed token whose
    `pos_` is in `allowed_postags` is kept.

    Parameters
    ----------
    texts : iterable of token lists
    allowed_postags : collection of POS tags to keep

    Returns
    -------
    list of list of str, one lemma list per input document.
    """
    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(tokens) for tokens in texts]


In [None]:
# Form bigrams on the stop-word-filtered tokens.
data_words_bigrams = make_bigrams(data_words_nostops)
# Form trigrams.
# NOTE(review): make_trigrams itself applies bigram_mod before trigram_mod, so
# feeding it data_words_bigrams runs the bigram pass twice — confirm that is intended.
data_words_trigrams = make_trigrams(data_words_bigrams)
# The phrase models themselves are saved in their own cells.

In [None]:
# Spot-check the bigram output for one article (IndexError if there are fewer than 5001 documents).
data_words_bigrams[5000]

In [None]:
# Lemmatise the phrase-merged documents, keeping only nouns, adjectives, verbs and adverbs.
data_lemmatized = lemmatization(data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [None]:
# Release the Phrases trainers and the stop-word-filtered token lists.
# bigram_mod, trigram_mod and stop_words are deliberately left in the namespace.
del bigram, trigram, data_words_nostops

In [None]:
# The two main inputs to the LDA topic model are the dictionary (id2word) and the corpus.
# Create the dictionary: one id <-> word mapping built over all documents combined.
id2word = corpora.Dictionary(data_lemmatized)
# The bag-of-words corpus is built from this dictionary in the following cell.


In [None]:
# `texts` is a second name for the lemmatised documents (same list object, not a copy).
texts = data_lemmatized
# Term Document Frequency

# One bag-of-words per document: a list of (token id, frequency in that document) tuples.
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# Drop the raw token lists and the `texts` alias; `stopwords` and `nlp` are left in place.
del data_words, texts

In [None]:
%%time
# Train the 15-topic LDA model on the bag-of-words corpus.
# random_state is fixed so repeated runs give the same topics.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=15,
    random_state=100,
    update_every=1,
    chunksize=4000,
    passes=25,
    alpha='auto',
    per_word_topics=True,
)


In [None]:
# Find the optimal number of topics based on the coherence score
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LdaMallet model is trained for each topic count in
    range(start, limit, step) and scored with a c_v CoherenceModel.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive upper bound of the range)
    start : First number of topics to try
    step : Increment between tried topic counts

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics

    Notes:
    -----
    `mallet_path` is read from module scope and must be defined before calling.
    NOTE(review): it is not defined anywhere in the visible notebook, and
    `gensim.models.wrappers` only exists in gensim 3.x — confirm the setup.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Build the model from the `dictionary` argument rather than the notebook-level
        # `id2word`, so training and coherence scoring always share one vocabulary.
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        print(num_topics)  # progress indicator: topic count just evaluated

    return model_list, coherence_values

#choose model with highest coherence value

In [None]:
# Persist the trained model under ./lda_model_data.
lda_model.save("./lda_model_data/lda15top")
# To reload it later:
#optimal_model=gensim.models.ldamodel.LdaModel.load("./lda_model_data/lda15top")

In [None]:
# Save the id2word dictionary.
import pickle
# The context manager closes the file even if pickle.dump raises, which the
# manual open()/close() pair did not guarantee.
with open("./lda_model_data/id2word_dict.pickle","wb") as pickle_out:
    pickle.dump(id2word, pickle_out)

In [None]:
# Save the bag-of-words corpus.
# The context manager closes the file even if pickle.dump raises, which the
# manual open()/close() pair did not guarantee.
with open("./lda_model_data/corpus_list.pickle","wb") as pickle_out:
    pickle.dump(corpus, pickle_out)

In [None]:
# Compute the c_v coherence score of the trained model against the lemmatised texts.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Keep the unformatted topic listing in model_topics, then print the topics
# with 15 words each.
model_topics = lda_model.show_topics(formatted=False)
print(lda_model.print_topics(num_words=15))

In [None]:
# Topic distribution for the second document (corpus[1]); minimum_probability=0.0
# is passed so that low-probability topics are not filtered out.
lda_model.get_document_topics(corpus[1], minimum_probability=0.0)

In [None]:
# For unseen documents: the full vocabulary of the dictionary as a list of words.
words=list(id2word.values())

In [None]:
# Build the per-article topic-vector dataframe, starting from the saved model.
# NOTE(review): the repeated imports suggest this part is meant to be runnable
# from a fresh kernel using only the saved artefacts — confirm.
import gensim
optimal_model=gensim.models.ldamodel.LdaModel.load("./lda_model_data/lda15top")
import pickle

In [None]:
# Load the pickled bag-of-words corpus from ./lda_model_data.
# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files this project produced.
with open("./lda_model_data/corpus_list.pickle", 'rb') as file:
    corpus = pickle.load(file)

In [None]:
# Load the pickled id2word dictionary from ./lda_model_data.
# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files this project produced.
with open("./lda_model_data/id2word_dict.pickle", 'rb') as file:
    id2word = pickle.load(file)

In [None]:
%%time
# Topic vector for every article: one {topic id: probability} dict per document.
# minimum_probability=0.0 is passed so that no topic is filtered out.
# This list is later turned into a dataframe and merged with each article row.
vec = [
    dict(optimal_model.get_document_topics(bow, minimum_probability=0.0))
    for bow in corpus
]

In [None]:
# Preview the first few topic vectors only; evaluating the bare name asks the
# notebook to render one dict per article for the whole corpus.
vec[:5]

In [None]:
import glob
import pandas as pd
# Glob pattern for the article workbooks: Articlespk holds the article content for
# every month, keyed by article id.
path="./Articlespk/*.xlsx"
# NOTE(review): the rows loaded from these files are matched to the topic vectors
# by position, so the file order must be the same as when the corpus was built —
# glob.glob makes no ordering guarantee; confirm.
files = glob.glob(path)

In [None]:
# Reload every workbook and stack them into one frame (all articles with article id).
# The per-file frames are collected in a list and concatenated once:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling it
# inside the loop re-copies the accumulated frame on every iteration.
frames = []
for f in files:
    df = pd.read_excel(f)
    print(len(df))  # number of rows contributed by this workbook
    frames.append(df)
if not frames:
    # pd.concat([]) raises an unhelpful "No objects to concatenate"; fail clearly instead.
    raise FileNotFoundError(f"no workbooks matched {path!r}")
all_articles = pd.concat(frames, ignore_index=True, sort=True)


In [None]:
# One row per entry of vec; the dict keys (topic ids) become the columns.
df_vec=pd.DataFrame(vec)

In [None]:
# Number of article rows loaded.
len(all_articles)

In [None]:
id_topics.head(89000)

In [None]:
id_topics.isna()

In [None]:
id_topics=all_articles.join(df_vec)

In [None]:
# Persist the articles-with-topic-vectors frame to the working directory.
id_topics.to_pickle("id_topics.pickle")

In [None]:
# Save bigram_model, trigram_model, corpus and id2word_dict alongside the model so that new, unseen documents can be processed (covered in lda_unseen_file).