In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import re
import string

import spacy

import gensim
from gensim import corpora
from nltk.stem import PorterStemmer

# libraries for visualization
import pyLDAvis
import PyPDF2
import pyLDAvis.gensim_models as gensimvis
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
%matplotlib inline
# None (not -1) is the supported way to disable column-width truncation;
# -1 is deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

  if LooseVersion(mpl.__version__) >= "3.0":
  other = LooseVersion(other)


In [2]:
from sklearn.pipeline import Pipeline
#from sklearn.externals import joblib
import joblib

In [3]:
"""CLEANING"""

def clean_text(t):
    """Strip punctuation, numeric tokens and short tokens from each document.

    Parameters
    ----------
    t : iterable
        Documents to clean. Every item is converted with ``str()``.

    Returns
    -------
    list of str
        One cleaned, lower-cased string per input document, in input order.
        ASCII punctuation (``string.punctuation``) is deleted, then tokens
        that are purely digits or have three characters or fewer are dropped
        and the remaining tokens are joined with single spaces. An empty
        input yields an empty list.
    """
    # string.punctuation contains '|', so each document has to be cleaned on
    # its own: joining all documents first and splitting on '|' after the
    # punctuation was stripped always collapsed the input into one document.
    table = str.maketrans('', '', string.punctuation)
    cleaned = []
    for doc in t:
        words = str(doc).translate(table).split()
        kept = [w for w in words if not w.isdigit() and len(w) > 3]
        cleaned.append(' '.join(kept).lower())
    return cleaned


In [4]:
"""REMOVING STOPWORDS"""

def remove_stopwords(t):
    """Drop English stop words from every document.

    The stop-word list is the union of spaCy's English stop words and NLTK's
    ``stopwords.words('english')``. Matching is exact and case-sensitive, so
    documents are expected to be lower-cased already.

    Parameters
    ----------
    t : iterable of str
        Documents whose tokens are separated by single spaces.

    Returns
    -------
    list of str
        One string per input document with the stop words removed; the
        remaining tokens are re-joined with single spaces.
    """
    # Function-local import: STOP_WORDS is the set that English.Defaults.stop_words
    # exposes, so the full 'en_core_web_sm' pipeline no longer has to be loaded
    # on every call just to read a static word list.
    from spacy.lang.en.stop_words import STOP_WORDS

    all_stopwords = STOP_WORDS.union(stopwords.words('english'))
    return [
        ' '.join(word for word in doc.split(' ') if word not in all_stopwords)
        for doc in t
    ]

In [5]:
"""LEMMATIZATION"""

def lemmatize(t):
    """Replace every token of every document with its spaCy lemma.

    Each document is tokenised with NLTK's ``word_tokenize``, re-joined with
    single spaces and run through the ``en_core_web_sm`` pipeline (parser and
    NER disabled).

    Parameters
    ----------
    t : iterable of str
        Documents to lemmatize.

    Returns
    -------
    list of str
        One string per input document containing the space-joined lemmas,
        in input order.
    """
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    lemmatized = []
    for doc_text in t:
        # Normalise spacing via NLTK tokenisation before handing the text to spaCy.
        parsed = nlp(' '.join(word_tokenize(doc_text)))
        lemmatized.append(' '.join(token.lemma_ for token in parsed))
    return lemmatized

In [6]:
# """SPLITTING THE LEMMATIZED TEXTS INTO TOKENS"""

# def make_tokens(text):
#     tokens = [d.split() for d in text]
#     return tokens


In [7]:
"""Making the doc_term_matrix"""
def mtrx(t):
    """Turn whitespace-separated documents into a gensim bag-of-words corpus.

    Parameters
    ----------
    t : iterable of str
        Pre-processed documents; each one is split on whitespace.

    Returns
    -------
    list
        One ``Dictionary.doc2bow`` result per document, in input order. The
        dictionary is built from the same documents and is not returned.
    """
    tokenised_docs = []
    for document in t:
        tokenised_docs.append(document.split())
    vocabulary = corpora.Dictionary(tokenised_docs)
    return [vocabulary.doc2bow(words) for words in tokenised_docs]


In [8]:
"""MODEL BUILDING"""

def lda_model_build(t, doc_term_matrix, num_topics=10, random_state=100,
                    chunksize=1000, passes=50, iterations=100):
    """Train a gensim LDA model on a bag-of-words corpus.

    Parameters
    ----------
    t : iterable of str
        The whitespace-separated documents the corpus was built from. They are
        used to rebuild the id-to-word dictionary, so they should be the same
        documents (in the same order) that produced ``doc_term_matrix``.
    doc_term_matrix : list
        Bag-of-words corpus passed to the model as ``corpus``.
    num_topics, random_state, chunksize, passes, iterations : int, optional
        Forwarded unchanged to ``gensim.models.ldamodel.LdaModel``. The
        defaults reproduce the values that were previously hard-coded.

    Returns
    -------
    gensim.models.ldamodel.LdaModel
        The trained model.
    """
    tokens = [d.split() for d in t]
    dictionary = corpora.Dictionary(tokens)
    return gensim.models.ldamodel.LdaModel(
        corpus=doc_term_matrix,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=random_state,
        chunksize=chunksize,
        passes=passes,
        iterations=iterations,
    )
    

In [9]:
def print_topics(lda_model):
    """Print the value returned by ``lda_model.print_topics()``; returns None."""
    topic_summary = lda_model.print_topics()
    print(topic_summary)

In [10]:
"""SHOW THE DOMINANT TOPIC WITH THE KEYWORDS"""

def format_topics_sentences(ldamodel, corpus, texts):
    """Tabulate the dominant topic of every document.

    Parameters
    ----------
    ldamodel : object
        Model that supports ``ldamodel[corpus]`` (yielding, per document, a
        sequence of ``(topic_id, probability)`` pairs) and
        ``ldamodel.show_topic(topic_id)`` (yielding ``(word, weight)`` pairs).
    corpus : list
        Bag-of-words corpus to score.
    texts : sequence of str
        Original documents, placed in the ``Text`` column by position.

    Returns
    -------
    pandas.DataFrame
        Columns ``Document_No``, ``Dominant_Topic``, ``Topic_Perc_Contrib``
        (rounded to 4 decimals), ``Keywords`` (comma-separated topic words)
        and ``Text``.
    """
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every row.
    records = []
    for doc_topics in ldamodel[corpus]:
        if not doc_topics:
            # Keep a placeholder so later rows stay aligned with `texts`.
            records.append((float('nan'), float('nan'), ''))
            continue
        # First pair with the highest probability == head of a stable descending sort.
        topic_num, prop_topic = max(doc_topics, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    df_dominant_topic = sent_topics_df.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
    return df_dominant_topic


# df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=doc_term_matrix, texts=for_end)

# # Format
# df_dominant_topic = df_topic_sents_keywords.reset_index()
# df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# # Show
# df_dominant_topic.head(10)

In [12]:
# None (not -1) is the supported way to disable column-width truncation;
# -1 is deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
# NOTE: chnge_to_list is defined in the cell labelled In [11], which appears
# further down in this file; that cell has to be run before this one.
sentence = "Covid 19 pandemic  has had deep economic repercussions and has triggered  a deep economic crisis It's important to think about three phases as it were of how this crisis is going to play out in the economic domain the first is what began to happen in about February 2020 when we saw China shutdown subsequently with a lot of people cancelling orders for textile exports from India which was kind of a collapse in world trade that would impact both  production and demand domestically so it means that your exports are lower but it also means that you're not able to produce things that require. The consequences of the of the lockdown in order to control the virus sort of put the economy in the freeze and definitely there is going to be a very large impact on production and incomes going forward and we have to deal with it as a multi-step problem because it's not just an impact on the entire economy takes there's layers to it. MSME’s have been in deep problems ever since the monetization so this was basically the final nail in the coffin and unless we take a few steps/measures quickly a lot MSME are simply going to die."
ans = chnge_to_list(sentence)
ans

[(0, '0.016*"order" + 0.016*"nail" + 0.016*"problem" + 0.016*"play" + 0.016*"phase" + 0.016*"people" + 0.016*"pandemic" + 0.016*"production" + 0.016*"monetization" + 0.016*"mean"'), (1, '0.016*"order" + 0.016*"nail" + 0.016*"problem" + 0.016*"play" + 0.016*"phase" + 0.016*"people" + 0.016*"pandemic" + 0.016*"production" + 0.016*"monetization" + 0.016*"mean"'), (2, '0.037*"impact" + 0.037*"economic" + 0.037*"deep" + 0.025*"order" + 0.025*"crisis" + 0.025*"production" + 0.025*"problem" + 0.025*"export" + 0.025*"mean" + 0.025*"economy"'), (3, '0.016*"deep" + 0.016*"economic" + 0.016*"impact" + 0.016*"mean" + 0.016*"production" + 0.016*"economy" + 0.016*"export" + 0.016*"msme" + 0.016*"crisis" + 0.016*"problem"'), (4, '0.016*"order" + 0.016*"nail" + 0.016*"problem" + 0.016*"play" + 0.016*"phase" + 0.016*"people" + 0.016*"pandemic" + 0.016*"production" + 0.016*"monetization" + 0.016*"mean"'), (5, '0.016*"order" + 0.016*"nail" + 0.016*"problem" + 0.016*"play" + 0.016*"phase" + 0.016*"people"

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,2,0.9886,"impact, economic, deep, order, crisis, production, problem, export, mean, economy",Covid 19 pandemic has had deep economic repercussions and has triggered a deep economic crisis It's important to think about three phases as it were of how this crisis is going to play out in the economic domain the first is what began to happen in about February 2020 when we saw China shutdown subsequently with a lot of people cancelling orders for textile exports from India which was kind of a collapse in world trade that would impact both production and demand domestically so it means that your exports are lower but it also means that you're not able to produce things that require. The consequences of the of the lockdown in order to control the virus sort of put the economy in the freeze and definitely there is going to be a very large impact on production and incomes going forward and we have to deal with it as a multi-step problem because it's not just an impact on the entire economy takes there's layers to it. MSME’s have been in deep problems ever since the monetization so this was basically the final nail in the coffin and unless we take a few steps/measures quickly a lot MSME are simply going to die.


In [None]:
# a = chnge_to_list(text)
# b = clean_text(a)
# c = remove_stopwords(b)
# d = lemmatize(c)
# e = make_tokens(d)
# f = mtrx(e)
# g = lda_model_build(f)
# h = format_topics_sentences(g)

In [None]:
# pipeline = Pipeline([('list',chnge_to_list(text)),
#                     ('clean',clean_text(chnge_to_list(text))),
#                     ('remove stopwords',remove_stopwords(clean_text(chnge_to_list(text)))),
#                     ('lematize',lemmatize(text)),
#                     ('token',make_tokens(text)),
#                     ('mtrx',mtrx(tokens)),
#                     ('model',lda_model_build(doc_term_matrix)),
#                     ('dominant',format_topics_sentences(ldamodel=lda_model, corpus=doc_term_matrix, texts=for_end))])

In [11]:
def chnge_to_list(text):
    """Run the whole topic-modelling pipeline on a '|'-separated string.

    The text is split on '|' into documents, which are passed through
    ``clean_text``, ``remove_stopwords``, ``lemmatize`` and ``remove_stopwords``
    again. The result feeds ``mtrx`` and ``lda_model_build``; the model's
    topics are printed and the table produced by ``format_topics_sentences``
    (with the raw documents as ``texts``) is returned.
    """
    documents = text.split('|')
    cleaned = clean_text(documents)
    lemmas = lemmatize(remove_stopwords(cleaned))
    # Second stop-word pass, this time over the lemmatized text.
    prepared = remove_stopwords(lemmas)
    bow_corpus = mtrx(prepared)
    model = lda_model_build(prepared, bow_corpus)
    print_topics(model)
    return format_topics_sentences(ldamodel=model, corpus=bow_corpus, texts=documents)