In [1]:
import numpy as np
import pandas as pd
import string
import logging
#import pyLDAvis.gensim
import json
import warnings
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here to enhance clarity
import gensim
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from numpy import array
import re

In [2]:
from packages import *
# Load the incident extracts used by this notebook.
# NOTE(review): create_incident_data / create_sr_data are not defined in the
# cells shown here; presumably they come from `from packages import *` — confirm.
incident_ci_data,incident_data=create_incident_data()
# problems_ci_data,problems_data=create_problem_data()
# change_ci_data,change_data=create_change_data()
# incident_ci_problem_mapped_df,incident_problem_mapped_df=create_incident_problem_data()
# sr_ci_data is loaded here but not referenced by any of the cells shown below.
sr_ci_data=create_sr_data()

In [3]:
# Keep only the short descriptions of incidents whose CI name is
# 'IMDM-Equity-MDM', as a one-column frame with repeated texts dropped.
is_imdm_incident = incident_ci_data['Incident CI Name'] == 'IMDM-Equity-MDM'
imdm_descriptions = incident_ci_data[is_imdm_incident]['Incident Short Description']
df = pd.DataFrame(
    imdm_descriptions,
    columns=['Incident Short Description'],
).drop_duplicates()

In [4]:
# Snapshot of the descriptions before the cleaning cells below overwrite
# the column in `df`.
df_copy = df.copy()

In [5]:
# Number of distinct descriptions in the untouched snapshot.
df_copy['Incident Short Description'].nunique()

4047

In [6]:
# Sanity check: the working frame holds a single text column.
df.columns

Index(['Incident Short Description'], dtype='object')

In [7]:
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    All other characters (letters, digits, whitespace) are kept in their
    original order.
    """
    kept_chars = []
    for char in text:
        if char in string.punctuation:
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

In [8]:
# Strip punctuation from every short description (overwrites the column in place).
df['Incident Short Description'] = df['Incident Short Description'].apply(remove_punctuation)

In [9]:
# Preview the descriptions after punctuation removal.
df.head()

Unnamed: 0,Incident Short Description
19,DBI Dropped IDs
30,Europe Corporate Action IPO Effective 01st Ja...
62,US Corporate Action IPO Effective 1st Jan 2020
64,Financial Calendar New Holiday Centres
75,OTHERTracerReplications alert


In [10]:
# Normalise the description text in three steps:
#   1. lower-case everything,
#   2. drop every run of digits,
#   3. drop stand-alone one- and two-letter words.
# `regex=True` is passed explicitly: since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, which would silently
# leave digits and short words in place. The patterns are raw strings so that
# `\d` and `\b` are never interpreted as Python string escapes.
df['Incident Short Description'] = (
    df['Incident Short Description']
    .str.lower()
    .str.replace(r'\d+', '', regex=True)
    .str.replace(r'\b(\w{1,2})\b', '', regex=True)
)

In [11]:
# Preview the descriptions after lower-casing and digit / short-word removal.
df.head()

Unnamed: 0,Incident Short Description
19,dbi dropped ids
30,europe corporate action ipo effective jan
62,corporate action ipo effective jan
64,financial calendar new holiday centres
75,othertracerreplications alert


In [12]:
import gensim
from gensim.utils import simple_preprocess

In [13]:
#tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether.
def sent_to_words(sentences):
    """Lazily yield one token list per sentence.

    Each item is coerced with ``str`` and tokenized by
    ``gensim.utils.simple_preprocess``. ``deacc=True`` asks gensim to strip
    accent marks from the tokens; punctuation is discarded by the tokenizer
    itself, not by ``deacc``.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )
# Materialise the cleaned descriptions as a plain list and tokenize all of them.
data = df['Incident Short Description'].tolist()
data_words = list(sent_to_words(data))
# Show the first five token lists as a quick check.
print(data_words[:5])

[['dbi', 'dropped', 'ids'], ['europe', 'corporate', 'action', 'ipo', 'effective', 'jan'], ['corporate', 'action', 'ipo', 'effective', 'jan'], ['financial', 'calendar', 'new', 'holiday', 'centres'], ['alert']]


In [14]:
#df['final_text'] = df.final_text.str.replace(r'\b(\w{1,2})\b', '')

In [15]:
# Fit phrase-detection models on the tokenized descriptions.
# The bigram model is trained on the raw token lists; the trigram model is
# trained on the same documents after bigram merging.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
# (the Phraser wrappers are what make_bigrams / make_trigrams apply to documents).
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [16]:
from nltk.corpus import stopwords
# English stop-word list from NLTK, used by remove_stopwords below.
# Requires the NLTK 'stopwords' corpus to be installed locally
# (e.g. via nltk.download('stopwords')).
stop_words = stopwords.words('english')
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return one token list per document with the stop words filtered out.

    Each document is coerced with ``str`` and re-tokenized with
    ``simple_preprocess`` before filtering, exactly as before; tokens that
    appear in the module-level ``stop_words`` collection are dropped.
    """
    # `stop_words` is a list, so `in` is a linear scan per token. Build a set
    # once per call to get constant-time lookups with identical results.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]
def make_bigrams(texts):
    """Apply the fitted bigram phraser (``bigram_mod``) to every tokenized document."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs
def make_trigrams(texts):
    """Apply the bigram phraser and then the trigram phraser to every tokenized document."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp`` (looked up at call time, so it only
    has to exist before this function is called). The result is one list of
    lemmas per input document, in input order.
    """
    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(sent) for sent in texts]


In [17]:
import spacy
# Remove Stop Words
# (remove_stopwords filters against the NLTK English list held in `stop_words`)
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
# (only the bigram phraser is applied; make_trigrams is not called in the cells shown)
data_words_bigrams = make_bigrams(data_words_nostops)


In [18]:
# Load spaCy's small English pipeline with the parser and NER components
# disabled; the remaining components stay enabled.
# One-time setup if the model is missing: python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])


In [19]:
# Lemmatize the bigram-merged documents, keeping nouns, adjectives, verbs and adverbs.
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [20]:
import gensim.corpora as corpora
# The lemmatized documents are the modelling texts.
texts = data_lemmatized
# Token <-> integer id mapping built from those texts.
id2word = corpora.Dictionary(texts)
# Bag-of-words corpus: one doc2bow result per document, in document order.
corpus = list(map(id2word.doc2bow, texts))



In [None]:
# Candidate topic counts 1..29. The metric cells below use num_topics[:-1],
# so the last candidate is fitted here but dropped from the comparison.
num_keywords = 15
num_topics = list(range(1, 30))

LDA_models = {}
LDA_topics = {}
for k in num_topics:
    # One model per candidate topic count, all with the same hyper-parameters and seed.
    model = LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=k,
        update_every=1,
        chunksize=len(corpus),
        passes=20,
        alpha='auto',
        random_state=42,
    )
    LDA_models[k] = model

    # Unformatted topics are (topic_id, [(word, weight), ...]) pairs;
    # keep only the words of each topic.
    shown_topics = model.show_topics(num_topics=k, num_words=num_keywords, formatted=False)
    LDA_topics[k] = [[term for term, _weight in terms] for _topic_id, terms in shown_topics]


In [None]:
def jaccard_similarity(topic_1, topic_2):
    """
    Derives the Jaccard similarity of two topics

    Jaccard similarity:
    - A statistic used for comparing the similarity and diversity of sample sets
    - J(A,B) = |A ∩ B| / |A ∪ B|
    - Goal is low Jaccard scores for coverage of the diverse elements

    Parameters:
    - topic_1, topic_2: iterables of hashable items (here: topic keywords);
      duplicates within one topic are ignored.

    Returns:
    - float in [0.0, 1.0]; 0.0 when both topics are empty (the ratio is
      undefined there, and previously raised ZeroDivisionError).
    """
    words_1 = set(topic_1)
    words_2 = set(topic_2)
    union = words_1 | words_2

    # Guard the 0/0 case instead of crashing on two empty topics.
    if not union:
        return 0.0

    return float(len(words_1 & words_2)) / float(len(union))


In [None]:
# For every candidate topic count, compare each of its topics with each topic
# of the next candidate: LDA_stability[k] is a (topics of k) x (topics of next k)
# matrix of Jaccard similarities.
LDA_stability = {}
for current_k, next_k in zip(num_topics[:-1], num_topics[1:]):
    LDA_stability[current_k] = [
        [jaccard_similarity(current_topic, next_topic) for next_topic in LDA_topics[next_k]]
        for current_topic in LDA_topics[current_k]
    ]

# Average overlap per candidate (the last candidate has no successor to compare with).
mean_stabilities = [np.array(LDA_stability[k]).mean() for k in num_topics[:-1]]

In [None]:
# c_v coherence of every candidate model except the last one, aligned with mean_stabilities.
coherences = []
for k in num_topics[:-1]:
    coherence_model = CoherenceModel(
        model=LDA_models[k],
        texts=data_lemmatized,
        dictionary=id2word,
        coherence='c_v',
    )
    coherences.append(coherence_model.get_coherence())

In [None]:
# Score each candidate as coherence minus mean topic overlap and pick the best.
# Only the first num_keywords - 1 candidates are considered (limit topic
# numbers to the number of keywords). The count is additionally capped at the
# number of candidates actually evaluated, so changing num_keywords or the
# sweep range independently cannot index past the metric lists.
n_candidates = min(num_keywords - 1, len(coherences), len(mean_stabilities))
coh_sta_diffs = [coherences[i] - mean_stabilities[i] for i in range(n_candidates)]
coh_sta_max = max(coh_sta_diffs)
coh_sta_max_idxs = [i for i, j in enumerate(coh_sta_diffs) if j == coh_sta_max]
ideal_topic_num_index = coh_sta_max_idxs[0] # choose less topics in case there's more than one max
ideal_topic_num = num_topics[ideal_topic_num_index]


In [None]:
# NOTE(review): `plt` and `sns` are not imported in the cells shown here;
# presumably they are provided by `from packages import *` — confirm.
candidate_counts = num_topics[:-1]
metric_peak = max(max(mean_stabilities), max(coherences))

plt.figure(figsize=(20,10))
ax = sns.lineplot(x=candidate_counts, y=mean_stabilities, label='Average Topic Overlap')
ax = sns.lineplot(x=candidate_counts, y=coherences, label='Topic Coherence')

# Mark the selected topic count and shade one count either side of it.
ax.axvline(x=ideal_topic_num, label='Ideal Number of Topics', color='black')
ax.axvspan(xmin=ideal_topic_num - 1, xmax=ideal_topic_num + 1, alpha=0.5, facecolor='grey')

# Leave 10% headroom above the highest metric value.
y_max = metric_peak + (0.10 * metric_peak)
ax.set_ylim([0, y_max])
ax.set_xlim([1, num_topics[-1]-1])

ax.set_title('Model Metrics per Number of Topics', fontsize=25)
ax.set_ylabel('Metric Level', fontsize=20)
ax.set_xlabel('Number of Topics', fontsize=20)
plt.legend(fontsize=20)
plt.show()

In [None]:
# The ideal number of topics maximizes coherence while minimizing the topic
# overlap measured by Jaccard similarity.
# In this case it looks like we'd be safe choosing topic numbers around ideal_topic_num.

ideal_topic_num

In [None]:
#num_topics = ideal_topic_num



# Fit the final model with the selected topic count, using the same
# hyper-parameters and seed as the sweep above.
# NOTE(review): LDA_models[ideal_topic_num] was already fitted with these exact
# arguments during the sweep; presumably it could be reused instead of
# retraining — confirm before changing.
LDA_models_final = LdaModel(corpus=corpus,
                             id2word=id2word,
                             num_topics=ideal_topic_num,
                             update_every=1,
                             chunksize=len(corpus),
                             passes=20,
                             alpha='auto',
                             random_state=42)

    

In [None]:
# Top keywords of every topic of the final model, unformatted:
# a list of (topic_id, [(word, weight), ...]) pairs.
# Uses num_keywords (currently 15) rather than a second hard-coded 15, so the
# final topics always show the same number of keywords as the sweep.
shown_topics_final = LDA_models_final.show_topics(
    num_topics=ideal_topic_num,
    num_words=num_keywords,
    formatted=False,
)

In [None]:
# Keyword lists of the FINAL model's topics.
# Reads shown_topics_final; the previous code iterated `shown_topics`, which is
# the leftover variable from the last iteration of the sweep loop (the
# largest candidate model), not the final model.
LDA_topics_final = [[word[0] for word in topic[1]] for topic in shown_topics_final]

In [None]:
# Dominant topic per document: indexing the model with a bag-of-words yields
# (topic_id, probability) pairs; keep the id with the highest probability.
# `max` returns the first maximal pair, the same element the previous
# descending stable sort put first, without sorting the whole list and without
# the no-op str(...).zfill(3) round trip.
df['main_topic'] = [
    int(max(LDA_models_final[bow], key=lambda pair: pair[1])[0])
    for bow in corpus
]


In [None]:
# Split the final topics into parallel lists: topic ids and their (word, weight) lists.
tn = [topic_id for topic_id, _terms in shown_topics_final]
tnames = [terms for _topic_id, terms in shown_topics_final]

In [None]:
# Preview the cleaned descriptions together with their assigned main_topic.
df.head()

In [None]:
# One row per topic: its id and a label made of its keywords separated by spaces.
# The label is built directly from the words of each (word, weight) pair.
# The previous approach stringified the whole list and then stripped
# punctuation and digits, which
#   - left the digits in place on pandas >= 2.0 (str.replace without regex=True
#     is a literal replace),
#   - removed the '_' inside merged bigram tokens, and
#   - could leave residue from the float representation of the weights.
df_tnames = pd.DataFrame({
    'topic_num': tn,
    'topic_names': [
        ' '.join(str(word) for word, _weight in terms)
        for terms in tnames
    ],
})

In [None]:
# Attach each description's topic label by matching main_topic to topic_num.
# Inner join: rows whose main_topic has no entry in df_tnames are dropped.
df_final = df.merge(df_tnames,left_on='main_topic',right_on='topic_num',how='inner')

In [None]:
# Drop the join key that duplicates topic_num.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError because the column
# is already gone.
df_final = df_final.drop(columns=['main_topic'], errors='ignore')

In [None]:
# Write the labelled descriptions to the current working directory.
# With to_csv's default index=True the row index is written as the first, unnamed column.
df_final.to_csv('IMDM_Topics.csv')

In [None]:
# How many rows share each cleaned description text.
df_final['Incident Short Description'].value_counts()