In [52]:
import os

import numpy as np
import pandas as pd

import re
import string

# NLTK Stop words
import nltk
from nltk.corpus import stopwords
from nltk import TweetTokenizer
from nltk.stem import WordNetLemmatizer

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plots
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline


In [4]:
path = '\zone\text_mining\document_clustering'
fileList = os.listdir(path)
for i in fileList:
    file = open(os.path.join(path+'/'+ i), 'r')
data = file.readlines()

data = [re.sub('\s+',' ', sent) for sent in data]

data = [re.sub('\'', '', sent) for sent in data]

data = [x for x in data if x != ' ']
print(data)

  data = [re.sub('\s+',' ', sent) for sent in data]


['When organizations decide to shift their workloads, data and processes across multiple on-premises, hosted, private, and public cloud services, there will be a need for a new approach. This new approach leads to hybrid multi-cloud cloud management. But this approach requires uniform solutions in terms of billing and provisioning, access control, cost control, and performance analysis and capacity management. ', 'A hybrid multi-cloud architecture is emerging within nearly all enterprises. IT organizations are no longer limited to managing data-centers and a few hosted and managed services providers. Needy lines-of-business teams and impatient IT developers have procured SaaS, IaaS, and PaaS cloud services to overcome resource constraints. Now many enterprises IT structures are composed of multi-clouds. ', 'In the IT industry, the tools and technologies needed to craft and manage hybrid multi-clouds architecture are fragmented. Multi-clouds and hybrid clouds bring workload and infrastr

In [131]:
val = ','.join(data)
stopwords_punct = set(stopwords.words('english')).union(string.punctuation).union('-')

sentences = nltk.sent_tokenize(val)
sents_stopwords_rm = []
for sent in sentences:
    sents_stopwords_rm.append(' '.join(w for w in nltk.word_tokenize(sent) if w.lower() not in stopwords_punct))
        
data_tokens_no_stopwords = [nltk.word_tokenize(t) for t in sents_stopwords_rm]
data_tokens_no_stopwords

[['organizations',
  'decide',
  'shift',
  'workloads',
  'data',
  'processes',
  'across',
  'multiple',
  'on-premises',
  'hosted',
  'private',
  'public',
  'cloud',
  'services',
  'need',
  'new',
  'approach'],
 ['new', 'approach', 'leads', 'hybrid', 'multi-cloud', 'cloud', 'management'],
 ['approach',
  'requires',
  'uniform',
  'solutions',
  'terms',
  'billing',
  'provisioning',
  'access',
  'control',
  'cost',
  'control',
  'performance',
  'analysis',
  'capacity',
  'management'],
 ['hybrid',
  'multi-cloud',
  'architecture',
  'emerging',
  'within',
  'nearly',
  'enterprises'],
 ['organizations',
  'longer',
  'limited',
  'managing',
  'data-centers',
  'hosted',
  'managed',
  'services',
  'providers'],
 ['Needy',
  'lines-of-business',
  'teams',
  'impatient',
  'developers',
  'procured',
  'SaaS',
  'IaaS',
  'PaaS',
  'cloud',
  'services',
  'overcome',
  'resource',
  'constraints'],
 ['many', 'enterprises', 'structures', 'composed', 'multi-clouds'],

In [117]:
wordnet_lemmatizer = WordNetLemmatizer()
data_lemmatized = []
for w in data_tokens_no_stopwords:
 data_lemmatized.append([word for word in map(wordnet_lemmatizer.lemmatize, w)])
data_lemmatized

[['organization',
  'decide',
  'shift',
  'workload',
  'data',
  'process',
  'across',
  'multiple',
  'on-premises',
  'hosted',
  'private',
  'public',
  'cloud',
  'service',
  'need',
  'new',
  'approach'],
 ['new', 'approach', 'lead', 'hybrid', 'multi-cloud', 'cloud', 'management'],
 ['approach',
  'requires',
  'uniform',
  'solution',
  'term',
  'billing',
  'provisioning',
  'access',
  'control',
  'cost',
  'control',
  'performance',
  'analysis',
  'capacity',
  'management'],
 ['hybrid',
  'multi-cloud',
  'architecture',
  'emerging',
  'within',
  'nearly',
  'enterprise'],
 ['organization',
  'longer',
  'limited',
  'managing',
  'data-centers',
  'hosted',
  'managed',
  'service',
  'provider'],
 ['Needy',
  'lines-of-business',
  'team',
  'impatient',
  'developer',
  'procured',
  'SaaS',
  'IaaS',
  'PaaS',
  'cloud',
  'service',
  'overcome',
  'resource',
  'constraint'],
 ['many', 'enterprise', 'structure', 'composed', 'multi-clouds'],
 ['industry',
  '

In [134]:
# Create Corpus
dictionary = corpora.Dictionary(data_lemmatized)
corpus = [dictionary.doc2bow(text) for text in data_lemmatized]
print(corpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)], [(1, 1), (2, 1), (8, 1), (17, 1), (18, 1), (19, 1), (20, 1)], [(1, 1), (19, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1)], [(17, 1), (20, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1)], [(5, 1), (10, 1), (14, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1)], [(2, 1), (14, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1)], [(35, 1), (56, 1), (57, 1), (58, 1), (59, 1)], [(17, 1), (33, 1), (58, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1)], [(2, 2), (8, 1), (16, 1), (17, 1), (19, 1), (65, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 1)], [(2, 3), (8, 1), (9, 1), (11, 1), (12, 1), (13, 1), (14, 1), (17, 1), (19, 1), (26, 1), (27, 1), (54, 1), (63, 1), (73, 1), (74, 1), (75, 1), (7

In [122]:
# Validate the term frequency
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('across', 1),
  ('approach', 1),
  ('cloud', 1),
  ('data', 1),
  ('decide', 1),
  ('hosted', 1),
  ('multiple', 1),
  ('need', 1),
  ('new', 1),
  ('on-premises', 1),
  ('organization', 1),
  ('private', 1),
  ('process', 1),
  ('public', 1),
  ('service', 1),
  ('shift', 1),
  ('workload', 1)]]

In [136]:
# Build LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=5, 
                                           passes=10,
                                           per_word_topics=True)

In [137]:
# Print the topics and keywords in each topic
ldamodel.print_topics()

[(0,
  '0.037*"on-premise" + 0.032*"deployed" + 0.032*"application" + 0.027*"cloud" + 0.022*"data" + 0.022*"AC1" + 0.021*"component" + 0.017*"center" + 0.017*"portfolio" + 0.017*"re-architected"'),
 (1,
  '0.038*"cloud" + 0.023*"architecture" + 0.023*"environment" + 0.023*"data" + 0.023*"hosted" + 0.016*"organization" + 0.016*"new" + 0.016*"workload" + 0.016*"service" + 0.016*"technology"'),
 (2,
  '0.015*"enterprise" + 0.015*"multi-clouds" + 0.015*"healthy" + 0.015*"routed" + 0.015*"available" + 0.015*"traffic" + 0.015*"another" + 0.015*"many" + 0.015*"instance" + 0.015*"composed"'),
 (3,
  '0.028*"architecture" + 0.021*"hybrid" + 0.021*"cost" + 0.021*"performance" + 0.021*"cloud" + 0.015*"environment" + 0.015*"control" + 0.015*"management" + 0.015*"service" + 0.015*"multi-cloud"'),
 (4,
  '0.038*"multi-cloud" + 0.024*"service" + 0.024*"hybrid" + 0.024*"architecture" + 0.018*"migration" + 0.017*"component" + 0.016*"cloud" + 0.016*"approach" + 0.010*"AC2" + 0.009*"two"')]

In [133]:
print('Perplexity: ', ldamodel.log_perplexity(corpus))

Perplexity:  -5.91028138708741


In [129]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, id2word)
vis

In [None]:

pyLDAvis.save_html(vis, '\zone\text_mining\document_clustering\lda.html')

In [127]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamodel, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,3.0,0.9864,"cloud, service, architecture, application, dat...",When organizations decide to shift their workl...
1,1,0.0,0.9641,"cloud, hybrid, multi-cloud, architecture, serv...",A hybrid multi-cloud architecture is emerging ...
2,2,1.0,0.9832,"center, data, architecture, environment, multi...","In the IT industry, the tools and technologies..."
3,3,4.0,0.9647,"on-premise, AC1, architecture, deployed, two, ...",Organizations plan to migrate their on-premise...
4,4,1.0,0.9723,"center, data, architecture, environment, multi...",Users are widely distributed geographically wh...
5,5,3.0,0.9835,"cloud, service, architecture, application, dat...",Facing regulations limit in particular countri...
6,6,2.0,0.9512,"multi-cloud, on-premise, hybrid, migration, tw...",An environment where public clouds are used wi...
7,7,3.0,0.9772,"cloud, service, architecture, application, dat...","A cloud-based application is not resilient, wh..."
8,8,0.0,0.9803,"cloud, hybrid, multi-cloud, architecture, serv...","According to the above challenges, Iâ€™ve intr..."
9,9,3.0,0.9931,"cloud, service, architecture, application, dat...",1. Multi-Application Rebinding
