In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Libraries

In [None]:
import pandas 
import pickle
import nltk

import pyLDAvis
import pyLDAvis.gensim
import gensim
import gensim.corpora as corpora
from gensim.models.ldamodel import LdaModel

from gensim.models import CoherenceModel
import spacy
from pprint import pprint

import seaborn as sns
import matplotlib.pyplot as plt
from gensim.utils import simple_preprocess

# Fetch the NLTK stop-word corpus; it is read later via nltk.corpus.stopwords.
nltk.download('stopwords')


In [None]:
# Switch the working directory; later relative paths in this notebook resolve against it.
%cd /kaggle/working
from IPython.display import FileLink
# NOTE(review): this CSV is only written by the df.to_csv(...) cell near the end of the
# notebook, so on a fresh "Run All" the link is created before the file exists.
FileLink(r'topic_wise_pandas_related_sentences.csv')

# Preprocessing

In [None]:

def process_words(texts, stop_words,nlp,bigram_mod, trigram_mod, allowed_tags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Tokenize, phrase-merge and lemmatize a collection of documents.

    Args:
        texts: iterable of documents; each one is passed through ``str()`` before tokenizing.
        stop_words: collection of tokens to drop, both before and after lemmatization.
        nlp: callable applied to a space-joined token string; its result is iterated for
            tokens exposing ``lemma_`` and ``pos_`` (a spaCy pipeline in this notebook).
        bigram_mod: phrase model indexed with a token list (``bigram_mod[tokens]``).
        trigram_mod: phrase model indexed with the bigram-merged token list.
        allowed_tags: part-of-speech tags whose lemmas are kept. Never mutated here.

    Returns:
        A list with one list of tokens per input document.
    """
    # Build the lookup once: membership in a set is O(1), whereas the incoming list was
    # scanned linearly for every token in both filtering passes.
    stop_set = set(stop_words)

    def _tokenize(doc):
        # lowercase, strip accents, drop tokens shorter than 3 characters and stop words
        return [word for word in simple_preprocess(str(doc), deacc=True, min_len=3)
                if word not in stop_set]

    # remove stopwords, short tokens and letter accents
    texts = [_tokenize(doc) for doc in texts]

    # bi-gram and tri-gram implementation
    # (bigram_mod is applied a second time inside the trigram step, on the already
    # bigram-merged tokens; kept as in the original so results do not change)
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    texts_out = []

    # implement lemmatization and filter out unwanted part of speech tags
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_tags])

    # remove stopwords and short tokens again after lemmatization; each lemma list is
    # turned into its str() form and re-tokenized, exactly as in the first pass
    texts_out = [_tokenize(doc) for doc in texts_out]

    return texts_out

In [None]:
# The path is relative, so it depends on the current working directory
# (an earlier cell runs `%cd /kaggle/working`).
df = pandas.read_csv('../input/tosem-iot-security/pandas.csv')

# One list entry per row of the `sentence` column.
data = list(df.sentence)



In [None]:


####### bi & tri-gram #############################################################
# Phrases expects an iterable of token lists. `data` holds raw sentence strings, and a
# string is iterated character by character, so training on it directly learns
# character-level "phrases" that never match the word tokens process_words feeds in.
# Tokenize first, using the same simple_preprocess settings as process_words.
data_words = [simple_preprocess(str(doc), deacc=True, min_len=3) for doc in data]

bigram = gensim.models.Phrases(data_words, min_count=20, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Frozen, lighter-weight versions used for applying the learned phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)


Data Preprocessing

In [None]:

# Load the spaCy pipeline with the parser and NER components disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# English stop words from NLTK; this list object is extended in place by a later cell.
stop_words = nltk.corpus.stopwords.words('english')

# Tokenize, phrase-merge and lemmatize every sentence, then index the resulting vocabulary.
data_ready = process_words(data,stop_words,nlp,bigram_mod,trigram_mod)
id2word = corpora.Dictionary(data_ready)
print('Total Vocabulary Size:', len(id2word))


Frequency of Words

In [None]:
# Bag-of-words representation of every preprocessed document.
corpus = [id2word.doc2bow(text) for text in data_ready]

# Sum each token's count over all documents, keyed by the token string.
dict_corpus = {}
for bow in corpus:
    for idx, freq in bow:
        token = id2word[idx]
        dict_corpus[token] = dict_corpus.get(token, 0) + freq

# One row per token, with a single `freq` column holding its corpus-wide count.
dict_df = pandas.DataFrame.from_dict(dict_corpus, orient='index', columns=['freq'])



Plot

In [None]:

# Distribution of corpus-wide token frequencies, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.distplot(dict_df['freq'], bins=100, ax=ax)


Filtering

In [None]:

# Tokens occurring more than 1500 times in total are treated as too common to keep.
extension = dict_df[dict_df.freq > 1500].index.tolist()

# Remove those tokens from the dictionary by id.
ids = [id2word.token2id[token] for token in extension]
id2word.filter_tokens(bad_ids=ids)

# Kept as the LAST expression so the notebook actually renders the top-10 table;
# as the first statement of the cell its result was silently discarded.
dict_df.sort_values('freq', ascending = False).head(10)



Most frequently used words removal

In [None]:
# add high frequency words to stop words list
# (extend mutates the list in place, so re-running this cell appends the same words again)
stop_words.extend(extension)
# rerun the process_words function
data_ready = process_words(data,stop_words,nlp,bigram_mod,trigram_mod)
# recreate Dictionary
# (this replaces the dictionary object that the previous cell filtered)
id2word = corpora.Dictionary(data_ready)
print('Total Vocabulary Size:', len(id2word))





Extreme word removal

In [None]:


##extreme words
# Drop tokens that appear in fewer than 10 documents or in more than 50% of documents.
# filter_extremes modifies id2word in place.

id2word.filter_extremes(no_below=10, no_above=.5)
print('Total Vocabulary Size:', len(id2word))






Final Vocabulary

In [None]:
# Rebuild the bag-of-words corpus against the filtered dictionary.
corpus = [id2word.doc2bow(text) for text in data_ready]

# Mallet

In [None]:
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip

In [None]:
!unzip mallet-2.0.8.zip

In [None]:
# Relative path handed to gensim's LdaMallet wrapper; presumably the launcher script
# inside the archive unpacked by the unzip cell above. It resolves against the
# current working directory.
mallet_path = 'mallet-2.0.8/bin/mallet'

In [None]:
# Lists the notebook source file in the working directory; nothing later in the
# visible notebook uses the result (looks like a leftover sanity check).
ls __notebook_source__.ipynb

# Grid Searching

In [None]:
def coherence_values_computation(dictionary, corpus, texts, limit, start=2, step=3):
    """Train one LdaMallet model per candidate topic count and score each with c_v coherence.

    Args:
        dictionary: id-to-token mapping, used both to train each model and to score it.
        corpus: bag-of-words corpus passed to LdaMallet.
        texts: tokenized documents passed to CoherenceModel.
        limit: exclusive upper bound of the topic counts tried.
        start: first topic count tried.
        step: increment between successive topic counts.

    Returns:
        ``(model_list, coherence_values)``: two parallel lists, one entry per value of
        ``range(start, limit, step)``, in that order.

    Note:
        Reads the notebook-level ``mallet_path`` for the Mallet launcher location.
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        # Train with the `dictionary` argument. The original passed the notebook global
        # `id2word` here, silently ignoring whatever dictionary the caller supplied.
        model = gensim.models.wrappers.LdaMallet(
            mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(
            model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [None]:
# Define the search range once and reuse it for both the grid search and the x-axis,
# so the two cannot drift apart.
start = 10; limit = 13; step = 1
model_list, coherence_values = coherence_values_computation(
    dictionary=id2word, corpus=corpus, texts=data_ready,
    start=start, limit=limit, step=step)
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# Pass a list: ("coherence_values") is just a parenthesised string, which legend()
# iterates character by character and so labels the line "c".
plt.legend(["coherence_values"], loc='best')
plt.show()

In [None]:
# Report the coherence score obtained for each topic count tried in the grid search.
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " is having Coherence Value of", round(cv, 4))

Best result: 12 topics, with a coherence score of 0.4296.

# Model Selection

In [None]:
# Train the final 12-topic model.
# NOTE(review): this fits a new model rather than reusing the 12-topic entry of
# model_list from the grid search, and no random seed is passed, so its topics and
# coherence may differ from the grid-search result. Confirm this is intended.
ldamallet = gensim.models.wrappers.LdaMallet(
             mallet_path, corpus=corpus, num_topics=12, id2word=id2word)

In [None]:
# Set up a c_v coherence evaluation for the final model.
# NOTE(review): get_coherence() is never called on this object in the visible cells,
# so no score is actually computed or displayed here.
coherencemodel = CoherenceModel(
              model=ldamallet, texts=data_ready, dictionary=id2word, coherence='c_v')

In [None]:
# Disabled alternative: take the model at index 2 of the grid-search results
# (the 12-topic one for start=10, step=1) instead of the separately trained model.
# ldamallet = model_list[2]

In [None]:
# Unformatted topics; not referenced again in the visible cells.
new_model_topics = ldamallet.show_topics(formatted=False)
# Pretty-print the model's topics in their formatted string form.
pprint(ldamallet.print_topics())

In [None]:
# pprint is already imported in the libraries cell; this repeat import is harmless.
from pprint import pprint
# display topics
pprint(ldamallet.show_topics(formatted=False))

# Store Model

In [None]:
import pickle
# The context manager guarantees the file is flushed and closed as soon as the dump
# finishes, instead of relying on garbage collection of an anonymous file handle.
with open("ldamallet_12.p", "wb") as model_file:
    pickle.dump(ldamallet, model_file)

In [None]:
# NOTE: unpickling can execute arbitrary code; acceptable here only because the file
# is one this notebook itself writes. The context manager closes the handle promptly.
with open("./ldamallet_12.p", "rb") as model_file:
    ldamallet = pickle.load(model_file)

In [None]:
# Apply the model to the corpus; later cells treat each element as a sequence of
# (topic id, weight) pairs for one document.
tm_results = ldamallet[corpus]

In [None]:
# Dominant (topic id, weight) pair per document. max() returns the first pair with the
# highest weight, the same element the stable descending sort put at index 0, without
# sorting every document's full topic list.
corpus_topics = [max(topics, key=lambda record: record[1]) for topics in tm_results]

In [None]:
# For every topic, its 20 highest-ranked (term, weight) pairs, with weights rounded to 3 decimals.
topics = [[(term, round(wt, 3)) for term, wt in ldamallet.show_topic(n, topn=20)] for n in range(0, ldamallet.num_topics)]

# Topics

In [None]:
# Wide view: one column per topic, one row per term rank (after the transpose).
# The 20 "Term" labels match the topn=20 used when `topics` was built.
topics_df = pandas.DataFrame([[term for term, wt in topic] for topic in topics], columns = ['Term'+str(i) for i in range(1, 21)], index=['Topic '+str(t) for t in range(1, ldamallet.num_topics+1)]).T
topics_df.head()

In [None]:
# set column width
# None means "do not truncate". The previous value -1 is deprecated since pandas 1.0
# and rejected by newer pandas releases.
pandas.set_option('display.max_colwidth', None)
# One row per topic, holding its terms joined into a single comma-separated string.
# This rebinds topics_df; later cells read the 'Terms per Topic' column from it.
topics_df = pandas.DataFrame(
    [', '.join([term for term, wt in topic]) for topic in topics],
    columns=['Terms per Topic'],
    index=['Topic'+str(t) for t in range(1, ldamallet.num_topics+1)],
)
topics_df

# Gensim Conversion

In [None]:
def convertldaGenToldaMallet(mallet_model):
    """Build a gensim ``LdaModel`` from a trained Mallet wrapper model.

    Despite the function's name, the conversion goes from the Mallet model to a
    gensim model: the new ``LdaModel`` takes its ``id2word``, ``num_topics`` and
    ``alpha`` from ``mallet_model`` (with ``eta=0``), and its sufficient statistics
    are overwritten with ``mallet_model.wordtopics``.

    Args:
        mallet_model: object exposing ``id2word``, ``num_topics``, ``alpha`` and
            ``wordtopics``.

    Returns:
        The populated ``LdaModel``.
    """
    lda_kwargs = dict(
        id2word=mallet_model.id2word,
        num_topics=mallet_model.num_topics,
        alpha=mallet_model.alpha,
        eta=0,
    )
    converted = LdaModel(**lda_kwargs)
    # Copy Mallet's word-topic matrix into the state, then sync the model with that state.
    converted.state.sstats[...] = mallet_model.wordtopics
    converted.sync_state()
    return converted

In [None]:
# gensim LdaModel counterpart of the Mallet model, passed to pyLDAvis in the next cell.
ldagensim = convertldaGenToldaMallet(ldamallet)

In [None]:
# pyLDAvis.gensim is already imported in the libraries cell; this adds the short alias.
import pyLDAvis.gensim as gensimvis
# sort_topics=False is passed so pyLDAvis does not reorder the topics.
vis_data = gensimvis.prepare(ldagensim, corpus, id2word, sort_topics=False)
pyLDAvis.display(vis_data)

In [None]:
# One row per document: its dominant topic (1-based), that topic's weight as a
# percentage, and the topic's comma-joined terms looked up by 0-based position.
# (The original kept a disabled line attaching df.SentenceId; it remains left out.)
corpus_topic_df = pandas.DataFrame({
    'Dominant Topic': [topic_id + 1 for topic_id, _ in corpus_topics],
    'Contribution %': [round(weight * 100, 2) for _, weight in corpus_topics],
    'Topic Terms': [topics_df.iloc[topic_id]['Terms per Topic'] for topic_id, _ in corpus_topics],
})
corpus_topic_df.head()

In [None]:
# For each dominant topic, keep the single row with the highest 'Contribution %'.
corpus_topic_df.groupby('Dominant Topic').apply(lambda topic_set: (topic_set.sort_values(by=['Contribution %'], ascending=False).iloc[0])).reset_index(drop=True)

In [None]:
# Inspect the topic weights of the first document.
pprint(tm_results[0])

In [None]:
# One row per document, one column per topic id, holding that topic's weight.
df_weights = pandas.DataFrame.from_records([dict(row) for row in tm_results])
# Label columns from the model's own topic count, not a hard-coded 12, so this cell
# keeps working if the number of topics is changed.
df_weights.columns = ['Topic ' + str(i) for i in range(1, ldamallet.num_topics + 1)]
df_weights

# Topic Labeling

In [None]:
# Column label of the highest-weighted topic for each document.
topic = list(df_weights.idxmax(axis=1))

In [None]:

# Attach the dominant-topic label to the loaded sentence frame (mutates df in place).
df['Topics'] = topic

In [None]:
# Highest topic weight for each document.
score = list(df_weights.max(axis=1))

In [None]:
# Stored under 'Correlation Score'; the value is the document's maximum topic weight.
df['Correlation Score'] = score

In [None]:
# Write the labelled sentences to CSV in the working directory; this is the file the
# FileLink cell near the top of the notebook points to.
df.to_csv('topic_wise_pandas_related_sentences.csv')

In [None]:
# Same table exported as an Excel workbook.
df.to_excel('topic_wise_pandas_related_sentences.xlsx')

In [None]:
# Number of distinct sentences assigned to each topic label.
df.groupby('Topics')['sentence'].nunique()