In [73]:
from sklearn.manifold import TSNE
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.models import Phrases
from gensim.models import CoherenceModel
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline
from copy import deepcopy
import plotly.express as px
import numpy as np
import pandas as pd
import json
import os
import logging
import pytextrank
import spacy
import re
import time
import string

In [2]:
nlp = spacy.load("en_core_web_sm")

In [90]:
df_input_medium = pd.read_csv('../inputs/medium_texts.csv')
df_input_confu = pd.read_csv('../inputs/confu_texts.csv')
df_input_medium['src'] = 'medium.com'
df_input_confu['src'] = 'confluence'
df_input = pd.concat([df_input_medium, df_input_confu])
print(len(df_input))
df_input = df_input[~df_input['text'].isnull()]
print(len(df_input))

FileNotFoundError: [Errno 2] File ../inputs/medium_texts.csv does not exist: '../inputs/medium_texts.csv'

In [75]:
def has_digits(string):

    RE_D = re.compile('\d')
    res = RE_D.search(string)
    return res is not None


def remove_special_chars(data):

    # Remove Emails
    data = re.sub('\S*@\S*\s?', '', data)
    # Remove new line characters
    data = re.sub('\s+', ' ', data)
    # Remove distracting single/double quotes
    data = re.sub("\'", "", data)
    data = re.sub("\“", "", data)
    data = re.sub("\‘", "", data)

    return data



invalid escape sequence \d


invalid escape sequence \S


invalid escape sequence \s


invalid escape sequence \d


invalid escape sequence \S


invalid escape sequence \s


invalid escape sequence \d


invalid escape sequence \S


invalid escape sequence \s


invalid escape sequence \d


invalid escape sequence \S


invalid escape sequence \s



In [76]:
chars = re.escape(string.punctuation)

start = time.time()
unigram_docs = []  # This will be the list of lemmatised abstracts
unigram_datafiles = []
docs_titles = []
filenames = []
#docs_abstract = []
docs_src = []
counter = 0

for row in df_input.iterrows():

    title_text = row[1]['title']

    # raw_text = title_text
    raw_text = row[1]['text']
    src = row[1]['src']

    raw_text = remove_special_chars(raw_text)
    # Tokenise & lemmatise data w/ spacy
    doc = nlp(raw_text)
    lemmas = [
        token.lemma_.lower()
        for token in doc
        if (
            not token.is_stop
            and token.pos is not "SYM"
            and token.pos_ is not "PUNCT"
            and len(token) > 1
        )
    ]
    if len(lemmas) > 10:
        unigram_docs.append(lemmas)
        #unigram_datafiles.append(datafile)
        docs_titles.append(title_text)
        #docs_abstract.append(abs_text)
        #filenames.append(datafile)
        docs_src.append(src)
    counter += 1
    if counter % 1000 == 0:
        logger.info(
            f"Tokenised/Lemmatised {counter} of {len(df_input)} files")

print("Tokenising & Lemmatising took: " + str(int(time.time() - start)) + " seconds")

Tokenising & Lemmatising took: 118 seconds


In [77]:
# ========== Add n-grams ==========
start = time.time()

docs = deepcopy(unigram_docs)  # Make new instance before manipulating docs

# Build bigrams
bigram = Phrases(docs, min_count=30)
trigram = Phrases(bigram[docs], min_count=15)
for i in range(len(docs)):
    doc = docs[i]
    bigrams_ = [b for b in bigram[doc] if b.count("_") == 1]
    trigrams_ = [t for t in trigram[bigram[doc]] if t.count("_") == 2]
    # print(f'Found bigrams {bigrams_}')
    # print(f'Found trigrams {trigrams_}')
    docs[i] = doc + bigrams_ + trigrams_


# ===== Inspect n-grams =====
def vis_ngrams(docs_in, n_ngrams=20):

    from collections import Counter

    frequencies = Counter([])
    for text in docs_in:
        frequencies += Counter(text)

    unigram_df = pd.DataFrame(
        [{"ngram": k, "count": v}
            for k, v in frequencies.items() if "_" not in k]
    ).sort_values("count", ascending=False)
    bi_gram_df = pd.DataFrame(
        [{"ngram": k, "count": v} for k, v in frequencies.items() if "_" in k]
    ).sort_values("count", ascending=False)
    trigram_df = pd.DataFrame(
        [{"ngram": k, "count": v}
            for k, v in frequencies.items() if k.count("_") == 2]
    ).sort_values("count", ascending=False)
    print(unigram_df[:n_ngrams])
    print(bi_gram_df[:n_ngrams])
    print(trigram_df[:n_ngrams])

    # Visualise counts of top n-grams
    fig = px.bar(
        unigram_df[:n_ngrams],
        x="ngram",
        y="count",
        title="Counts of top unigrams",
        template="plotly_white",
        labels={"ngram": "Unigram", "count": "Count"},
    )
    fig.show()
    fig = px.bar(
        bi_gram_df[:n_ngrams],
        x="ngram",
        y="count",
        title="Counts of top bi-grams",
        template="plotly_white",
        labels={"ngram": "Bigram", "count": "Count"},
    )
    fig.show()
    fig = px.bar(
        trigram_df[:n_ngrams],
        x="ngram",
        y="count",
        title="Counts of top trigrams",
        template="plotly_white",
        labels={"ngram": "Trigram", "count": "Count"},
    )
    fig.show()
    return True


vis_ngrams(docs, n_ngrams=20)  # TODO - THIS IS SLOW AS SHIT


print("n-gram took: " + str(int(time.time() - start)) + " seconds")

          ngram  count
18        model   2182
359        word   1812
27        datum   1641
997       topic   1095
69          use   1082
2      learning   1059
338        user   1024
16         time    938
57         like    874
15      example    872
56    algorithm    861
37         base    838
164       write    772
137       value    749
64         need    738
1341    network    725
1076   security    714
254      vector    712
1       machine    707
141       learn    703
                       ngram  count
0           machine_learning    475
5                write_write    265
98            neural_network    241
137        anomaly_detection    203
75             deep_learning    156
95          natural_language    132
124               zero_trust    131
72                data_point    121
158          active_learning    121
78             machine_learn    120
77   artificial_intelligence    117
19   collaborative_filtering    115
3                   use_case    108
70           

n-gram took: 17 seconds


In [78]:
# ========== Further filtering ==========
start = time.time()
# Remove custom stopwords
custom_stopwords = [
    "play_important_role",
    "play_critical_role",
    "play_key_role",
    "95_confidence_interval",
    "provide_new_insight",
    "et_al",
    "pubmed_abstract",
    "publisher_text",
    "present_study",
    "results_suggest",
    "result_suggest",
    "95_ci",
    "play_important",
    "study",
    "result",
    "analysis",
    "method",
]
docs = [
    [token for token in doc if token.lower() not in custom_stopwords] for doc in docs
]

# # Visualis n-grams again
# vis_ngrams(docs)
# write out the docs for UI iterations
with open('./app/outputs/list_input_lda.txt', 'w') as f:
    for doc in docs:
        f.write("%s\n" % doc)

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

print("Number of unique tokens: %d" % len(dictionary))
print("Number of documents: %d" % len(corpus))


print("remove stop-words took: " + str(int(time.time() - start)) + " seconds")

Number of unique tokens: 1093
Number of documents: 308
remove stop-words took: 0 seconds


In [79]:
# ========== LDA - train our model with Gensim ==========

start = time.time()
# Set training parameters.
#num_topics = 3

def run_lda(eta):
    num_topics = 12
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make a index to word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    lda = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha=0.9,
        eta=eta,
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every,
    )

    top_topics = lda.top_topics(corpus, coherence='c_v', texts=docs, dictionary=dictionary)  # , num_words=20)
    #top_topics = lda.top_topics(corpus, coherence='u_mass')  # , num_words=20)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    
    print("eta:" + str(eta))
    print("Average topic coherence: %.4f." % avg_topic_coherence)
    return avg_topic_coherence

    #print("remove stop-words took: " + str(int(time.time() - start)) + " seconds")
    
eta_t = [p/10 for p in range(0, 10)]
print(eta_t)
coh_scs = []

for eta in eta_t:
    sc = run_lda(eta)
    coh_scs.append(sc)

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
eta:0.0
Average topic coherence: 0.4650.
eta:0.1
Average topic coherence: 0.4909.


KeyboardInterrupt: 

In [95]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

title = 'LDA coherence scores with respect to number of topics'
labels = ['c_v', 'u_mass']
colors = ['rgb(67,67,67)', 'rgb(115,115,115)', 'rgb(49,130,189)', 'rgb(189,189,189)']

fig=make_subplots(rows=2,cols=1,subplot_titles=['c_v','u_mass'])

y_data_01 =  [0.19921742699847628,
 0.28079662350484474,
 0.35980371058093724,
 0.395017079201736,
 0.4821590803817702,
 0.43829619683937177,
 0.45860677830182006,
 0.4818942613262664,
 0.39528114776635404,
 0.4798636229252404,
 0.47909922816392786,
 0.5190775770023021,
 0.5329843618934249,
 0.48934076284832456,
 0.5303980874590332,
 0.47465643487712117,
 0.4866388160726669,
 0.5045434450081916,
 0.46565409150392145,
 0.48598599284530025,
 0.49349514919643056,
 0.5103408896851072,
 0.4908687552060693,
 0.4796660939604231,
 0.5095330442975221,
 0.46985626127636565,
 0.4844917466283094,
 0.49604733184726285,
 0.4773774264746204,
 0.48247793245538884]

y_data_02 = [-0.8978147181782018,
 -0.9928057788091744,
 -1.1104580738655876,
 -1.0608213337352475,
 -1.2320093303138713,
 -1.1861190198977658,
 -1.1424316771171663,
 -1.0424943377624214,
 -1.1475245129297282,
 -1.1714240778717329,
 -1.0453274630994847,
 -1.1056946547895485,
 -1.0513800652020002,
 -1.215334278746648,
 -1.0754648311409247,
 -1.2885182074179897,
 -1.1163046546247972,
 -1.0617696141618247,
 -1.0782215651872822,
 -1.1223966448032467,
 -1.1083426758328354,
 -1.257567500221343,
 -1.0501582206591515,
 -1.1265728237660053,
 -1.0631668810115849,
 -1.1102809849620843,
 -1.1664045674346477,
 -1.2160498090724865,
 -1.1077271903862107,
 -1.0876131227710812]

trace=go.Scatter(x=num_t,
                 y=y_data_01,
                 line=dict(width=2, color=colors[0]),
                 marker=dict(color=colors[1]),
                 showlegend=False
                )
fig.append_trace(trace,1,1)

trace=go.Scatter(x=num_t,
                 y=y_data_02,
                 line=dict(width=2, color=colors[0]),
                 marker=dict(color=colors[2]),
                 showlegend=False)
fig.append_trace(trace,2,1)
fig['layout'].update(height=800, width=800, title=title)

fig.show()
fig.write_html("./app/assets/lda_cv_coherence.html")


The append_trace method is deprecated and will be removed in a future version.
Please use the add_trace method with the row and col parameters.



The append_trace method is deprecated and will be removed in a future version.
Please use the add_trace method with the row and col parameters.




In [80]:
# ========== LDA - train our model with Gensim ==========

start = time.time()
# Set training parameters.
num_topics = 12
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

lda = LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha=0.9,
        eta="auto",
        iterations=iterations,
        num_topics=num_topics,
        passes=passes,
        eval_every=eval_every,
)

top_topics = lda.top_topics(corpus, coherence='c_v', texts=docs, dictionary=dictionary)  # , num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    
print("num_topics:" + str(num_topics))
print("Average topic coherence: %.4f." % avg_topic_coherence)


print("remove stop-words took: " + str(int(time.time() - start)) + " seconds")

num_topics:12
Average topic coherence: 0.5004.
remove stop-words took: 10 seconds


In [81]:
# Compute Perplexity
print('\nPerplexity: ', lda.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda, texts=docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -6.3137364866317505

Coherence Score:  0.5003597118939366


In [82]:
df_topic_coherence = pd.DataFrame(top_topics)
df_topic_coherence.columns = ['keywords', 'coherence']

list_topic_number = lda.print_topics()

ls_tn = []
for tn in list_topic_number:
    #print(tn)
    ls_tn.append((tn[0], 
                  [tn[1].split(' + ')[0].split('*')[1].replace('"', ''), 
                  tn[1].split(' + ')[1].split('*')[1].replace('"', ''), 
                  tn[1].split(' + ')[2].split('*')[1].replace('"', '')])
                 )
    
def extract_top3(col_keywords):
    
    res = [k[1] for k in col_keywords][:3]
    return res

df_topic_coherence['top3'] = df_topic_coherence['keywords'].apply(extract_top3)
df_ls_tn = pd.DataFrame(ls_tn)
df_ls_tn.columns = ['topic_number', 'top3']
df_topic_coherence['top3'] = df_topic_coherence['top3'].astype(str)
df_ls_tn['top3'] = df_ls_tn['top3'].astype(str)
df_topic_coherence = df_topic_coherence.merge(df_ls_tn, on='top3')

df_topic_coherence.to_csv("./app/outputs/df_topic_coherence.csv", index=False)


In [83]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
vis

In [84]:
pyLDAvis.save_html(vis, './app/assets/lda.html')

In [85]:
# ========== T-SNE transform ==========

start = time.time()
lda_vals = list()
for d in corpus:
    topics_tup = lda.get_document_topics(
        d
    )  # This should be a N by K matrix where N = corpus size, K = topics
    temp_dict = {i: 0 for i in range(num_topics)}
    for t in topics_tup:
        temp_dict[t[0]] = t[1]
    lda_vals.append(temp_dict)

lda_df = pd.DataFrame(lda_vals)
lda_arr = lda_df.values

lda_topics = {i[0]: i[1].split(" + ") for i in lda.print_topics(-1)}
topics_txt = [lda_topics[i] for i in range(num_topics)]
topics_txt = [[j.split("*")[1].replace('"', "")
               for j in i] for i in topics_txt]
topics_txt = ["; ".join(i) for i in topics_txt]

lda_df = lda_df.assign(topic_id=[str(lda_arr[i].argmax())
                                 for i in range(len(lda_arr))])
lda_df = lda_df.assign(
    topic_txt=[topics_txt[lda_arr[i].argmax()] for i in range(len(lda_arr))]
)
lda_df = lda_df.assign(
    topics=["Topic: " + str(lda_arr[i].argmax()) for i in range(len(lda_arr))]
)
lda_df = lda_df.assign(title=docs_titles)
lda_df = lda_df.assign(filename=docs_titles)
lda_df = lda_df.assign(src=docs_src)

# for tsne_perp in [20, 35, 50, 100, 200]:  # Test out different perplexity values
for tsne_perp in [40]:  # Test out different perplexity values
    tsne_embeds = TSNE(
        n_components=2,
        perplexity=tsne_perp,
        n_iter=350,
        n_iter_without_progress=100,
        learning_rate=500,
        random_state=42,
    ).fit_transform(lda_arr)
    lda_df = pd.concat([lda_df, pd.DataFrame(
        tsne_embeds, columns=["x", "y"])], axis=1)

    # Visualise the t-SNE topics
    topic_ids = "Topic: " + lda_df["topic_id"].astype(str).values
    fig = px.scatter(
        lda_df,
        title="t-SNE test, perplexity: " + str(tsne_perp),
        x="x",
        y="y",
        color=topic_ids,
        color_discrete_sequence=px.colors.qualitative.Light24,
        hover_name="title",
        hover_data=["topic_txt"],
        template="plotly_white",
    )
    fig.show()

lda.save("./app/outputs/lda_model")
lda_df.to_csv("./app/outputs/lda_df.csv", index=False)
with open("./app/outputs/lda_topics.json", "w") as f:
    json.dump(lda_topics, f)
    

print("T-SNE transform took: " + str(int(time.time() - start)) + " seconds")

T-SNE transform took: 1 seconds


In [86]:
df_summary = pd.read_csv("./app/outputs/summary_df.csv.gz")
print(df_summary.shape)
df_summary.head(2)

(401, 23)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,topic_txt,topics,title,filename,src,x,y,author,date,summary
0,0.326917,0.0,0.527555,0.0,0.0,0.0,0.143781,0.0,0.0,0.0,...,anomaly; detection; anomaly_detection; algorit...,Topic: 2,A Beginners Guide to Unsupervised Learning | b...,A Beginners Guide to Unsupervised Learning | b...,medium.com,0.632441,4.514701,Mathanraj Sharma,2019-08-09T12:48:35.943Z,"Before we need to train our model, we should k..."
1,0.023542,0.0,0.0,0.0,0.0,0.0,0.0,0.79855,0.0,0.0,...,topic; distance; document; word; distribution;...,Topic: 7,Importance of Distance Metrics in Machine Lear...,Importance of Distance Metrics in Machine Lear...,medium.com,-6.218726,14.404735,Alekhyo Banerjee,2020-06-07T16:33:17.714Z,The general formula for Euclidea distance in n...


In [87]:
df_summary.summary.values[0]

'Before we need to train our model, we should know how many different types of species (clusters) First, we cluster the data with different number of clusters and plot the number of clusters vs.inertia graph. \n Since we know the number of clusters let’s build a model and visualize the result. \n'

In [88]:
df_summary = lda_df.merge(df_summary[['title', 'author', 'date', 'summary']], on='title')
print(df_summary.shape)
df_summary.head(2)

(308, 23)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,topic_txt,topics,title,filename,src,x,y,author,date,summary
0,0.552072,0.010688,0.0,0.033453,0.020717,0.017713,0.012129,0.016541,0.0,0.0,...,ml; matrix; aws; dimension; variance; cluster;...,Topic: 0,A Beginners Guide to Unsupervised Learning | b...,A Beginners Guide to Unsupervised Learning | b...,medium.com,3.132126,1.852913,Mathanraj Sharma,2019-08-09T12:48:35.943Z,"Before we need to train our model, we should k..."
1,0.010015,0.024086,0.0,0.0,0.0,0.112276,0.732022,0.042991,0.025896,0.014048,...,distance; similarity; metric; vector; point; c...,Topic: 6,Importance of Distance Metrics in Machine Lear...,Importance of Distance Metrics in Machine Lear...,medium.com,6.386958,7.409829,Alekhyo Banerjee,2020-06-07T16:33:17.714Z,The general formula for Euclidea distance in n...


In [89]:
df_summary.to_csv("./app/outputs/summary_df.csv.gz", compression='gzip', index=False)