# Imports

In [79]:
import time
from pathlib import Path

# gensim
import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaModel, LdaMulticore

import pandas as pd
import numpy as np

# Parameters (need to be set)

In [80]:
# Location of the MALLET launcher; create_tm passes it to the LdaMallet wrapper.
# NOTE(review): machine-specific absolute path -- adjust per installation.
MALLET_PATH = r"C:\mallet\bin\mallet"  # set to where your "bin/mallet" path is

# LDA parameters
# NUM_TOPICS is the topic count create_tm hands to the model constructors.
NUM_TOPICS = 10
# NOTE(review): ITERATIONS, INTERVAL, START, LIMIT and STEP are not referenced
# by any cell visible in this notebook -- presumably leftovers from a
# topic-count sweep; confirm before relying on them.
ITERATIONS = 10
INTERVAL = 10  # original note: "topics-10 is MALLET's default" (unverified)
START = 20
LIMIT = NUM_TOPICS  # NOTE(review): evaluates to 10, which is below START (20) -- confirm intent
STEP = 10
# Segment length: selects the corpus files ("*-<seglen>.txt") and names the
# "seglen-<seglen>" output folder.
# NOTE(review): a string here, but rebound to the int 500 in the "TM Parameters" cell.
seglen = "500"

# Backend selector read by create_tm. NOTE(review): this name shadows the builtin `type`.
type = "gensim" # "mallet" | "gensim" | "all" -- create_tm only handles "mallet" and "gensim"
coherence = True  # when True, a CoherenceModel is saved next to each topic model
measure = "c_v"  # coherence measure name passed to CoherenceModel

In [81]:
# NOTE(review): both paths are absolute and specific to one machine.
# tm_models_dir is not referenced by any cell visible in this notebook.
tm_models_dir = Path(r"C:\Users\martin\git\master-thesis\6_models")
# This binding is replaced by a wdir-relative path in the "Files and Folders" cell.
coherence_models_dir = Path(r"C:\Users\martin\git\master-thesis\7_evaluation\gensim\models")

# Files and Folders

In [82]:
# Project root, given relative to the current working directory.
wdir = Path("../..")

# Corpus input and evaluation output folders, all anchored at the project root.
corpusdir = wdir / "5_corpus"
evaluationdir = wdir / "6_evaluation"
coherencedir = wdir / "7_evaluation" / "gensim"
coherence_models_dir = coherencedir / "models"

# Functions

## Topic Modeling

In [86]:
def create_tm(dictionary, corpus, type, num_topics=None):
    """Train an LDA topic model with the requested backend.

    Args:
        dictionary: gensim ``Dictionary`` for the corpus (passed on as ``id2word``).
        corpus: Bag-of-words corpus, e.g. a list of ``dictionary.doc2bow(doc)`` results.
        type: Backend name, either ``"mallet"`` or ``"gensim"``.
        num_topics: Number of topics to train. When omitted, the notebook-level
            ``NUM_TOPICS`` setting is used.

    Returns:
        The trained model: an ``LdaMallet`` wrapper for ``"mallet"``, an
        ``LdaMulticore`` instance for ``"gensim"``.

    Raises:
        ValueError: If ``type`` is not one of the supported backends.
    """
    # Validate first: an unknown backend used to fall through and return None,
    # which only surfaced later as an AttributeError when the model was saved.
    if type not in ("mallet", "gensim"):
        raise ValueError(
            f"Unsupported topic model type {type!r}; expected 'mallet' or 'gensim'"
        )

    if num_topics is None:
        num_topics = NUM_TOPICS

    if type == "mallet":
        print("--Starting Topic Modeling (Mallet)...")
        # NOTE: the gensim.models.wrappers package is only available in gensim < 4.0.
        return gensim.models.wrappers.LdaMallet(
            mallet_path=MALLET_PATH,
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
        )

    print("--Starting Topic Modeling (Gensim)...")
    return LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary, workers=12)


def read_file(file):
    """Tokenise an open text file into one token list per line.

    Args:
        file: An open text-mode file object (any iterable of lines works).

    Returns:
        A list with one entry per line; each entry is the list of
        whitespace-separated tokens on that line. A blank line yields an
        empty list, so the number of documents still equals the number of lines.
    """
    # split() without an argument splits on any whitespace run and drops the
    # line terminator. split(" ") kept the trailing "\n" glued to the last token
    # of every line and produced "" tokens for repeated spaces, both of which
    # ended up as bogus entries in the dictionary.
    return [line.split() for line in file]


def save_tm_model(lda_model, filename, type, topic_count):
    """Save a trained topic model inside the evaluation directory tree.

    The model is written to
    ``<evaluationdir>/models/<type>/seglen-<seglen>/topics-<topic_count>/<prefix>/tm.bin``,
    where ``<prefix>`` is the part of ``filename`` before its first ``-`` and
    ``evaluationdir`` / ``seglen`` are notebook-level settings. Missing folders
    are created.

    Args:
        lda_model: Model object exposing ``save(path)``.
        filename: Corpus file stem, e.g. ``"frq-500"``.
        type: Backend name, used as a folder name.
        topic_count: Topic count, used in the ``topics-<n>`` folder name.
    """
    prefix = filename.split("-")[0]
    # Build the whole nested folder in one go and create it with its parents.
    target_dir = evaluationdir.joinpath(
        "models",
        type,
        f"seglen-{seglen}",
        f"topics-{topic_count}",
        prefix,
    )
    target_dir.mkdir(parents=True, exist_ok=True)

    outfile_model = target_dir / "tm.bin"
    lda_model.save(str(outfile_model))
    print(f"--saved TM to: {outfile_model}")


def save_coherence_model(lda_model, texts, filename, type, measure, topic_count=None):
    """Build a CoherenceModel for a topic model and save it next to that model.

    The file is written to
    ``<evaluationdir>/models/<type>/seglen-<seglen>/topics-<topic_count>/<prefix>/coh-<measure>.bin``
    (underscores removed from ``measure``), where ``<prefix>`` is the part of
    ``filename`` before its first ``-``. No coherence score is computed here;
    call ``get_coherence()`` / ``get_coherence_per_topic()`` on the loaded model.

    Args:
        lda_model: Trained topic model to evaluate.
        texts: Tokenised documents (list of token lists) the model was trained on.
        filename: Corpus file stem, e.g. ``"frq-500"``.
        type: Backend name, used as a folder name.
        measure: Coherence measure name, e.g. ``"c_v"``.
        topic_count: Topic count used in the ``topics-<n>`` folder name. When
            omitted, the notebook-level ``topic_count`` setting is used, which
            is what this function always read implicitly before.
    """
    if topic_count is None:
        # Backward-compatible fallback for callers that rely on the notebook global.
        topic_count = globals()["topic_count"]

    coh_model = CoherenceModel(model=lda_model, texts=texts, coherence=measure)

    prefix = filename.split("-")[0]
    target_dir = evaluationdir.joinpath(
        "models",
        type,
        f"seglen-{seglen}",
        f"topics-{topic_count}",
        prefix,
    )
    target_dir.mkdir(exist_ok=True, parents=True)

    outfile_model = target_dir.joinpath(f"coh-{measure.replace('_', '')}.bin")
    coh_model.save(str(outfile_model))
    print(f"--saved Coherence to: {outfile_model}")

# TM Parameters

In [84]:
# Backend used by create_tm. NOTE(review): this name shadows the builtin `type`.
type = "gensim" # "mallet" | "gensim"  ("all" is not handled by create_tm)
seglen = 500
topic_count = 50
# create_tm trains with NUM_TOPICS while the save helpers label the output
# folder with topic_count. Keep both in sync so a model stored under
# "topics-50" really has 50 topics.
NUM_TOPICS = topic_count
coherence = True
measure = "c_v"

# Start TM

In [87]:
start = time.time()
# Path.glob yields entries in an OS-dependent order; sort so runs are
# processed (and logged) in a stable order.
corpus_files = sorted(corpusdir.glob(f"*-{seglen}.txt"))
if not corpus_files:
    # Make an empty match visible instead of silently "finishing".
    print(f"No corpus files matching '*-{seglen}.txt' found in {corpusdir}")
for file in corpus_files:
    # NOTE(review): no encoding is given, so the platform default is used.
    with file.open("r") as f:
        print("File: " + file.stem)
        # splitting lines to docs
        texts = read_file(f)
    # The corpus is fully in memory now, so the file is closed before the
    # long-running training starts.
    # build a dictionary
    print("--Creating Dictionary")
    dictionary = corpora.Dictionary(texts)
    # Turns each document into a bag of words.
    print("--Creating Doc2Bow")
    corpus = [dictionary.doc2bow(doc) for doc in texts]
    lda_model = create_tm(dictionary, corpus, type)
    save_tm_model(lda_model, file.stem, type, topic_count)
    # create coherence model if enabled
    if coherence:
        save_coherence_model(lda_model, texts, file.stem, type, measure)
print("Finished Modeling")
end = time.time()
print(f"Elapsed timed: {end - start}")

File: frq-500
--Creating Dictionary
--Creating Doc2Bow
--Starting Topic Modeling (Gensim)...
--saved TM to: ..\..\6_evaluation\models\gensim\seglen-500\topics-50\frq\tm.bin
--saved Coherence to: ..\..\6_evaluation\models\gensim\seglen-500\topics-50\frq\coh-cv.bin
File: original-500
--Creating Dictionary
--Creating Doc2Bow
--Starting Topic Modeling (Gensim)...
--saved TM to: ..\..\6_evaluation\models\gensim\seglen-500\topics-50\original\tm.bin
--saved Coherence to: ..\..\6_evaluation\models\gensim\seglen-500\topics-50\original\coh-cv.bin
File: src-500
--Creating Dictionary
--Creating Doc2Bow
--Starting Topic Modeling (Gensim)...
--saved TM to: ..\..\6_evaluation\models\gensim\seglen-500\topics-50\src\tm.bin
--saved Coherence to: ..\..\6_evaluation\models\gensim\seglen-500\topics-50\src\coh-cv.bin
Finished Modeling
Elapsed timed: 58.08111119270325


In [5]:
def get_coherence_models(seglen, topic_count):
    """Load the saved coherence models for one segment length / topic count.

    Looks in the notebook-level ``coherence_models_dir`` for files named
    ``<prefix>_<seglen>-<topic_count><suffix>.bin``, where ``<suffix>`` may be
    empty but must not start with a digit.

    Args:
        seglen: Segment length as it appears in the file name.
        topic_count: Topic count as it appears in the file name.

    Returns:
        Dict mapping each matching file's stem to its loaded ``CoherenceModel``.
    """
    marker = f"_{seglen}-{topic_count}"
    models = {}
    for file in sorted(coherence_models_dir.glob(f"*{marker}*.bin")):
        # The trailing "*" in the glob also matches longer numbers, so
        # topic_count=1 would pick up "..._500-10.bin" or "..._500-100.bin".
        # Skip files where the text right after the marker continues the number.
        remainder = file.stem.rpartition(marker)[2]
        if remainder[:1].isdigit():
            continue
        print(f"--{file.name}")
        # NOTE: load() unpickles the file -- only load model files you trust.
        models[file.stem] = CoherenceModel.load(str(file))
    return models

In [6]:
# Load the saved coherence models that match the current seglen / topic_count settings.
coherence_models = get_coherence_models(seglen, topic_count)

--frq_500-1.bin
--original_500-1.bin
--src_500-1.bin


In [8]:
# One single-row column per coherence model: the column is the model name,
# the value is what get_coherence() returns for it.
data = {
    name: [model.get_coherence()]
    for name, model in coherence_models.items()
}

models_df = pd.DataFrame(data)
models_df.head()

Unnamed: 0,frq_500-1,original_500-1,src_500-1
0,0.227331,0.302424,0.25351


In [9]:
# NOTE(review): tm_models is not defined in any cell visible in this notebook;
# it has to exist in the kernel before this cell can run.
# One column per topic model, holding the entries returned by show_topics().
data = {
    name: model.show_topics()
    for name, model in tm_models.items()
}

models_df = pd.DataFrame(data)
models_df.head()

Unnamed: 0,frq_1,model_1,original_1,src_1
0,"(0, 0.007*""time"" + 0.006*""hand"" + 0.005*""eye"" ...","(0, 0.007*""time"" + 0.006*""hand"" + 0.005*""eye"" ...","(0, 0.007*""time"" + 0.006*""hand"" + 0.005*""eye"" ...","(0, 0.007*""time"" + 0.006*""hand"" + 0.005*""eye"" ..."


In [20]:
# Print the per-topic coherence list of each loaded model, followed by a blank line.
for model in coherence_models.values():
    print(model.get_coherence_per_topic(), end="\n\n")

[0.2273314785136141]

[0.30242439859419223]

[0.25350978014942205]

