# Imports

In [1]:
from datetime import time
from pathlib import Path

# gensim
import gensim
from gensim import corpora
from gensim.models import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from matplotlib import pyplot as plt

In [2]:
MALLET_PATH = r"C:\mallet\bin\mallet"  # set to where your "bin/mallet" path is

In [3]:
tm_models_dir = Path(r"C:\Users\martin\git\master-thesis\6_models")
coherence_models_dir = Path(r"C:\Users\martin\git\master-thesis\7_evaluation\gensim\models")

Files and Folders

In [4]:
wdir = Path("../..")

coherence_models_dir = wdir.joinpath("7_evaluation", "gensim", "models")
corpusdir = wdir.joinpath("5_corpus")
evaluationdir = wdir.joinpath("6_evaluation")
coherencedir = wdir.joinpath("7_evaluation", "gensim")

# Functions

## Topic Modeling

In [5]:
def create_tm(dictionary, corpus, topic_count, type="gensim"):
    if type == "mallet":
        print("--Starting Topic Modeling (Mallet)...")
        return gensim.models.wrappers.LdaMallet(mallet_path=MALLET_PATH, corpus=corpus, num_topics=topic_count,
                                                id2word=dictionary)
    elif type == "gensim":
        print("--Starting Topic Modeling (Gensim)...")
        return LdaMulticore(corpus, num_topics=topic_count, id2word=dictionary, workers=11)


def read_file(file):
    docs = []
    for l in file.open().readlines():
        doc = l.split(" ")
        docs.append(doc)
    return docs


def get_file_count(dir, name):
    return str(sum(1 for f in dir.glob(name+"*")) + 1)


def create_model_path(formatdir, overwrite):
    if overwrite:
        return formatdir.joinpath(f"tm.bin")
    else:
        count = get_file_count(formatdir, "tm")
        return formatdir.joinpath(f"tm_{count}.bin")


def save_tm_model(lda_model, filename, type, topic_count, format, overwrite=True):
    format = filename.split("-")[0]
    # creating subfolders
    modelsdir = evaluationdir.joinpath("models", type)
    segdir = modelsdir.joinpath(f"seglen-{seglen}")
    topicsdir = segdir.joinpath(f"topics-{topic_count}")
    topicsdir.mkdir(exist_ok=True, parents=True)
    formatdir = topicsdir.joinpath(format)
    formatdir.mkdir(exist_ok=True, parents=True)
    # outfile path
    outfile_model = create_model_path(formatdir, "tm", overwrite)
    # write file
    lda_model.save(str(outfile_model))
    print(f"--saved TM to: {outfile_model}")


def save_coherence_model(lda_model, texts, filename, type, measure):
    coh_model = CoherenceModel(model=lda_model, texts=texts, coherence=measure)
    format = filename.split("-")[0]
    # creating subfolders
    modelsdir = evaluationdir.joinpath("models", type)
    segdir = modelsdir.joinpath(f"seglen-{seglen}")
    topicsdir = segdir.joinpath(f"topics-{topic_count}")
    formatdir = topicsdir.joinpath(format)
    formatdir.mkdir(exist_ok=True, parents=True)
    # outfile path
    outfile_model = formatdir.joinpath(f"coh-{measure.replace('_', '')}.bin")
    coh_model.save(str(outfile_model))
    print(f"--saved Coherence to: {outfile_model}")


# Start TM

In [7]:
type = "mallet"  # "mallet" | "gensim" | "all"
seglen = 500
topic_count = 60
measure = "c_v"
format = "original"
overwrite = False
# coherence graph
create_coherence_graph = True

corpus_files = corpusdir.glob(f"seglen-{seglen}/{format}*.txt")

corpus_files = Path(r"C:\Users\martin\git\master-thesis\5_corpus\seglen-500").glob("*.txt")

for file in corpus_files:
    print("File: " + file.stem)
    # splitting lines to docs
    documents = read_file(file)
    # build a dictionary
    print("--Creating Dictionary")
    dictionary = corpora.Dictionary(documents)
    # Turns each document into a bag of words.
    print("--Creating Doc2Bow")
    corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in documents]
    lda_model = create_tm(dictionary, corpus, topic_count, type)
    save_tm_model(lda_model, file.stem, type, topic_count, overwrite=overwrite)
    # create coherence model
    save_coherence_model(lda_model, documents, file.stem, type, measure, overwrite=overwrite)
print("Finished Modeling")

File: tkn-JJ_NN_NNS_VV_VVD_VVG_VVN_VVP_VVZ
--Creating Dictionary
--Creating Doc2Bow
--Starting Topic Modeling (Mallet)...


TypeError: save_tm_model() missing 1 required positional argument: 'format'

# Single Topic Models

In [None]:
type = "mallet"  # "mallet" | "gensim" | "all"
seglen = 500
topic_count = 60
measure = "c_v"
format = "original"
overwrite = False
# coherence graph
create_coherence_graph = True

corpus_files = corpusdir.glob(f"seglen-{seglen}/*.txt")

corpus_files = Path(r"C:\Users\martin\git\master-thesis\5_corpus\seglen-500").glob("original*.txt")

for file in corpus_files:
    print("File: " + file.stem)
    # splitting lines to docs
    documents = read_file(file)
    # build a dictionary
    print("--Creating Dictionary")
    dictionary = corpora.Dictionary(documents)
    # Turns each document into a bag of words.
    print("--Creating Doc2Bow")
    corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in documents]
    lda_model = create_tm(dictionary, corpus, topic_count, type)
    save_tm_model(lda_model, file.stem, type, topic_count, overwrite=overwrite)
    # create coherence model
    save_coherence_model(lda_model, documents, file.stem, type, measure, overwrite=overwrite)
print("Finished Modeling")

# Coherence Variance

In [4]:
def coherence_variance(file, iteration, topic_count, measure, type):
    print("File: " + file.stem)
    # splitting lines to docs
    documents = read_file(file)
    # build a dictionary
    print("--Creating Dictionary")
    dictionary = corpora.Dictionary(documents)
    # Turns each document into a bag of words.
    print("--Creating Doc2Bow")
    corpus = [dictionary.doc2bow(doc, allow_update=True) for doc in documents]
    lda_model = create_tm(dictionary, corpus, topic_count, type)
    # write file
    tmfile  = r"C:\Users\martin\git\master-thesis\6_evaluation\coherence-variance\gensim\tm_" + str(iteration) + ".bin"
    lda_model.save(tmfile)
    # create coherence model
    cohfile = r"C:\Users\martin\git\master-thesis\6_evaluation\coherence-variance\gensim\coh_" + str(iteration) + ".bin"
    coh_model = CoherenceModel(model=lda_model, texts=documents, coherence=measure)
    coh_model.save(cohfile)

In [5]:
type = "gensim"  # "mallet" | "gensim" | "all"
seglen = 500
topic_count = 60
measure = "c_v"

for i in range(0, 10):
    print("Iteration: ", i)
    coherence_variance(Path(r"C:\Users\martin\git\master-thesis\5_corpus\seglen-500\original-500.txt"), i, topic_count, measure, type)

Iteration:  0
File: original-500
--Creating Dictionary
--Creating Doc2Bow
--Starting Topic Modeling (Gensim)...
Iteration:  1
File: original-500
--Creating Dictionary
--Creating Doc2Bow
--Starting Topic Modeling (Gensim)...
Iteration:  2
File: original-500
--Creating Dictionary
--Creating Doc2Bow
--Starting Topic Modeling (Gensim)...
Iteration:  3
File: original-500
--Creating Dictionary
--Creating Doc2Bow
--Starting Topic Modeling (Gensim)...
Iteration:  4
File: original-500
--Creating Dictionary
--Creating Doc2Bow
--Starting Topic Modeling (Gensim)...
Iteration:  5
File: original-500
--Creating Dictionary
--Creating Doc2Bow
--Starting Topic Modeling (Gensim)...
Iteration:  6
File: original-500
--Creating Dictionary
--Creating Doc2Bow
--Starting Topic Modeling (Gensim)...
Iteration:  7
File: original-500
--Creating Dictionary
--Creating Doc2Bow
--Starting Topic Modeling (Gensim)...
Iteration:  8
File: original-500
--Creating Dictionary
--Creating Doc2Bow
--Starting Topic Modeling (Gen

In [None]:
path = Path(r"C:\Users\martin\git\master-thesis\6_evaluation\coherence-variance")
coh_models = {}
for file in path.glob("coh*.bin"):
    print(file.stem)
    coh_models.update({file.stem: CoherenceModel.load(str(file))})
print("finished")

In [None]:
coh_models

# Multiple Coherences

In [None]:
def compute_coherence_graph(type, texts, start, limit, step):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    # build a dictionary
    print("--Creating Dictionary")
    dictionary = corpora.Dictionary(texts)
    # Turns each document into a bag of words.
    print("--Creating Doc2Bow")
    corpus = [dictionary.doc2bow(doc) for doc in texts]
    print("Creating coherence Graph...")
    for num_topics in range(start, limit, step):
        model = create_tm(dictionary=dictionary, corpus=corpus, type=type, topic_count=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    # Show graph
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()

    # Print the coherence scores
    for m, cv in zip(x, coherence_values):
        print("Num Topics =", m, " has Coherence Value of", round(cv, 4))


start = 200
limit = 300
step = 10

corpus_files = Path(r"C:\Users\martin\git\master-thesis\5_corpus\test")


for file in corpus_files.glob("*.txt"):
    print("File: " + file.stem)
    # splitting lines to docs
    texts = read_file(file)
    compute_coherence_graph(type=type, texts=texts, start=start,
                            limit=limit, step=step)
