In [52]:
# Configuration
# Location of the texts to model; this value is passed to loader.load() below.
# It points inside the MALLET 2.0.8 install's sample-data folder.
data_path = "C:/mallet/mallet-2.0.8/sample-data/web/en"
# Directory that holds the topic model's files. It is created below if it is
# missing and is handed to Mallet(model_dir=...) when the model is built.
model_dir = "topic_model"

# Python and Lexos imports
import os
import shlex
from subprocess import PIPE, STDOUT, CalledProcessError, Popen, check_output
from wasabi import Printer
from lexos.io.smart import Loader
from lexos import tokenizer
from lexos.topic_model.mallet import Mallet

# Make the model directory if it doesn't exist.
# os.makedirs(..., exist_ok=True) replaces the exists()/mkdir() pair: it has no
# window between the check and the creation, and it also creates any missing
# parent directories should model_dir ever be set to a nested path.
os.makedirs(model_dir, exist_ok=True)

# Load the texts found at data_path with the Lexos smart loader.
loader = Loader()
loader.load(data_path)

# Preview: a heading, a blank line, then the first 1245 characters of the
# first loaded text.
print("Extract of first text:\n", loader.texts[0][:1245], sep="\n")

Extract of first text:

Elizabeth Needham (died 3 May 1731), also known as Mother Needham, was an English procuress and brothel-keeper of 18th-century London, who has been identified as the bawd greeting Moll Hackabout in the first plate of William Hogarth's series of satirical etchings, A Harlot's Progress. Although Needham was notorious in London at the time, little is recorded of her life, and no genuine portraits of her survive. Her house was the most exclusive in London and her customers came from the highest strata of fashionable society, but she eventually crossed the moral reformers of the day and died as a result of the severe treatment she received after being sentenced to stand in the pillory.



In [91]:
?loader.texts
# Tip: in Jupyter command mode, press the 0 key twice (0, 0) to restart the kernel

Type:        list
String form: ["Elizabeth Needham (died 3 May 1731), also known as Mother Needham, was an English procuress and <...> riend Ness Wadia she is a co-owner of the Indian Premier League cricket team Kings XI Punjab.\n']
Length:      12
Docstring:  
Built-in mutable sequence.

If no argument is given, the constructor creates a new empty list.
The argument must be an iterable if specified.


In [75]:
# Turn the loaded texts into tokenised docs with the Lexos tokenizer, using the
# "en_core_web_sm" language model and passing "gorillas" as an extra stop word.
docs = tokenizer.make_docs(
    loader.texts,
    model="en_core_web_sm",
    add_stopwords=["gorillas"]
)

# Write one file per document (data_files/doc0, data_files/doc1, ...) holding
# only the tokens tagged ADJ or NOUN, separated by single spaces.
# open(..., "w") cannot create a missing parent directory, so make sure the
# output folder exists first; otherwise a fresh checkout fails here with
# FileNotFoundError.
os.makedirs("data_files", exist_ok=True)

for i, doc in enumerate(docs):
    tokens = [token.text for token in doc if token.pos_ in ("ADJ", "NOUN")]
    with open(f"data_files/doc{i}", "w") as f:
        f.write(" ".join(tokens))


In [70]:
# Build the Lexos MALLET wrapper: model files go to model_dir, and the MALLET
# launcher is looked up in the given bin directory.
model = Mallet(mallet_path="C:/mallet/mallet-2.0.8/bin", model_dir=model_dir)

# Import the tokenised docs, restricted to adjectives and nouns.
pos_allowed = ["ADJ", "NOUN"]
model.import_data(docs, allowed=pos_allowed)

Bagifying data...
Running C:/mallet/mallet-2.0.8/bin/mallet import-file --input
topic_model/data.txt --output topic_model/import.mallet --keep-sequence
--preserve-case --token-regex "\S+"
✔ Import complete.


In [84]:
# Run MALLET's import step by hand: build the same command line that the
# recorded output of model.import_data() shows, then execute it.
mallet_cmd = "C:/mallet/mallet-2.0.8/bin/mallet import-file --input topic_model/data.txt --output topic_model/import.mallet"
# Alternative: import the per-document files written to data_files/ instead.
# mallet_cmd = "C:/mallet/mallet-2.0.8/bin/mallet import-dir --input data_files --output topic_model/import.mallet"
mallet_cmd += " --keep-sequence"
mallet_cmd += " --preserve-case"
# Raw string: "\S" is not a valid Python escape sequence, so a plain literal
# triggers an invalid-escape warning. The resulting text is unchanged.
mallet_cmd += r' --token-regex "\S+"'
# mallet_cmd += " --remove-stopwords"

msg = Printer()
msg.text(f"Running {mallet_cmd}")
# Split into an argument list; shlex strips the double quotes around the regex
# and leaves the backslash intact.
mallet_cmd = shlex.split(mallet_cmd)

# Perform the import
try:
    # An argument *list* combined with shell=True only works on Windows, where
    # subprocess joins the list back into one command line for the shell.
    # On POSIX every item after the first would be handed to the shell itself
    # rather than to mallet, so the shell is only used on Windows (unchanged
    # behaviour there) and the list is executed directly elsewhere.
    output = check_output(
        mallet_cmd,
        stderr=STDOUT,
        shell=(os.name == "nt"),
        universal_newlines=True,
    )
    msg.good("Import complete.")
except CalledProcessError as e:
    # universal_newlines=True means e.output is already text (no decode needed).
    output = e.output
    msg.fail(output)

Running C:/mallet/mallet-2.0.8/bin/mallet import-file --input
topic_model/data.txt --output topic_model/import.mallet --keep-sequence
--preserve-case --token-regex "\S+"
✔ Import complete.


In [85]:
%%capture

# Run MALLET's train-topics step by hand. The command is written as adjacent
# string literals (one option per line) that concatenate to a single command
# line, which is then split into an argument list.
cmd = (
    "C:/mallet/mallet-2.0.8/bin/mallet train-topics"
    " --input topic_model/import.mallet"
    " --num-topics 20"
    " --num-iterations 1000"
    " --optimize-interval 10"
    " --output-state topic_model/state.gz"
    " --output-topic-keys topic_model/keys.txt"
    " --output-doc-topics topic_model/composition.txt"
    " --word-topic-counts-file topic_model/counts.txt"
    " --output-topic-docs topic_model/topic-docs.txt"
    " --diagnostics-file topic_model/diagnostics.xml"
)
cmd = shlex.split(cmd)
try:
    # An argument list plus shell=True only behaves as intended on Windows; on
    # POSIX the items after the first would go to the shell instead of mallet.
    # Use the shell on Windows only (unchanged behaviour there).
    output = check_output(
        cmd,
        stderr=STDOUT,
        shell=(os.name == "nt"),
        universal_newlines=True,
    )
    # NOTE: this cell runs under %%capture, so the status messages below are
    # not displayed; inspect `output` afterwards to see MALLET's log.
    msg.good("Training complete.")
except CalledProcessError as e:
    # universal_newlines=True means e.output is already text (no decode needed).
    output = e.output
    msg.fail(output)

In [60]:
%%capture
# Train the topic model through the Lexos Mallet wrapper. Per the recorded
# output, this runs `mallet train-topics` on topic_model/import.mallet with
# 20 topics, 1000 iterations and an optimize interval of 10, writing state,
# keys, composition, counts, topic-docs and diagnostics files to topic_model/.
# NOTE(review): the recorded run ends with MALLET's "only supports feature
# sequences: use --keep-sequence" message, which suggests the import it ran
# against was made without --keep-sequence. Confirm by re-running after the
# import cell.
model.train()

Running C:/mallet/mallet-2.0.8/bin/mallet train-topics --input
topic_model/import.mallet --num-topics 20 --num-iterations 1000
--optimize-interval 10 --output-state topic_model/state.gz --output-topic-keys
topic_model/keys.txt --output-doc-topics topic_model/composition.txt
--word-topic-counts-file topic_model/counts.txt --output-topic-docs
topic_model/topic-docs.txt --diagnostics-file topic_model/diagnostics.xml
Mallet LDA: 20 topics, 5 topic bits, 11111 topic mask
Data loaded.
Topic modeling currently only supports feature sequences: use --keep-sequence option when importing data.
