In [1]:
src_file = 'obama.csv'
mallet_path = '\\Mallet\\bin\\mallet'

In [2]:
num_topics = 10
num_iters = 100
show_interval = 100

In [3]:
import pandas as pd
import numpy as np
import sqlite3
import re
import random
import textman as tx
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [4]:
%matplotlib inline


In [5]:
docs = pd.read_csv(src_file, encoding = "ISO-8859-1")
docs = docs.set_index('Unnamed: 0')
docs = docs.sample(2000)

In [6]:
tokens, vocab = tx.create_tokens_and_vocab(docs, src_col='Text')
tokens['token_num'] = tokens.groupby(['Unnamed: 0']).cumcount()
tokens = tokens.reset_index()[['Unnamed: 0','token_num','term_id']]
tokens = tokens[tokens.term_id.isin(vocab[vocab.go].index)]
tokens = tokens.set_index(['Unnamed: 0','token_num'])

In [7]:
tokens, vocab = tx.create_tokens_and_vocab(docs, src_col='Text')
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,token,term_str,term_id
Unnamed: 0,sent_id,token_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
14539,0,4,pivotal,pivotal,21767
14539,0,5,moment,moment,18975
14539,0,8,global,global,12688
14539,0,9,fight,fight,11449
14539,0,11,HIV,hiv,13918


In [9]:
tokens['term_str'] = tokens.term_id.map(vocab.term_str)

In [10]:
vectorizer = TfidfVectorizer(use_idf=1, stop_words='english', token_pattern=r'[A-Za-z][A-Za-z][A-Za-z]+')
X = vectorizer.fit_transform(docs.Text.values.astype('U'))
v = pd.DataFrame(vectorizer.get_feature_names(), columns=['term_str'])
v['idf'] = vectorizer.idf_

In [11]:
v.sort_values('idf', ascending=False).head(10)

Unnamed: 0,term_str,idf
0,aagen,7.908255
15765,lumley,7.908255
15776,lundquist,7.908255
15774,luncheon,7.908255
15772,lunatics,7.908255
15771,lunatic,7.908255
15770,lunar,7.908255
15769,luna,7.908255
15764,luminary,7.908255
15938,mainframe,7.908255


In [37]:
cutoff = 4.5
v = v[v.idf > cutoff].sort_values('idf', ascending=False).sample(800)
my_v = v.term_str.tolist()
tokens = tokens[tokens.term_str.isin(my_v)]

In [38]:
corpus = tx.gather_tokens(tokens, level=0, col='term_str')\
    .reset_index().rename(columns={'term_str':'doc_content'})

In [39]:
corpus.head()


Unnamed: 0.1,Unnamed: 0,doc_content
0,14539,infections
1,13947,suspension suspension
2,6416,stick
3,7224,hadi hadi hadi hadi
4,10646,introducing exporters closest introducing back...


In [40]:
corpus.to_csv('obama100-corpus.csv', index=False)

In [41]:
!{mallet_path}

Mallet 2.0 commands: 
  import-dir        load the contents of a directory into mallet instances (one per file)
  import-file       load a single file into mallet instances (one per line)
  import-svmlight   load a single SVMLight format data file into mallet instances (one per line)
  info              get information about Mallet instances
  train-classifier  train a classifier from Mallet data files
  classify-dir      classify data from a single file with a saved classifier
  classify-file     classify the contents of a directory with a saved classifier
  classify-svmlight classify data from a single file in SVMLight format
  train-topics      train a topic model from Mallet data files
  infer-topics      use a trained topic model to infer topics for new documents
  evaluate-topics   estimate the probability of new documents given a trained model
  prune             remove features based on frequency or information gain
  split             divide data into testing, training, and va

In [42]:
!{mallet_path} import-file --input obama100-corpus.csv --output obama100-corpus.mallet --keep-sequence TRUE

In [43]:
!{mallet_path} train-topics --input obama100-corpus.mallet --num-topics {num_topics} --num-iterations {num_iters} \
--output-doc-topics obama100-doc-topics.txt \
--output-topic-keys obama100-topic-keys.txt \
--word-topic-counts-file obama100-word-topic-counts-file.txt \
--topic-word-weights-file obama100-topic-word-weights-file.txt \
--xml-topic-report obama100-topic-report.xml \
--xml-topic-phrase-report obama100-topic-phrase-report.xml \
--show-topics-interval {show_interval} \
--use-symmetric-alpha false  \
--optimize-interval 100 \
--diagnostics-file obama100-diagnostics.xml

Mallet LDA: 10 topics, 4 topic bits, 1111 topic mask
Data loaded.
max tokens: 137
total tokens: 3561
<10> LL/token: -6.45617
<20> LL/token: -6.3433
<30> LL/token: -6.31119
<40> LL/token: -6.28132
<50> LL/token: -6.28366
<60> LL/token: -6.26674
<70> LL/token: -6.23707
<80> LL/token: -6.26539
<90> LL/token: -6.25618

0	0.5	surprised stick devices hepatitis reception recovering christianity ambition insights tefft myers alternate admiration shoulders rangel sage interface blues classic lifestyle 
1	0.5	disclosure guaranteed borrow notwithstanding amend consolidation messy difficulty merle kayla sar aumf shortly downs candidly devote remembers dover kindness unilateral 
2	0.5	tire rolling port des uss inspections fulton rhode baltic vanaskie proposition specializing newman unscrupulous morty segments prominently toyo portsmouth intercept 
3	0.5	square victory doubts ill justin shutdown empathy caught brush exciting borrower retained champions paths pensions freshman sparked teenage dumping