In [1]:
import pandas as pd

In [2]:
df = pd.read_json('../data/posts_preprocessed.json')

In [3]:
posts = df.prep_body.values.tolist()

## Count Vectorizer / TFIDF

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [6]:
ngrams = (1,2)

# COUNT VECTORIZER
# vectorizer = CountVectorizer(ngram_range=ngrams)

# TFIDF
vectorizer = TfidfVectorizer(ngram_range=ngrams, max_df=.2)

vectorizer.fit(posts)
x = vectorizer.transform(posts)
x.shape

(19506, 551964)

## LDA Model

In [7]:
# Run LDA
no_topics = 30
lda = LatentDirichletAllocation(n_topics=no_topics).fit(x)



In [8]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print("\n".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        print('\n')

In [9]:
no_top_words = 10
feature_names = vectorizer.get_feature_names()
display_topics(lda, feature_names, no_top_words)

Topic 0:
christian
seek work
metal vocalist
highway
paist
manson
christian rock
tim job
paid posit
christian band


Topic 1:
de
la
en
para
que
lo
el
con
un
music


Topic 2:
lesson lo
rock rol
songbook
be
warrior
weekend warrior
dirt
go go
bluz
bucket


Topic 3:
aveng
sevenfold
aveng sevenfold
show practic
punk altern
lin
romanc
ratt
chemic
chemic romanc


Topic 4:
book agent
agent
interest contact
follow link
record video
rock origin
henderson academi
academi music
prayer
click link


Topic 5:
michael jackson
threat
zip
ross
trump
look blue
matchbox twenti
minor threat
dunn
rv


Topic 6:
music
record
studio
work
contact
lesson
song
artist
show
guitar


Topic 7:
regga
tour
swiss
fulli fullwood
fullwood
hawaii
bass fulli
regga artist
hawaiian
record


Topic 8:
pittsburgh
slipknot
group play
drummer band
need place
jean
warn
etc must
good look
thank hope


Topic 9:
band
look
play
rock
music
player
like
drummer
guitar
bass


Topic 10:
pm
god
sunday
musician
thursday
servic
friday
tuesday
c

## topic exploration

In [10]:
import numpy as np

In [11]:
topics_matrix = lda.transform(x)

In [12]:
def explorer(tm, topic_num, posts):    
    indices = np.argsort(tm[:,topic_num])
    # get top "posts" most probabilistic posts in this topic
    top_indices = indices[len(indices):len(indices)-posts-1:-1]
    for rank, idx in enumerate(top_indices):
        print("POST RANK: ", rank+1)
        print("POST PROB: ", topics_matrix[idx,topic_num])
        print("URL: ",df.url.iloc[idx])
        print("POST CONTENT: ", '\n', df.body.iloc[idx], '\n')
        print("---------------------------------------", '\n')

In [16]:
explorer(topics_matrix, 18, 20)

POST RANK:  1
POST PROB:  0.546969696973
URL:  https://stlouis.craigslist.org/muc/d/6-month-white-pit-bull/6341518154.html
POST CONTENT:  
 6 month white pit bullshe's a pit female good with kids  and partly potty trained 

--------------------------------------- 

POST RANK:  2
POST PROB:  0.501190476191
URL:  https://neworleans.craigslist.org/muc/d/alienwolf/6344458545.html
POST CONTENT:  
 ALIENWOLFnew orleans musician alienwolf  latest recording ghosthorn . your invited to listen and download for free. 

explore the frequencies of your mind.

 www.reverbnation.com/alienwolf 

--------------------------------------- 

POST RANK:  3
POST PROB:  0.501190476191
URL:  https://sanantonio.craigslist.org/muc/d/alienwolf/6354806607.html
POST CONTENT:  
 ALIENWOLFnew orleans musician alienwolf latest recording ghosthorn . your invited to listen and download for free. 

explore the frequencies of your mind.

www.reverbnation.com/alienwolf 

--------------------------------------- 

POST RANK:

In [17]:
import pickle 
pkg = [lda.components_, topics_matrix]
with open("./models/lda_12grams_30n-dedup.pkl", "wb") as f:
    pickle.dump(pkg, f)