In [56]:
import pandas as pd

data = pd.read_csv('data', sep=",", header=None)

data.columns = ['text']

data.head()

Unnamed: 0,text
0,From: gld@cunixb.cc.columbia.edu (Gary L Dare)...
1,From: atterlep@vela.acs.oakland.edu (Cardinal ...
2,From: miner@kuhub.cc.ukans.edu\nSubject: Re: A...
3,From: atterlep@vela.acs.oakland.edu (Cardinal ...
4,From: vzhivov@superior.carleton.ca (Vladimir Z...


## Base LDA

- Vectorize the data with a default TfidfVectorizer
- Train an 2 component LDA with your vectorized data
- Visualize your LDA

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

vectorizer = TfidfVectorizer()

data_vectorized = vectorizer.fit_transform(data['text'])

lda_model = LatentDirichletAllocation(n_components=2)

lda_vectors = lda_model.fit_transform(data_vectorized)

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)

Topic 0:
[('the', 180.37116226116748), ('to', 92.15927022794975), ('of', 87.90906476617911), ('and', 70.98867266659153), ('in', 70.57325256750963), ('that', 65.85996951937074), ('is', 64.04727547590463), ('it', 46.80290514975343), ('you', 44.43560629812037), ('edu', 39.12129060550549)]
Topic 1:
[('testing', 1.3782089358093348), ('utk', 1.2850231691721934), ('r1w2', 1.027238769903009), ('khettry', 1.0272387698397973), ('tennessee', 1.0272387698175), ('23064rfl', 1.0272387696360914), ('pub', 0.9373670685478455), ('howell', 0.8426601843791295), ('gilligan', 0.8426601843789115), ('qtr', 0.7354172433564813)]


## Optimize LDA

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

vectorizer = TfidfVectorizer(stop_words = 'english',min_df = 0.05, max_df = 0.75, lowercase=True)

data_vectorized = vectorizer.fit_transform(data['text'])

lda_model = LatentDirichletAllocation(n_components=2)

lda_vectors = lda_model.fit_transform(data_vectorized)

def print_topics(model, vectorizer):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-10 - 1:-1]])
        

print_topics(lda_model, vectorizer)

Topic 0:
[('god', 78.67723932740716), ('edu', 52.34211313278209), ('people', 43.91066094768297), ('jesus', 40.608883363875314), ('church', 34.97455121089752), ('com', 33.0197371904419), ('christians', 31.7626620800247), ('christian', 31.35244028028787), ('believe', 30.42202871766024), ('does', 29.893306162732525)]
Topic 1:
[('edu', 62.444782813884295), ('ca', 62.37552333096973), ('team', 46.13051901300613), ('game', 44.16106145324211), ('hockey', 43.795644442723635), ('university', 33.61363130009653), ('nhl', 32.077804227477415), ('play', 31.20808946983414), ('posting', 29.41968684313877), ('year', 29.367387830234406)]


## Predict topic of new text

- Vectorize the example
- Pass the vectors in the lda model using `transform`
- Use the code to print the predictions the LDA model

In [86]:
example = ["My team performed poorly last season. Their best player was out injured and only played one game"]

example_vectorized = vectorizer.transform(example)

lda_vectors = lda_model.transform(example_vectorized)

print("topic 0 :", lda_vectors[0][0])
print("topic 1 :", lda_vectors[0][1])

topic 0 : 0.149326969624671
topic 1 : 0.850673030375329


## Tag the texts

In [87]:
def tag_topics(text):
    text = [" ".join(text)]
    vectorized = vectorizer.transform(text)
    lda_vectors = lda_model.transform(vectorized)
    sport = lda_vectors[0][0]
    religion = lda_vectors[0][1]
    if sport > religion:
        return "sport"
    else:
        return "religion"

In [88]:
tag_topics(example)

'religion'

In [89]:
data['tag'] = data.text.apply(tag_topics)

data.tail()

Unnamed: 0,text,tag
1194,From: jerryb@eskimo.com (Jerry Kaufman)\nSubje...,religion
1195,From: golchowy@alchemy.chem.utoronto.ca (Geral...,religion
1196,From: jayne@mmalt.guild.org (Jayne Kulikauskas...,religion
1197,From: sclark@epas.utoronto.ca (Susan Clark)\nS...,religion
1198,From: lmvec@westminster.ac.uk (William Hargrea...,religion


In [90]:
print(data.text.iloc[0])

From: gld@cunixb.cc.columbia.edu (Gary L Dare)
Subject: Stan Fischler, 4/4
Summary: From the Devils pregame show, prior to hosting the Penguins
Nntp-Posting-Host: cunixb.cc.columbia.edu
Reply-To: gld@cunixb.cc.columbia.edu (Gary L Dare)
Organization: PhDs In The Hall
Lines: 32


At the Lester Patrick Awards lunch, Bill Torrey mentioned that one of his
options next season is to be president of the Miami team, with Bob Clarke
working for him.  At the same dinner, Clarke said that his worst mistake
in Philadelphia was letting Mike Keenan go -- in retrospect, almost all
players came realize that Keenan knew what it took to win.  Rumours are
now circulating that Keenan will be back with the Flyers.

Nick Polano is sick of being a scapegoat for the schedule made for the
Red Wings; After all, Bryan Murray approved it.

Gerry Meehan and John Muckler are worried over the Sabres' prospects;
Assistant Don Lever says that the Sabres have to get their share now,
because a Quebec dynasty is emerging