# Exploring Topic Modeling

Here we show how we explored topic modeling of tweets.

In [85]:
%matplotlib inline
from Politweet import get_tweets, get_transcript
import pandas as pd
pd.set_option('display.max_colwidth', 1200)

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the tweet corpus once; later cells iterate its rows and read the "tokens" column.
tweets_path = "./datasets/tweets.tsv"
tweets = get_tweets(tweets_path)

## K-Means clustering with TF-IDF

Our assumption was that TF-IDF scores would let us form clusters of words, which k-means could then identify. However, the results were poor.

In [86]:
import collections
import string
from pprint import pprint

def k_means_cluster(texts, clusters=20, top_terms=15, random_state=None):
    """Cluster texts with k-means over their TF-IDF vectors.

    Args:
        texts: iterable of strings, one document per entry.
        clusters: number of k-means clusters to fit.
        top_terms: how many of the highest-weighted centroid terms to
            report for each cluster.
        random_state: forwarded to KMeans; pass an int to make the
            clustering reproducible between runs.

    Returns:
        A tuple (clustering, km_model, tfidf_model) where clustering maps
        each cluster label to a list of its top terms, km_model is the
        fitted KMeans estimator and tfidf_model is the TF-IDF matrix the
        estimator was fitted on.
    """
    vectorizer = TfidfVectorizer(min_df=0.01, max_df=1.0, lowercase=False)
    tfidf_model = vectorizer.fit_transform(texts)
    # vocabulary_ maps term -> column index; invert it to look terms up by column.
    inv_map = {v: k for k, v in vectorizer.vocabulary_.items()}
    km_model = KMeans(n_clusters=clusters, random_state=random_state)
    km_model.fit(tfidf_model)

    # km_model.labels_ holds one label per input text, so it cannot be used to
    # index the vocabulary. Each cluster is described instead by the columns
    # (terms) that carry the most weight in its centroid.
    clustering = collections.defaultdict(list)
    for label, centroid in enumerate(km_model.cluster_centers_):
        heaviest = centroid.argsort()[::-1][:top_terms]
        clustering[label].extend(inv_map[col] for col in heaviest)

    return clustering, km_model, tfidf_model

# Build one whitespace-joined document per tweet from its token list,
# then cluster those documents into six groups.
articles = [" ".join(tokens) for tokens in tweets["tokens"]]
clusters, km_model, X = k_means_cluster(articles, clusters=6)
# pprint(dict(clusters))

## Extracting the topics from the transcript

We use word counts and non-negative matrix factorization (NMF) to extract topics from the debate transcript.

In [87]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import decomposition

from Politweet import get_transcript

# Read the debate transcript and keep only the text of each sentence.
transcript_path = "./datasets/transcript.csv"
debate = get_transcript(transcript_path)
sentences = list(debate["sentence"])

def extract_topic_from_transcript(sentences, num_topics = 10, num_top_words = 15):
    """Extract topics from sentences with word counts and NMF.

    Args:
        sentences: iterable of strings, one document per entry.
        num_topics: number of NMF components (topics) to fit.
        num_top_words: number of highest-weighted words kept per topic.

    Returns:
        A list with one entry per topic; each entry is the list of that
        topic's words ordered from highest to lowest component weight.
    """
    # Imported here because the notebook's import cells only bring in
    # TfidfVectorizer; without this the function raises NameError on a fresh kernel.
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(stop_words='english', min_df=10)
    dtm = vectorizer.fit_transform(sentences).toarray()
    # Newer scikit-learn releases only provide get_feature_names_out();
    # older ones only provide get_feature_names(). Support both.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = np.array(vectorizer.get_feature_names_out())
    else:
        vocab = np.array(vectorizer.get_feature_names())

    # random_state is fixed so repeated runs give the same topics.
    clf = decomposition.NMF(n_components=num_topics, random_state=1)
    clf.fit(dtm)

    topic_words = []
    for topic in clf.components_:
        # argsort is ascending, so reverse it to get the heaviest words first.
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        topic_words.append([vocab[i] for i in word_idx])

    return topic_words

def most_relevant_topics(topics, num_top_words=15):
    """Print one "Topic <index>: <words>" line per topic.

    Args:
        topics: sequence of word lists, one list per topic.
        num_top_words: maximum number of leading words printed per topic.
    """
    for i, t in enumerate(topics):
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print("Topic {}: {}".format(i, ' '.join(t[:num_top_words])))

# Fit the topic model on the transcript sentences and show the top words per topic.
topics = extract_topic_from_transcript(sentences)
most_relevant_topics(topics)

# Single-argument print(...) works on both Python 2 and Python 3.
print(topics)

Topic 0: ve going sure got make think things economy country don years time care important just
Topic 1: senator obama said general thing new plan iraq lead understand doesn minutes want security question
Topic 2: spending government got cut want care billion obama look senator new know make economy ve
Topic 3: iran senator mccain iraq nuclear need years let ve right united said difference going like
Topic 4: know right think john don ve ll united states just let need make strategy said
Topic 5: united states president going people world point way let america senate look say know issue
Topic 6: troops afghanistan ve iraq strategy al going said cut deal time difference got don war
Topic 7: tax want people cut make tell look let problem billion care american right year america
Topic 8: mccain senator just say oil president talk means like think going said economy issue did
Topic 9: nuclear work important ve point way think cut view data world security going issue america
[[u've', u'going

In [114]:
import gensim, sys, os, codecs
import pandas as pd

from nltk.corpus import brown
from Politweet import get_tweets


# Each entry pairs a topic label with the seed words that stand for that topic.
# NOTE(review): the saved output of this cell shows a "tax" label that is not
# listed here, so that output appears to come from an earlier run -- re-run to refresh.
topics_raw = [
    ("healthcare", ['hospital', 'doctor', 'insurance']),
    ("war", ['war']),
    ("economy", ['economy']),
    ("energy", ['energy'])
]

# Split the pairs into parallel sequences: labels and seed-word lists.
topic_labels, topic_seed_words = zip(*topics_raw)
topics = list(topic_seed_words)
# The extra trailing label is used when no topic scores high enough.
topicsDict = list(topic_labels) + ["none"]
no_topic = len(topicsDict) - 1


def train_word2vec_brown(output="datasets/brown_word2vec.model"):
    """Train a Word2Vec model on the Brown corpus sentences and save it.

    Args:
        output: path the trained model is written to.
    """
    corpus = brown.sents()
    # min_count=1 is passed through to gensim unchanged.
    gensim.models.Word2Vec(corpus, min_count=1).save(output)


def load_word2vec_model(model="datasets/brown_word2vec.model"):
    """Load a saved Word2Vec model from disk.

    Args:
        model: path of the saved model file.

    Returns:
        The loaded gensim Word2Vec model, or None when no file exists at
        the given path.
    """
    if not os.path.isfile(model):
        # Nothing has been saved at this path; callers must handle None.
        return None
    return gensim.models.Word2Vec.load(model)

def highest(scores, threshold=0.7):
    """Return the index of the best-scoring topic, or no_topic.

    Only finite scalar scores are considered. Non-scalar entries (for
    example arrays) and NaN/infinite values are ignored, so a single bad
    score cannot hide a valid one. Scalars of any float type are accepted.

    Args:
        scores: sequence of similarity scores, one per topic.
        threshold: minimum score a topic needs to be selected.

    Returns:
        The index of the first highest usable score when it is at least
        threshold; otherwise the module-level no_topic index. Empty input
        also yields no_topic.
    """
    best_idx, best_score = no_topic, None
    for idx, score in enumerate(scores):
        # Skip anything that is not a single finite number.
        if np.ndim(score) != 0 or not np.isfinite(score):
            continue
        # Strict ">" keeps the first of several equal scores, like argmax.
        if best_score is None or score > best_score:
            best_idx, best_score = idx, score

    if best_score is None or best_score < threshold:
        return no_topic
    return best_idx

def get_topic(model, tokens, topics):
    """Pick the topic whose seed words are most similar to the tokens.

    Args:
        model: object supporting `word in model` and
            `model.n_similarity(words_a, words_b)`; the notebook passes the
            Word2Vec model returned by load_word2vec_model.
        tokens: iterable of word tokens to classify.
        topics: list of seed-word lists, one per topic, in the same order
            as the labels in the module-level topicsDict.

    Returns:
        A tuple (label, score): the selected label from topicsDict and its
        similarity score, or the trailing fallback label with score 0 when
        highest() selects no topic.
    """
    # Drop words the model does not know.
    tokens = [token for token in tokens if token in model]
    topics = [[w for w in topic if w in model] for topic in topics]
    # n_similarity cannot average an empty word list, so a pair with nothing
    # left on either side is scored 0.0 instead of being passed to the model.
    sims = [
        model.n_similarity(topic, tokens) if topic and tokens else 0.0
        for topic in topics
    ]
    high = highest(sims)
    # highest() may return an index one past the end of sims (the fallback label).
    return (topicsDict[high], sims[high] if high < len(sims) else 0)

# Load the cached Word2Vec model. load_word2vec_model() returns None when the
# model file does not exist yet, so train and save it once in that case.
model = load_word2vec_model()
if model is None:
    train_word2vec_brown()
    model = load_word2vec_model()
tweets = get_tweets("datasets/tweets.tsv")

# Score every tweet once and split the (label, score) pairs into two columns,
# instead of running the similarity computation twice per tweet.
topic_matches = tweets['tokens'].apply(lambda x: get_topic(model, x, topics))
tweets['topics'] = topic_matches.apply(lambda match: match[0])
tweets['topics_s'] = topic_matches.apply(lambda match: match[1])

# Show only the tweets that were assigned a topic.
tweets.loc[tweets['topics'] != 'none', ['topics', 'topics_s', 'content']]

Unnamed: 0_level_0,topics,topics_s,content
tweet.id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
936475544,war,0.947707,Obama -2 it an occupation not war #tweetdebate
936479038,economy,0.735377,-1 McCain for stalling with faux sympathy #tweetdebate
936492695,energy,0.728398,McCain -1 for a total non-answer #tweetdebate
936493022,economy,0.835766,-1 McCain Contrary statements regarding the state of the economy \r#tweetdebate
936504460,economy,0.748875,McCain: the sherrif? uh? #tweetdebate #current
936504645,tax,0.836829,McCain -1 for failing to address the keystone tax cut issue #tweetdebate
936508498,tax,0.793628,#tweetdebate McPain: we're exporting jobs 'cause of hi biz taxes. NOT because of cheap labor? Oh geez...
936509906,tax,0.780857,@current #current How can you give all those tax breaks and expect the economy to not go futher into the hole?
936511049,energy,0.764864,"@current Ah yes, the pot and the kettle are debating who is to blame for the proverbial heat in the kitchen. #current"
936512032,tax,0.753789,-1 McCain for floating idea of $5k tax cuts #tweetdebate
