# Exploring Topic Modeling

Here we show how we explored topic modeling of tweets

In [9]:
%matplotlib inline
from Politweet import get_tweets, get_transcript
import pandas as pd
pd.set_option('display.max_colwidth', 1200)

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

tweets = get_tweets("./datasets/tweets.tsv")

## K-Means clustering with TF-IDF

Our assumption was that by using TF-IDF scores we could have created cluster of words and run k-means to identify these. However, the results were poor.

In [42]:
import collections
import string
from pprint import pprint

def k_means_cluster(texts, clusters=20):
    vectorizer = TfidfVectorizer(min_df=0.01, max_df=1.0, lowercase=False)
    tfidf_model = vectorizer.fit_transform(texts)
    inv_map = {v: k for k, v in vectorizer.vocabulary_.items()}
    km_model = KMeans(n_clusters=clusters)
    km_model.fit(tfidf_model)
    clustering = collections.defaultdict(list)
 
    for idx, label in enumerate(km_model.labels_):
        try: clustering[label].append(inv_map[idx])
        except: pass
 
    return clustering, km_model, tfidf_model

articles = [" ".join(tweet["tokens"]) for i,tweet in tweets.iterrows()]
clusters, km_model, X = k_means_cluster(articles, 6)
# pprint(dict(clusters))

## Extracting the topics from the transcript

We use word

In [59]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import decomposition

from Politweet import get_transcript

debate = get_transcript("./datasets/transcript.csv")
sentences = [t["sentence"] for i,t in debate.iterrows()]

def extract_topic_from_transcript(sentences, num_topics = 10, num_top_words = 15):
    vectorizer = CountVectorizer(stop_words='english', min_df=10)
    dtm = vectorizer.fit_transform(sentences).toarray()
    vocab = np.array(vectorizer.get_feature_names())

    clf = decomposition.NMF(n_components=num_topics, random_state=1)
    doctopic = clf.fit_transform(dtm)

    topic_words = []
    for topic in clf.components_:
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        topic_words.append([vocab[i] for i in word_idx])
        
    return topic_words

def most_relevant_topics(topics):
    for i, t in enumerate(topics):
        print "Topic {}: {}".format(i, ' '.join(t[:15]))

topics = extract_topic_from_transcript(sentences)
most_relevant_topics(topics)

print topics

Topic 0: ve going sure got make think things economy country don years time care important just
Topic 1: senator obama said general thing new plan iraq lead understand doesn minutes want security question
Topic 2: spending government got cut want care billion obama look senator new know make economy ve
Topic 3: iran senator mccain iraq nuclear need years let ve right united said difference going like
Topic 4: know right think john don ve ll united states just let need make strategy said
Topic 5: united states president going people world point way let america senate look say know issue
Topic 6: troops afghanistan ve iraq strategy al going said cut deal time difference got don war
Topic 7: tax want people cut make tell look let problem billion care american right year america
Topic 8: mccain senator just say oil president talk means like think going said economy issue did
Topic 9: nuclear work important ve point way think cut view data world security going issue america
[[u've', u'going

In [76]:
import gensim, sys, os, codecs
import pandas as pd

from nltk.corpus import brown
from Politweet import get_tweets

topicsDict = [
    "healthcare",
    "war",
    "economy",
    "energy",
    "tax",
    "none"
]

no_topic = len(topicsDict) - 1

topics = [
    ['health'],
    ['war'],
    ['economy'],
    ['energy'],
    ['tax']
]


def train_word2vec_brown(output="datasets/brown_word2vec.model"):
    model = gensim.models.Word2Vec(brown.sents(), min_count=1)
    model.save(output)


def load_word2vec_model(model="datasets/brown_word2vec.model"):
    if os.path.isfile(model):
        return gensim.models.Word2Vec.load(model)

def highest(scores):
    high = np.argmax(scores)
    print scores, high, scores[high], 
    if scores[high] > 0.5:
        return high
    else:
        return no_topic

def get_topic(model, tokens, topics):
    tokens = [token for token in tokens if token in model]
    topics = [[w for w in topic if w in model] for topic in topics]
    sims = [model.n_similarity(topic, tokens) for topic in topics]
    return highest(sims)

model = load_word2vec_model()
tweets = get_tweets("datasets/tweets.tsv")

    
tweets['topic'] = tweets['tokens'].apply(lambda x: get_topic(model, x, topics))
tweets[['topic', 'content']]

[0.12636150006913435, 0.14951522222546454, -0.043197817771781402, -0.18606836294029028, -0.0050136906351124604] 1 0.149515222225 [-0.2665228431788369, 0.17165042091907029, -0.093914764962146791, -0.045586865590204043, -0.20005837840570176] 1 0.171650420919 [0.33008900396183738, 0.50302189541506315, 0.48504269160162033, 0.27635524080467622, 0.14009863026813171] 1 0.503021895415 [0.052961323534288951, 0.14016547767993906, -0.10150179104339774, -0.19259171975226502, -0.089825038607552171] 1 0.14016547768 [0.52131212545507533, 0.18067339652986186, 0.52412725925443493, 0.63958855275575321, 0.48481558288069471] 3 0.639588552756 [0.07669343646836381, 0.2960685477954611, 0.16191680376308318, 0.020901672976016257, 0.048092263388477469] 1 0.296068547795 [0.13553013120848123, 0.18080230427948515, 0.062004371074420216, 0.21127318658002103, 0.0011300007495583207] 3 0.21127318658 [0.29701805429818301, 0.94770709813659793, 0.69572119190803505, 0.16189622493219605, 0.36155132415167812] 1 0.94770709813

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()