In [3]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import pyLDAvis.sklearn

import sys
import os
import logging
import re
import csv
import json
import smart_open
import codecs
import nltk
import time
import pickle
import numpy as np
from collections import OrderedDict

from text_process.tweets_text_processor import TextProcessor

In [4]:
def load_model(model_file, tweets):
    pipeline = pickle.load(open(model_file, "rb"))
    model = pipeline.transform(tweets)
    lda = pipeline.named_steps['lda']
    vect = pipeline.named_steps['vect']
    return [pipeline, model, lda, vect]

In [5]:
def get_topic_word_dist(topics, feature_names, num_words=100):
    word_dists = []  # [{word1: weight1, word2: weight2, word3: weight3....}, {word1, weight1, word2: weight2....}...]
    for topic_idx, topic in enumerate(topics):
        feature_list = OrderedDict()  # {word1: weight, word2: weight ...}
        for i in topic.argsort()[:-num_words - 1:-1]:
            feature = feature_names[i]
            weight = round(topic[i], 2)
            feature_list[feature] = weight

        word_dists.append(feature_list)
    return word_dists

In [6]:
def get_topic_dist(components, feature_names, model):
    average = np.average(np.array(model), axis=0)
    top_topics = average.argsort()[::-1]
    top_component = [components[i] for i in top_topics]
    top_topics_words = get_topic_word_dist(top_component, feature_names)

    top_topics = np.array(list(map(lambda x:x+1, top_topics)))
    result = {"avg": average, "top_topics": top_topics, "top_component": top_component, "top_topics_words":
              top_topics_words}
    return result

In [7]:
def consine_similarirty(vec1, vec2):
    numerator = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)

    return numerator / (norm_vec1 * norm_vec2)

In [8]:
def compare_topics(pipeline, model, num_bf_doc):
    # TODO find the right threshold
    THRESDHOLD = 20

    components = pipeline._final_estimator.components_
    feature_names = pipeline.named_steps['vect'].get_feature_names()

    # overall topic distributions
    model_t = model
    result_t = get_topic_dist(components, feature_names, model_t)

    model_bf = model[:num_bf_doc]
    result_bf = get_topic_dist(components, feature_names, model_bf)

    top_topics_bf = result_bf['top_topics'][:THRESDHOLD]

    model_af = model[num_bf_doc:]
    result_af = get_topic_dist(components, feature_names, model_af)
    top_topics_af = result_af['top_topics'][:THRESDHOLD]

    print("Topic Distribution Similarity", consine_similarirty(result_bf['avg'], result_af['avg']))
    print("Common topic", list(top_topics_bf | top_topics_af))
    unique_topic = list(set(top_topics_bf).symmetric_difference(set(top_topics_af)))
    print("Unique Topic", unique_topic)
    bf_unique = list(filter(lambda x: x in unique_topic, top_topics_bf))
    af_unique = list(filter(lambda x: x in unique_topic, top_topics_af))
    bf_unique_diff = [str(result_bf["avg"][x-1])for x in bf_unique]
    af_unique_diff = [str(result_af["avg"][x-1])for x in af_unique]
    print("BF unique topic", bf_unique, bf_unique_diff)
    print("AF unique topic", af_unique, af_unique_diff)

    print("\n\n")
    # TODO print unique topics for period

    return result_t, result_bf, result_af

In [9]:
def show_topic_dist(result, period, num_topics=20):
    print("{} topic distribution: \n".format(period))

    top_topics = result["top_topics"]
    print("top_topics: \n", ' | '.join([str(top_topics[i]) + " - " + str(result["avg"][top_topics[i]-1]) for i in range(num_topics)]))
    for i, words in enumerate([', '.join(list(dist.keys())[:15]) for dist in result["top_topics_words"][:num_topics]]):
        print("topic " + str(top_topics[i]) + " top words: \n", words)
    print("\n\n")

In [10]:
def show_topics(result_t, result_bf, result_af):
    # overall topic distributions
    show_topic_dist(result_t, "overall")

    show_topic_dist(result_bf, "before")

    show_topic_dist(result_af, "after")


In [11]:
def extract_tweet_with_period(incident, input_paths, output_path, metadata_path):
    """
    extract tweets before and after disaster start
    input_paths are two paths, geotagged and timeline folders
    """
    file_bf = os.path.join(output_path, "{}_tweets_bf.csv".format(incident))
    file_af = os.path.join(output_path, "{}_tweets_af.csv".format(incident))

    if os.path.isfile(file_bf) and os.path.isfile(file_af):
        logging.debug("Tweets already extracted")
        corpus = []
        files = [file_bf, file_af]
        for file in files:
            with open(file,'r') as file:
                lines = file.readlines()
                tweets = lines[1:]
                corpus.append(tweets)

            logging.debug(tweets[0:5])

    else:
        text_proc = TextProcessor(metadata_path, input_paths, output_path, model="topic")
        corpus = text_proc.process_tweets(incident)

    return corpus

In [12]:
input_paths = ["/Users/stellachoi/Box Sync/research_work/events_tweets/Event - 319 - Moore Tornado/geotagged_from_archive/",
               "/Users/stellachoi/Box Sync/research_work/events_tweets/Event - 319 - Moore Tornado/user_timelines/"]
incident_metadata_path = '/Users/stellachoi/Box Sync/research_work/disaster_analysis/incident_metadata.csv'
output_path = "/Users/stellachoi/Box Sync/research_work/disaster_analysis/data"
incident = 319
corpus = extract_tweet_with_period(incident, input_paths, output_path, incident_metadata_path)
tweets = corpus[0][:]
tweets.extend(corpus[1])
print("tweet_bf_len: {} tweet_af_len: {}".format(len(corpus[0]), len(corpus[1])))

tweet_bf_len: 435723 tweet_af_len: 769034


In [13]:
model_file = "backup/topic_pipeline_100_0305_1.p"
[pipeline, model, lda, vect] = load_model(model_file, tweets)
[result_t, result_bf, result_af] = compare_topics(pipeline, model, len(corpus[0]))
show_topics(result_t, result_bf, result_af)


Topic Distribution Similarity 0.998423624339
Common topic [98, 64, 73, 4, 73, 111, 111, 26, 70, 33, 115, 87, 94, 87, 111, 127, 90, 95, 100, 63]
Unique Topic [2, 11, 19, 96, 100, 45]
BF unique topic [11, 96, 45] ['0.0121278772737', '0.0118833073899', '0.0118066496072']
AF unique topic [19, 2, 100] ['0.0140305296143', '0.0127287698619', '0.0121966581746']



overall topic distribution: 

top_topics: 
 98 - 0.0220655046756 | 64 - 0.017818341449 | 73 - 0.0172755935108 | 4 - 0.0171673339216 | 9 - 0.0168327662759 | 47 - 0.0163389056468 | 68 - 0.0162061543709 | 26 - 0.0155795146073 | 70 - 0.0152073790755 | 33 - 0.0150172470159 | 19 - 0.0131830849531 | 99 - 0.0131816157404 | 82 - 0.0131725678466 | 5 - 0.0131366171254 | 90 - 0.0130080250889 | 79 - 0.0129834762354 | 94 - 0.0126127475065 | 2 - 0.0123904865884 | 55 - 0.0123878957868 | 25 - 0.0118983118106
topic 98 top words: 
 day, life, school, head, alreadi, hey, end, gone, left, spend, sign, thunder, fit, hurri, univers
topic 64 top words: 
 lo

In [14]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda, vect.transform(corpus[0]), vect, sort_topics=False, mds='tsne')
panel

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


In [15]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda, vect.transform(corpus[1]), vect, sort_topics=False, mds='tsne')
panel

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
