In [55]:
import numpy as np
import pandas as pd
from pprint import pprint

import gensim
from gensim.utils import simple_preprocess
from gensim import corpora, models

from nltk.stem import WordNetLemmatizer
import nltk
from nltk.tokenize import word_tokenize

In [74]:
df = pd.read_csv('data/labeled_prelim_processed.csv')
df.head()

Unnamed: 0,Id,Text,Relevancy,Urgency
0,247434,millions afghanistan even zero attack isis sym...,0,0
1,294115,last post brother make social media phone go v...,2,1
2,24622,listen local officials epa help harvey respons...,0,0
3,37807,damn proud tirelessly help fellow texans affec...,3,0
4,37386,help harvey disaster response help victims nat...,0,0


In [75]:
#just keep the text of tweets filtered by relevancy or urgency
df_text = df.loc[df['Urgency'] == 2]['Text']

word_lists = []
for s in df_text:
    word_lists.append(s.split())

In [76]:
dictionary = gensim.corpora.Dictionary(word_lists)
count = 0

#now show the most frequently-tweeted words
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

#now filter out words that show up too infrequently or too frequently
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

0 call
1 conroe
2 evacuations
3 harvey
4 houstonfloodi
5 lake
6 link
7 record
8 release
9 response
10 cnn


In [77]:
bow_corpus = [dictionary.doc2bow(doc) for doc in word_lists]

In [78]:
lda_model = gensim.models.LdaModel(bow_corpus, num_topics=5, id2word=dictionary)

In [79]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.540*"flood" + 0.460*"harvey"
Topic: 1 Word: 0.809*"harvey" + 0.191*"flood"
Topic: 2 Word: 0.642*"harvey" + 0.358*"flood"
Topic: 3 Word: 0.796*"flood" + 0.204*"harvey"
Topic: 4 Word: 0.944*"flood" + 0.056*"harvey"


In [14]:
#now use LDA outputs to extract features and add to the labeled dataframe
num_topics = 15
df = pd.read_csv('data/labeled_prelim_processed.csv').dropna()
df['Relevancy'] = df['Relevancy'].astype(np.int32)
df['Urgency'] = df['Urgency'].astype(np.int32)

scores = []
for i in range(num_topics):
    scores.append([])

for t in df['Text']:
    bow_vector = dictionary.doc2bow(t.split())
    dist = [0.0]*num_topics
    
    for topic, score in lda_model.get_document_topics(bow_vector):
        dist[topic] = score
    
    for i in range(num_topics):
        scores[i].append(dist[i])

#add to the dataframe
for i in range(num_topics):
    df['top' + str(i)] = scores[i]
df.to_csv('data/labeled_prelim_lda.csv', index=False)
df.head()

Unnamed: 0,Id,Text,Relevancy,Urgency,top0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10,top11,top12,top13,top14
0,247434,millions afghanistan even zero attack isis sym...,0,0,0.0,0.326608,0.0,0.098502,0.089703,0.0,0.0,0.0,0.158985,0.0,0.088616,0.0,0.0,0.0,0.191432
1,294115,last post brother make social media phone go v...,2,1,0.0,0.0,0.2675,0.0,0.0,0.0,0.0,0.0,0.145833,0.106667,0.0,0.106667,0.0,0.0,0.306667
2,24622,listen local officials epa help harvey respons...,0,0,0.0,0.0,0.314598,0.132971,0.0,0.258455,0.202309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,37807,damn proud tirelessly help fellow texans affec...,3,0,0.0,0.0,0.088902,0.088699,0.0,0.11002,0.0,0.154831,0.0,0.410966,0.0,0.0,0.0,0.0,0.096582
4,37386,help harvey disaster response help victims nat...,0,0,0.0,0.0,0.211954,0.282543,0.0,0.0,0.0,0.22222,0.0,0.0,0.109938,0.0,0.0,0.0,0.106679


In [29]:
hdp_model = models.HdpModel(bow_corpus, id2word=dictionary)

In [30]:
for idx, topic in hdp_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.139*water + 0.132*flood + 0.130*help + 0.091*harvey + 0.075*go + 0.059*houston + 0.046*need + 0.044*hurricaneharvey + 0.037*volunteer + 0.034*tx
Topic: 1 Word: 0.232*get + 0.105*make + 0.085*hurricaneharvey + 0.083*volunteer + 0.074*water + 0.059*people + 0.053*need + 0.049*hurricane + 0.038*harvey + 0.034*houstonflood
Topic: 2 Word: 0.161*storm + 0.117*harvey + 0.087*thank + 0.066*houstonflood + 0.063*rain + 0.059*hurricane + 0.053*houston + 0.051*still + 0.046*go + 0.036*need
Topic: 3 Word: 0.186*home + 0.126*texas + 0.090*rescue + 0.078*storm + 0.070*rain + 0.061*flood + 0.060*still + 0.037*tx + 0.034*need + 0.030*victims
Topic: 4 Word: 0.177*go + 0.134*home + 0.097*still + 0.086*houstonflood + 0.065*hurricane + 0.055*damage + 0.048*volunteer + 0.036*victims + 0.034*tx + 0.033*rain
Topic: 5 Word: 0.196*people + 0.168*volunteer + 0.124*houston + 0.103*hurricane + 0.057*help + 0.052*rescue + 0.043*hurricaneharvey + 0.040*flood + 0.029*make + 0.026*go
Topic: 6 Word: 0.