In [2]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import json 

from gensim.parsing.preprocessing import STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer


# Materialise the gensim stop words as a de-duplicated plain list; a later
# cell adds corpus-specific rare words to this list.
STOPWORDS = list(set(STOPWORDS))

### Loading tweets

In [3]:
# Load the raw tweet records. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open for the kernel lifetime.
with open('../Data/gnip_sample2m.json') as tweets_file:
    data = json.load(tweets_file)
# Single-argument call form prints the same text under the Python 2 print
# statement and the Python 3 print function.
print("The length of data is {}".format(len(data)))
# Keep only the tweet body of every record.
text = [item[u'twitter_text'] for item in data]

The length of data is 2027707


### Removing duplicates

In [4]:

def deduplication_tweets(tweets):
    '''
    Remove exact duplicates from a collection of tweets.

    tweets: should be a list of text (any iterable of hashable items works)

    Returns a new list holding each distinct tweet once, in the order of its
    first occurrence. Keeping a deterministic order (instead of the arbitrary
    order of list(set(...))) makes the row order of everything derived from
    the result reproducible between runs.
    '''
    seen = set()
    tweets_deduplicated = []
    for tweet in tweets:
        if tweet not in seen:
            seen.add(tweet)
            tweets_deduplicated.append(tweet)
    return tweets_deduplicated

In [5]:
# Drop exact duplicate tweet texts (timed with the %time line magic) and
# report how many distinct tweets remain.
%time tweets_deduplicated = deduplication_tweets(text)
print "The length of new set of tweets is {}".format(len(tweets_deduplicated))

CPU times: user 956 ms, sys: 4 ms, total: 960 ms
Wall time: 960 ms
The length of new set of tweets is 1380434


In [43]:
# Pre-compiled patterns used by tweet_parser. Every pattern is compiled with
# IGNORECASE and MULTILINE; the ur'' prefix is Python 2 only syntax.

# A token that starts with a digit at a word boundary and has at least one
# more non-space character (a lone single digit is NOT matched).
remove_numbers = re.compile(ur'(\b\d\S+)' , re.IGNORECASE | re.MULTILINE)
# Anything starting with "http" at a word boundary, up to the next whitespace.
remove_url = re.compile(ur"(\bhttp\S+)", re.IGNORECASE | re.MULTILINE)
# "@name" where the '@' is not glued to a preceding word character.
remove_user_mention = re.compile(ur'(\B@\S+)', re.IGNORECASE | re.MULTILINE)
# A leading "rt" followed by whitespace, only at the start of a line.
remove_rt = re.compile(ur'(^rt\s+)', re.IGNORECASE | re.MULTILINE)
# Any run of whitespace.
remove_mutiple_spaces = re.compile( ur'\s+', re.IGNORECASE | re.MULTILINE)
# A '#' with whitespace on BOTH sides (a '#' at the very start or end of the
# text is not matched).
remove_isolated_hash = re.compile(ur'(\s+#\s+)', re.MULTILINE | re.IGNORECASE)
# "#tag" where the '#' is not glued to a preceding word character.
remove_hashtag = re.compile(ur'(\B#\S+)', re.IGNORECASE | re.MULTILINE)
# An apostrophe that is not followed by a letter.
remove_appostrophe_b4 = re.compile(ur'\'(?![a-z])', re.MULTILINE | re.IGNORECASE)
# An apostrophe that is not preceded by a letter.
remove_appostrophe_after = re.compile(ur'(?<![a-z])\'', re.MULTILINE | re.IGNORECASE)
# A double quote that is not followed by a letter.
remove_punctn_b4 = re.compile(ur'"(?![a-z])', re.MULTILINE | re.IGNORECASE)
# A double quote that is not preceded by a letter.
remove_punctn_after = re.compile(ur'(?<![a-z])"', re.MULTILINE | re.IGNORECASE)
# A '$' that is not followed by a letter.
remove_dollar = re.compile(ur'\$(?![a-z])', re.MULTILINE | re.IGNORECASE)

def translate_non_alphanumerics(to_translate, translate_to=u' '):
    """Replace punctuation characters in ``to_translate`` with ``translate_to``.

    The characters replaced are ``!"%\\()&*+,-/:;<=>?[]^_.`{|}~``. The
    characters ``#``, ``$``, ``'`` and ``@`` are deliberately not in that set
    and are left untouched.

    to_translate: text to clean. Byte strings are decoded as UTF-8 (a superset
        of the ASCII decoding previously applied implicitly); any other
        non-text object is converted with the text constructor.
    translate_to: replacement for every listed character (default one space;
        an empty string deletes the characters).

    Returns the translated text as a unicode string.
    Raises UnicodeDecodeError if a byte string is not valid UTF-8.
    """
    # Backslashes are written explicitly: the former '\(' and '\]' were
    # invalid escape sequences that only worked by accident.
    not_letters_or_digits = u'!"%\\()&*+,-/:;<=>?[\\]^_.`{|}~'
    text_type = type(u'')  # ``unicode`` on Python 2, ``str`` on Python 3

    def _as_text(value):
        # Coerce bytes / arbitrary objects to text without the implicit ASCII
        # decode that crashed on non-ASCII byte strings.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        if isinstance(value, text_type):
            return value
        return text_type(value)

    # The input is always text at this point, so only the ordinal -> text
    # mapping form of the table is needed (the old ``string.maketrans``
    # branch was unreachable and referenced a module that was never imported).
    replacement = _as_text(translate_to)
    translate_table = dict((ord(char), replacement)
                           for char in not_letters_or_digits)
    return _as_text(to_translate).translate(translate_table)


def tweet_parser(tweet):
    """Clean a collection of tweet texts and return the cleaned list.

    tweet: iterable of tweet strings.

    Each text is lower-cased and stripped, then every substitution below is
    applied in order, each match being replaced by a single space. Punctuation
    is translated with translate_non_alphanumerics between the two groups.
    """
    # Substitutions that run before punctuation is translated away.
    before_translate = (
        remove_numbers,
        remove_url,
        remove_user_mention,
        remove_rt,
        remove_hashtag,
        remove_appostrophe_b4,
        remove_appostrophe_after,
        remove_punctn_b4,
        remove_punctn_after,
    )
    # Substitutions that run after punctuation is translated away.
    after_translate = (
        remove_mutiple_spaces,
        remove_isolated_hash,
        remove_dollar,
    )

    def clean(raw_text):
        cleaned = raw_text.lower().strip()
        for pattern in before_translate:
            cleaned = pattern.sub(' ', cleaned)
        cleaned = translate_non_alphanumerics(cleaned)
        for pattern in after_translate:
            cleaned = pattern.sub(' ', cleaned)
        return cleaned

    return [clean(raw_text) for raw_text in tweet]
    
    
    
    
    

In [44]:
# Clean every de-duplicated tweet; %time reports the cost of the pass.
%time tweet_parsed = tweet_parser(tweets_deduplicated)

CPU times: user 1min 51s, sys: 744 ms, total: 1min 52s
Wall time: 1min 51s


In [None]:
%time tweet_tokens = map(preprocess_tfidf , tweet)


In [None]:

from collections import Counter

# Flatten the per-tweet token lists into one corpus-wide token list.
tweet_collections = [token for tokens in tweet_tokens for token in tokens]

# Count every token and keep those seen at most three times as "rare".
word_count = Counter(tweet_collections)
rare_words = [word for word, count in word_count.items() if not count > 3]

In [39]:
# Fold the rare words into the stop-word list. A set union keeps this cell
# idempotent: plain list concatenation appended another full copy of
# rare_words every time the cell was re-run.
STOPWORDS = list(set(STOPWORDS) | set(rare_words))

In [37]:
# stop_twitter = [i for i in  tfidf_vectorizer.get_feature_names()[:17]]
# stop_twitter_2 =  [i for i in tfidf_vectorizer.get_feature_names()[12:25]]
# STOPWORDS = STOPWORDS + stop_twitter + stop_twitter_2

def preprocess_tfidf(tweet):
    """Tokenise one cleaned tweet.

    tweet: a single text string.

    Returns the whitespace-separated tokens of ``tweet`` as a list, leaving
    out every token that consists only of digits.
    """
    kept_tokens = []
    for token in tweet.split():
        if token.isdigit():
            continue
        kept_tokens.append(token)
    return kept_tokens

CPU times: user 1min 39s, sys: 636 ms, total: 1min 39s
Wall time: 1min 38s


In [45]:
# Define the vectorizer: tokens come from preprocess_tfidf, the extended
# STOPWORDS list is filtered out, and terms appearing in more than 90% of the
# tweets are dropped (max_df = 0.9).
tfidf_vectorizer = TfidfVectorizer(
                                stop_words=STOPWORDS , analyzer='word', max_df = 0.9 , 
                                use_idf=True, tokenizer=preprocess_tfidf)

%time tfidf_matrix = tfidf_vectorizer.fit_transform(tweet_parsed) #fit the vectorizer to the cleaned tweets

print(tfidf_matrix.shape)

CPU times: user 15.7 s, sys: 212 ms, total: 15.9 s
Wall time: 15.8 s
(1380434, 64295)


In [47]:
from sklearn.cluster import KMeans

num_clusters = 51

km = KMeans(n_clusters=num_clusters , verbose=1 , n_jobs=-1)

%time km.fit(tfidf_matrix)



Initialization complete
Initialization complete
Initialization complete
Initialization complete
Iteration  0, inertia 1341588.077
Iteration  0, inertia 1336310.738
Iteration  0, inertia 1335755.316
Iteration  0, inertia 1336907.761
Iteration  1, inertia 1318982.907
Iteration  1, inertia 1322973.460
Iteration  1, inertia 1326269.823
Iteration  1, inertia 1324117.392
Iteration  2, inertia 1314599.946
Iteration  2, inertia 1309793.589
Iteration  2, inertia 1315207.500
Iteration  2, inertia 1316138.917
Iteration  3, inertia 1304179.527
Iteration  3, inertia 1310076.645
Iteration  3, inertia 1305588.057
Iteration  3, inertia 1309714.104
Iteration  4, inertia 1300603.805
Iteration  4, inertia 1307622.688
Iteration  4, inertia 1301265.540
Iteration  4, inertia 1305106.231
Iteration  5, inertia 1298299.737
Iteration  5, inertia 1306429.150
Iteration  5, inertia 1302532.935
Iteration  5, inertia 1299428.550
Iteration  6, inertia 1296815.289
Iteration  6, inertia 1304966.321
Iteration  6, inerti

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=51, n_init=10,
    n_jobs=-1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=1)

In [48]:
import cPickle

# Persist the fitted k-means model. The context manager flushes and closes
# the file deterministically instead of leaving the handle open.
with open('../Data/kmeans_51.pkl' , 'wb') as model_file:
    cPickle.dump(km , model_file , protocol=2)

In [53]:
# Invert the vectorizer vocabulary: column index -> term.
tfidf_idx2vocab = {v:k for k,v in tfidf_vectorizer.vocabulary_.items()}
# For every centroid, the feature indices ordered from largest to smallest
# centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

for i in range(num_clusters):

    # The 11 highest-weighted terms of this centroid.
    cluster_words = order_centroids[i, :11]
    # A named loop variable is used instead of `_`, which IPython reserves
    # for the last cell output.
    words = [tfidf_idx2vocab[word_idx] for word_idx in cluster_words]

    print("Cluster {} , words {}".format(i, words))
    # print("") emits a blank separator line; `print ()` printed the empty
    # tuple "()" under the Python 2 print statement.
    print("")

Cluster 0 , words [u'private', u'prisons', u'contracts', u'prison', u'tell', u'end', u'took', u'student', u'action', u'loans', u'server']
()
Cluster 1 , words [u'baby', u'friend', u'new', u'like', u"i'm", u'football', u'rt', u'young', u's', u'@', u"don't"]
()
Cluster 2 , words [u'day', u'great', u'today', u"it's", u'friend', u'good', u'happy', u'football', u'trump', u'learning', u'like']
()
Cluster 3 , words [u'dell', u'inspiron', u'laptop', u'xps', u'emc', u'ultrabook', u'latitude', u'#', u'deals', u'technologies', u'giveaway']
()
Cluster 4 , words [u'hillary', u'clinton', u'trump', u'obama', u"clinton's", u's', u'campaign', u'donald', u'rt', u'birther', u'vote']
()
Cluster 5 , words [u'intel', u'core', u'i7', u'desktop', u'i3', u'hp', u'lenovo', u'#', u'ram', u'ssd', u'new']
()
Cluster 6 , words [u'lumia', u'nokia', u'windows', u'lte', u'smartphone', u'verizon', u'gsm', u'wireless', u'phones', u'deals', u'unlocked']
()
Cluster 7 , words [u'i5', u'core', u'intel', u'elitebook', u'gen'

In [63]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every component of a fitted model.

    model: object exposing ``components_``, an iterable of 1-D arrays (one
        weight per feature) that support ``argsort``.
    feature_names: sequence mapping a feature index to its term.
    n_top_words: how many terms to print per component.

    Prints a "Topic #i:" header and one space-joined line of terms per
    component, followed by a single blank line. Returns None.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so the reversed slice walks back from the
        # largest weight and stops after n_top_words entries.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in top_indices]))
    # print("") emits a blank line under both the Python 2 print statement
    # and the Python 3 function; a bare print() printed "()" on Python 2.
    print("")

In [61]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [62]:
# Fit the NMF model on the tf-idf matrix with 51 components (the same number
# as the k-means clusters) and a fixed random_state; %time reports the cost.

%time nmf = NMF(n_components=51, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf_matrix)

CPU times: user 27min 50s, sys: 9.82 s, total: 28min
Wall time: 25min 25s


In [64]:
# Show the 21 highest-weighted terms of every NMF component, using the
# vocabulary of the vectorizer the model was fitted on.
print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, 21)


Topics in NMF model:
Topic #0:
trump donald vote media supporters tax president campaign says poll birther racist rally america said jr voters watch support deplorable foundation
Topic #1:
desktop core windows pc duo wifi fast ram tower dvd intel dual dell optiplex pro ghz lcd monitor hd i5 deals
Topic #2:
baby girl boy happy oh birthday cute little beautiful shower yeah come bridget miss got seat sweet blue toddler closer pull
Topic #3:
new fashion week york london style street runway spring nyfw collection models times ss17 looks trends hijabs september designer history city
Topic #4:
checked download mpoints donate distributor wireless today metropcs cricket mobile nearest earn hertz watchers app weight store depot rent avis official
Topic #5:
hillary clinton campaign poll clinton's birther health video pneumonia says voters vote started movement emails supporters trail foundation ohio sick email
Topic #6:
friend happy dear you're birthday old asking ur hello new close got partner 

In [66]:
# Second vectorizer: uses the stock gensim stop words (re-imported as `sp`,
# i.e. without the rare-word extension) and drops terms that occur in fewer
# than 11 tweets (min_df = 11) or in more than 90% of them (max_df = 0.9).
from gensim.parsing.preprocessing import STOPWORDS as sp
tfidf_vectorizer_new = TfidfVectorizer(
                                stop_words=sp , analyzer='word', max_df = 0.9 , min_df = 11 ,
                                use_idf=True, tokenizer=preprocess_tfidf)

%time tfidf_matrix_new = tfidf_vectorizer_new.fit_transform(tweet_parsed) #fit the vectorizer to the cleaned tweets

print(tfidf_matrix_new.shape)

CPU times: user 13.2 s, sys: 144 ms, total: 13.3 s
Wall time: 13.2 s
(1380434, 64295)


In [67]:
# Display the shape of the min_df-filtered tf-idf matrix again.
print(tfidf_matrix_new.shape)

(1380434, 35528)


In [71]:
# LDA with 51 topics, online learning, max_iter=5 and a fixed random_state.
# NOTE(review): the input is the tf-idf matrix produced by TfidfVectorizer;
# LDA is usually fitted on raw term counts (e.g. CountVectorizer output) --
# confirm that fitting on tf-idf weights is intended.
lda = LatentDirichletAllocation(n_topics=51, max_iter=5,
                                learning_method='online',  learning_offset=50.,
                                random_state=0 )
%time lda.fit(tfidf_matrix_new)

CPU times: user 2h 3min 29s, sys: 2.14 s, total: 2h 3min 32s
Wall time: 2h 3min 31s


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_jobs=1, n_topics=51, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [72]:
# Persist the fitted LDA model. The context manager flushes and closes the
# file deterministically instead of leaving the handle open.
with open('../Data/lda_51.pkl', 'wb') as model_file:
    cPickle.dump(lda , model_file , protocol=2)

In [74]:
# Show the 11 highest-weighted terms of every LDA topic. The feature names
# come from tfidf_vectorizer_new, the vectorizer that produced the matrix
# the LDA model was fitted on.
tf_feature_names = tfidf_vectorizer_new.get_feature_names()
print_top_words(lda, tf_feature_names, 11)

Topic #0:
education run special literally account results talks diet wake road boost
Topic #1:
trump donald president obama you're things hillary rt clinton racist trump's
Topic #2:
love today download college checked join guys hey learn story family
Topic #3:
hillary clinton history chicken action fries trump lying death foreign s
Topic #4:
great getting tips better phone pay stay try awesome event proud
Topic #5:
right party campaign wendy's school trump birther data high book gop
Topic #6:
hours second golf enjoy protect sent foods mistakes sit lit creating
Topic #7:
thanks think racism i'm doesn't follow having supporters friends anti makes
Topic #8:
look check hear color mass shirt hand guns tmobile glad dont
Topic #9:
i'll audible problem free yesterday fix amazon pm uk mlk gone
Topic #10:
end ready date lose friday sunday saturday sep tuesday outside semester
Topic #11:
come team support thing women play let's app group rugby men
Topic #12:
vote americans r d deal eat pretty e t