In [36]:
import json
import pandas as pd

tweets_file = 'websummit_dump_20151106155110'
with open(tweets_file) as f:
    tweets = json.load(f)
    
print('# of tweets:', len(tweets))

tweet_text = [tweet['text'] for tweet in tweets]
df = pd.DataFrame({'text': tweet_text})
df.head()

# of tweets: 77111


Unnamed: 0,text
0,@sarahtavel What #MustHave #tech gadget can yo...
1,Start-ups from every continent heading to #web...
2,I'm at the #WebSummit2015 this week. On ali(at...
3,@jalak What #MustHave #tech gadget can you not...
4,#websummit is about to kickoff in #dublin! Wha...


In [27]:
import re

class TweetPreprocessor(object):

    def __init__(self):
        self.FLAGS = re.MULTILINE | re.DOTALL
        self.ALLCAPS = '<allcaps>'
        self.HASHTAG = '<hashtag>'
        self.URL = '<url>'
        self.USER = '<user>'
        self.SMILE = '<smile>'
        self.LOLFACE = '<lolface>'
        self.SADFACE = '<sadface>'
        self.NEUTRALFACE = '<neutralface>'
        self.HEART = '<heart>'
        self.NUMBER = '<number>'
        self.REPEAT = '<repeat>'
        self.ELONG = '<elong>'

    def _hashtag(self, text):
        text = text.group()
        hashtag_body = text[1:]
        if hashtag_body.isupper():
            result = (self.HASHTAG + " {} " + self.ALLCAPS).format(hashtag_body)
        else:
            result = " ".join([self.HASHTAG] + re.findall(r"(?=[A-Z])", hashtag_body, flags=self.FLAGS))
        return result

    def _allcaps(self, text):
        text = text.group()
        return text.lower() + ' ' + self.ALLCAPS

    def preprocess(self, text):
        eyes, nose = r"[8:=;]", r"['`\-]?"

        re_sub = lambda pattern, repl: re.sub(pattern, repl, text, flags=self.FLAGS)

        text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", self.URL)
        text = re_sub(r"/"," / ")
        text = re_sub(r"@\w+", self.USER)
        text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), self.SMILE)
        text = re_sub(r"{}{}p+".format(eyes, nose), self.LOLFACE)
        text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), self.SADFACE)
        text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), self.NEUTRALFACE)
        text = re_sub(r"<3", self.HEART)
        text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", self.NUMBER)
        text = re_sub(r"#\S+", self._hashtag)
        text = re_sub(r"([!?.]){2,}", r"\1 " + self.REPEAT)
        text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 " + self.ELONG)

        text = re_sub(r"([A-Z]){2,}", self._allcaps)

        return text.lower()

In [28]:
tweet_processor = TweetPreprocessor()

# an example:
tweet = "@sarahtavel What #MustHave #tech gadget can you not travel without? Stop by stand D131 on Wed at #WebSummit #Dublin https://t.co/2wFLAVpGiV"
print("Before: " + tweet + "\n")
print("After: " + tweet_processor.preprocess(tweet))

Before: @sarahtavel What #MustHave #tech gadget can you not travel without? Stop by stand D131 on Wed at #WebSummit #Dublin https://t.co/2wFLAVpGiV

After: <user> what <hashtag>   <hashtag> gadget can you not travel without? stop by stand d<number> on wed at <hashtag>   <hashtag>  <url>


In [38]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

tknzr = TweetTokenizer()
nltk.download('stopwords')
stop = stopwords.words('english')
stop += ['<hashtag>', '<url>', '<allcaps>', '<number>', '<user>', '<repeat>', '<elong>', 'websummit']

df['text_processed'] = ""
index = 0

for tweet in df['text']:
    parts = tknzr.tokenize(tweet_processor.preprocess(tweet))
    clean = [i for i in parts if i not in stop]
    df['text_processed'][index] = clean
    index += 1

[nltk_data] Downloading package stopwords to /home/gpa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [52]:
df['text_processed'].size

77111

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

tweet_texts_processed = [str.join(" ", tweet_preprocessed ) for tweet_preprocessed in df['text_processed']]

vectorizer = TfidfVectorizer(min_df=4, max_features = 10000)
vz = vectorizer.fit_transform(tweet_texts_processed)

In [None]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=50, random_state=0)
svd_tfidf = svd.fit_transform(vz)

from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)

## MiniBatchKMeans

In [None]:
from sklearn.cluster import MiniBatchKMeans

num_clusters = 10
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1, 
                         init_size=1000, batch_size=1000, verbose=False, max_iter=1000)
kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)

tsne_kmeans = tsne_model.fit_transform(kmeans_distances)

In [None]:
import numpy as np

colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", 
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5", 
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f", 
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

plot_kmeans = bp.figure(plot_width=900, plot_height=700, title="Web Summit 2015 tweets (k-means)",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

plot_kmeans.scatter(x=tsne_kmeans[:,0], y=tsne_kmeans[:,1], 
                    color=colormap[kmeans_clusters][:10000])

hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips={"tweet": "@tweet (processed: \"@processed\" - cluster: @cluster)"}
show(plot_kmeans)

## KMeans

In [None]:
from sklearn.cluster import KMeans

num_clusters = 10
kmeans_model = KMeans(n_clusters=num_clusters, init='k-means++', n_init=1, 
                         init_size=1000, batch_size=1000, verbose=False, max_iter=1000)

kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)

tsne_kmeans = tsne_model.fit_transform(kmeans_distances)

In [None]:
import numpy as np

colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", 
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5", 
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f", 
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

plot_kmeans = bp.figure(plot_width=900, plot_height=700, title="Web Summit 2015 tweets (k-means)",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

plot_kmeans.scatter(x=tsne_kmeans[:,0], y=tsne_kmeans[:,1], 
                    color=colormap[kmeans_clusters][:10000])

hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips={"tweet": "@tweet (processed: \"@processed\" - cluster: @cluster)"}
show(plot_kmeans)

## LDA

In [None]:
n_topics = 15
n_iter = 2000
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(vz)

tsne_lda = tsne_model.fit_transform(X_topics)

In [None]:
plot_lda = bp.figure(plot_width=900, plot_height=700, title="Web Summit 2015 tweets (LDA)",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], 
                 color=colormap[lda_keys][:10000])

hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips={"tweet": "@tweet (processed: \"@processed\" - topic: @topic_key)"}
show(plot_lda)