In [1]:
import pandas as pd
import numpy as np
import time
# LDA, tSNE
from sklearn.manifold import TSNE
from gensim.models.ldamodel import LdaModel
# NLTK
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import re
# Visualization
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
%matplotlib inline
import seaborn as sns
# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, CustomJS, ColumnDataSource, Slider
from bokeh.layouts import column
from bokeh.palettes import all_palettes
output_notebook()

In [2]:
df = pd.read_csv('Tweets.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweets,likes,time
0,0,"Spinraza, a drug for spinal muscular atrophy, ...",0,2021-05-23 08:32:01
1,1,@SonuSood This child from Bikaner(Rajasthan) n...,0,2021-05-23 08:19:31
2,2,This child from Bikaner(Rajasthan) need your h...,0,2021-05-23 03:38:59
3,3,Alabama lawmakers: Get Spinal Muscular Atrophy...,0,2021-05-23 03:11:10
4,4,Children with spinal muscular atrophy may expe...,12,2021-05-23 02:09:01


In [4]:
df['time'] = df['time'].astype('datetime64[ns]')
df['time'] = df['time'].astype(np.int64) / int(1e6)

In [5]:
print(df.tweets[0][:500])

Spinraza, a drug for spinal muscular atrophy, has no money saved for retirement.


**Initial cleaning**

In [6]:
%%time
# Removing numerals:
df['tweet_tokens'] = df.tweets.map(lambda x: re.sub(r'\d+', '', x))
# Lower case:
df['tweet_tokens'] = df.tweet_tokens.map(lambda x: x.lower())
print(df['tweet_tokens'][0][:500])

spinraza, a drug for spinal muscular atrophy, has no money saved for retirement.
Wall time: 31.3 ms


**Tokenize**

In [7]:
%%time
df['tweet_tokens'] = df.tweet_tokens.map(lambda x: RegexpTokenizer(r'\w+').tokenize(x))
print(df['tweet_tokens'][0][:25])

['spinraza', 'a', 'drug', 'for', 'spinal', 'muscular', 'atrophy', 'has', 'no', 'money', 'saved', 'for', 'retirement']
Wall time: 0 ns


**Stemming**

In [8]:
%%time
snowball = SnowballStemmer("english")  
df['tweet_tokens'] = df.tweet_tokens.map(lambda x: [snowball.stem(token) for token in x])
print(df['tweet_tokens'][0][:25])

['spinraza', 'a', 'drug', 'for', 'spinal', 'muscular', 'atrophi', 'has', 'no', 'money', 'save', 'for', 'retir']
Wall time: 353 ms


**Stop Words**

In [9]:
%%time
stop_en = stopwords.words('english')
df['tweet_tokens'] = df.tweet_tokens.map(lambda x: [t for t in x if t not in stop_en]) 
print(df['tweet_tokens'][0][:25])

['spinraza', 'drug', 'spinal', 'muscular', 'atrophi', 'money', 'save', 'retir']
Wall time: 260 ms


**Final Cleaning**

In [10]:
%%time
df['tweet_tokens'] = df.tweet_tokens.map(lambda x: [t for t in x if len(t) > 1])
print(df['tweet_tokens'][0][:25])

['spinraza', 'drug', 'spinal', 'muscular', 'atrophi', 'money', 'save', 'retir']
Wall time: 4 ms


**LDA**

In [11]:
from gensim import corpora, models
np.random.seed(2017)
texts = df['tweet_tokens'].values
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
ldamodel = models.ldamodel.LdaModel(corpus, id2word=dictionary, 
                                    num_topics=8, passes=5, minimum_probability=0)

In [12]:
ldamodel.print_topics()

[(0,
  '0.020*"muscular" + 0.020*"atrophi" + 0.019*"spinal" + 0.018*"https" + 0.018*"co" + 0.013*"type" + 0.012*"research" + 0.011*"children" + 0.007*"avail" + 0.007*"drug"'),
 (1,
  '0.047*"https" + 0.046*"co" + 0.034*"atrophi" + 0.034*"spinal" + 0.034*"muscular" + 0.031*"help" + 0.020*"need" + 0.018*"rais" + 0.015*"cr" + 0.015*"fightssma"'),
 (2,
  '0.026*"muscular" + 0.026*"spinal" + 0.026*"atrophi" + 0.020*"https" + 0.020*"co" + 0.020*"suffer" + 0.019*"birth" + 0.019*"sinc" + 0.015*"bikan" + 0.015*"sma"'),
 (3,
  '0.023*"atrophi" + 0.023*"spinal" + 0.022*"muscular" + 0.020*"link" + 0.019*"zolgensma" + 0.018*"https" + 0.018*"co" + 0.016*"drug" + 0.013*"amp" + 0.013*"help"'),
 (4,
  '0.028*"muscular" + 0.028*"co" + 0.027*"https" + 0.027*"atrophi" + 0.026*"spinal" + 0.024*"old" + 0.021*"need" + 0.020*"save" + 0.019*"rare" + 0.017*"life"'),
 (5,
  '0.053*"co" + 0.052*"https" + 0.034*"spinal" + 0.034*"muscular" + 0.034*"atrophi" + 0.027*"help" + 0.023*"plz" + 0.019*"cr" + 0.017*"old" + 

In [13]:
hm = np.array([[y for (x,y) in ldamodel[corpus[i]]] for i in range(len(corpus))])

In [14]:
#And reduce dimensionality using t-SNE algorithm:
tsne = TSNE(random_state=2017, perplexity=30, early_exaggeration=120)
embedding = tsne.fit_transform(hm)
embedding = pd.DataFrame(embedding, columns=['x','y'])
embedding['hue'] = hm.argmax(axis=1)

**PLOTTING**

In [15]:
source = ColumnDataSource(
        data=dict(
            x = embedding.x,
            y = embedding.y,
            colors = [all_palettes['Set1'][8][i] for i in embedding.hue],
            title = df.tweets,
            day = df.time,
            alpha = [0.9] * embedding.shape[0],
            size = [7] * embedding.shape[0]
        )
    )
hover_tsne = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Title:</span>
            <span style="font-size: 12px">@title</span>
            <span style="font-size: 12px; font-weight: bold;">Day:</span>
            <span style="font-size: 12px">@time</span>
        </div>
    </div>
    """)
tools_tsne = [hover_tsne, 'pan', 'wheel_zoom', 'reset']
plot_tsne = figure(plot_width=700, plot_height=700, tools=tools_tsne, title='Papers')
plot_tsne.circle('x', 'y', size='size', fill_color='colors', 
                 alpha='alpha', line_alpha=0, line_width=0.01, source=source, name="df")

callback = CustomJS(args=dict(source=source), code=
    """
    var data = source.data;
    var f = cb_obj.value
    x = data['x']
    y = data['y']
    colors = data['colors']
    alpha = data['alpha']
    title = data['title']
    time = data['time']
    size = data['size']
    for (i = 0; i < x.length; i++) {
        if (time[i] <= f) {
            alpha[i] = 0.9
            size[i] = 7
        } else {
            alpha[i] = 0.05
            size[i] = 4
        }
    }
    source.change.emit();
    """)

slider = Slider(start=df.time.min(), end=df.time.max(), value=2016, step=1, title="Day of tweets")
slider.js_on_change('value', callback)

layout = column(slider, plot_tsne)
show(layout)