In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pprint
import re
from textblob import TextBlob

# plotly imports
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff

import nltk
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
disaster = pd.read_csv('socialmedia-disaster-tweets-DFE.csv', encoding='ISO-8859-1')
disaster.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_one,choose_one:confidence,choose_one_gold,keyword,location,text,tweetid,userid
0,778243823,True,golden,156,,Relevant,1.0,Relevant,,,Just happened a terrible car crash,1.0,
1,778243824,True,golden,152,,Relevant,1.0,Relevant,,,Our Deeds are the Reason of this #earthquake M...,13.0,
2,778243825,True,golden,137,,Relevant,1.0,Relevant,,,"Heard about #earthquake is different cities, s...",14.0,
3,778243826,True,golden,136,,Relevant,0.9603,Relevant,,,"there is a forest fire at spot pond, geese are...",15.0,
4,778243827,True,golden,138,,Relevant,1.0,Relevant,,,Forest fire near La Ronge Sask. Canada,16.0,


In [3]:
disaster.shape

(10876, 13)

In [4]:
# Work on an explicit copy of the two columns of interest: later cells add
# columns to `df`, which on a plain column selection of `disaster` raises
# SettingWithCopyWarning and may not behave as intended.
df = disaster[['text', 'choose_one']].copy()

In [5]:
df.head()

Unnamed: 0,text,choose_one
0,Just happened a terrible car crash,Relevant
1,Our Deeds are the Reason of this #earthquake M...,Relevant
2,"Heard about #earthquake is different cities, s...",Relevant
3,"there is a forest fire at spot pond, geese are...",Relevant
4,Forest fire near La Ronge Sask. Canada,Relevant


In [6]:
# Glimpse of text
def glimpse_text(target, a):
    """Return the first `a` tweet texts whose `choose_one` label equals `target`.

    Reads the notebook-level `df` and returns a plain Python list.
    """
    matching_texts = df.loc[df['choose_one'] == target, 'text']
    return matching_texts.iloc[:a].tolist()

In [7]:
print(glimpse_text('Relevant', 5))

['Just happened a terrible car crash', 'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all', 'Heard about #earthquake is different cities, stay safe everyone.', 'there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all', 'Forest fire near La Ronge Sask. Canada']


In [8]:
print(glimpse_text('Not Relevant', 5))

["They'd probably still show more life than Arsenal did yesterday, eh? EH?", 'Hey! How are you?', "What's up man?", 'I love fruits', 'Summer is lovely']


In [9]:
print(glimpse_text("Can't Decide", 5))

['Why is there an ambulance right outside my work', '@MisfitRarity misfit got bombed', '@RockBottomRadFM Is one of the challenges on Tough Enough rescuing people from burning buildings?', '? High Skies - Burning Buildings ? http://t.co/uVq41i3Kx2 #nowplaying', 'What if we used drones to help firefighters lead people out of burning buildings/ help put the fire out?']


In [10]:
# Target variable distribution
# Labels and counts are both taken from one value_counts() result so each bar
# is labelled with its own class. (unique() returns first-appearance order while
# value_counts() sorts by count, so mixing them can mislabel the bars.)
target_counts = df.choose_one.value_counts()

data = [go.Bar(
        x = target_counts.index,
        y = target_counts.values,
        marker = dict(colorscale = 'Jet', color = target_counts.values),
        text = 'Target Count'
)]

layout = go.Layout(
    title = 'Target Variable Distribution'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

In [11]:
# remove pattern function
def remove_pattern(input_text, pattern):
    """Return `input_text` with every match of the regex `pattern` removed.

    Parameters
    ----------
    input_text : str
        Text to clean.
    pattern : str
        Regular expression; each non-overlapping match is deleted.

    Returns
    -------
    str
        The text with all matches replaced by the empty string.
    """
    # Substitute with the pattern itself rather than re-using each matched
    # string as a new regex: a short match such as '@a' would otherwise also
    # delete the start of a longer one such as '@ab', and matches containing
    # regex metacharacters would be misinterpreted.
    return re.sub(pattern, '', input_text)

In [12]:
# Remove every @mention (an '@' followed by any run of word characters).
# The pattern is a raw string so that `\w` reaches the regex engine without
# triggering Python's invalid-escape-sequence warning.
df['tidy_tweet'] = np.vectorize(remove_pattern)(df['text'], r'@[\w]*')

In [13]:
# Removing special characters, punctuations, numbers, links
def clean_tweet(text_field, frame=None):
    """Normalise the text column `text_field` in place and return the frame.

    Steps, in order:
      1. drop tokens starting with ``http``/``https`` and any leftover bare
         ``http``/``https``;
      2. replace every character other than letters, digits, ``@``, ``'`` and
         ``"`` with a space;
      3. spell out ``@`` as ``at``;
      4. lower-case the result.

    Parameters
    ----------
    text_field : str
        Name of the string column to clean.
    frame : pandas.DataFrame, optional
        Frame to operate on. Defaults to the notebook-level `df`, which is the
        frame the original one-argument call always used.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with `text_field` overwritten by the cleaned text.
    """
    target = df if frame is None else frame

    # regex=True is passed explicitly: from pandas 2.0 the default is
    # regex=False, which would make these patterns match literally.
    cleaned = target[text_field]
    for url_pattern in (r'http\S+', r'http', r'https\S+', r'https'):
        cleaned = cleaned.str.replace(url_pattern, '', regex=True)

    cleaned = cleaned.str.replace("[^a-zA-Z0-9@\'\"]", " ", regex=True)
    cleaned = cleaned.str.replace('@', 'at', regex=False)

    target[text_field] = cleaned.str.lower()
    return target

In [14]:
df = clean_tweet('tidy_tweet')

In [15]:
# Per-tweet features derived from the cleaned text:
#   polarity   - TextBlob sentiment polarity of the tweet
#   review_len - number of characters
#   word_count - number of whitespace-separated tokens
df['polarity'] = df['tidy_tweet'].apply(lambda tweet: TextBlob(tweet).sentiment.polarity)
df['review_len'] = df['tidy_tweet'].astype(str).map(len)
df['word_count'] = df['tidy_tweet'].map(lambda tweet: len(str(tweet).split()))

In [16]:
df.head()

Unnamed: 0,text,choose_one,tidy_tweet,polarity,review_len,word_count
0,Just happened a terrible car crash,Relevant,just happened a terrible car crash,-1.0,34,6
1,Our Deeds are the Reason of this #earthquake M...,Relevant,our deeds are the reason of this earthquake m...,0.0,69,13
2,"Heard about #earthquake is different cities, s...",Relevant,heard about earthquake is different cities s...,0.25,64,9
3,"there is a forest fire at spot pond, geese are...",Relevant,there is a forest fire at spot pond geese are...,0.0,96,19
4,Forest fire near La Ronge Sask. Canada,Relevant,forest fire near la ronge sask canada,0.1,38,7


In [17]:
# random tweets with highest positive sentiment polarity
# (random_state is fixed so a re-run shows the same sample)
print(df[df['polarity'] == 1.00]['tidy_tweet'].sample(5, random_state=42).values)

["don't let  wmata  metro derail your day  get a text every morn when you wake up with the best route to work    sms"
 'fleshgod apocalypse   blinded by fear  mini drum cover   this cover is awesome www'
 '   you just keep ur head in the sand john  the best place for it   lbr after 97 landslide  couldnt imagine situ now'
 'rt owenrbroadhurst rt juanmthompson  at this hour 70 yrs ago one of the greatest acts of mass murder in world hist    '
 "i would rather dwell in the land of famine and be in god's perfect will than to rest in the confines of egypt    "]


In [18]:
# random tweets with neutral sentiment polarity
# (random_state is fixed so a re-run shows the same sample)
print(df[df['polarity'] == 0.00]['tidy_tweet'].sample(5, random_state=42).values)

['japan marks 70th anniversary of hiroshima atomic bombing  bells tolled in hiroshima on thursday as japan marke    '
 'if you find your patio table umbrella and chairs flipped over and suspect foul play  instead of windstorm  you may be a suspense writer '
 'meltdown' '40 displaced by ocean township apartment fire  newyork   '
 'we walk the plank of a sinking ship']


In [19]:
# random tweets with highest negative sentiment polarity
# (random_state is fixed so a re-run shows the same sample)
print(df[df['polarity'] == -1.00]['tidy_tweet'].sample(5, random_state=42).values)

['  worst feel in ds when u panicking during  boss fight and you chug two estus and the boss kills you while u drink'
 ' horrible  accident man died in wings airplane  29 07 2015   watchthevideo '
 'ashes 2015  australia   s collapse at trent bridge among worst in history  england bundled out australia for 60     '
 'supermarket chains recording worst  injury rates among asx100 companies  safety '
 'ashes 2015  australia   s collapse at trent bridge among worst in history  england bundled out australia for 60     ']


## Distribution of polarity score

In [20]:
# Histogram of the sentiment polarity score across all tweets.
layout = go.Layout(
    title='Polarity Score Distribution',
    xaxis={'title': 'Polarity Score'},
    yaxis={'title': 'Count'},
)
data = [
    go.Histogram(
        x=df['polarity'],
        marker={'color': 'rgba(12, 98, 12, 0.6)'},
    )
]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='basic-hist')

- The vast majority of tweets have a polarity score of 0.

## Review length distribution

In [21]:
# Histogram of cleaned-tweet length in characters (the `review_len` column).
length_marker = {'color': 'rgba(52, 98, 52, 0.6)'}
data = [go.Histogram(x=df['review_len'], marker=length_marker)]

layout = go.Layout(
    title='Review Length Distribution',
    xaxis={'title': 'Review Length'},
    yaxis={'title': 'Count'},
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='basic-hist')

## Word Count Distribution

In [22]:
# Histogram of the number of words per cleaned tweet (the `word_count` column).
data = [go.Histogram(
        x = df.word_count,
        marker=dict(color='rgba(52, 48, 10, 0.6)')
)]

layout = go.Layout(
    # The plotted quantity is words per tweet, not the length of individual
    # words, so the title names the word count (matching the x-axis label).
    title='Word Count Distribution',
    xaxis=dict(title='Word Count'),
    yaxis=dict(title='Count')
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-hist')

## Distribution of sentiment polarity score by tweet type

In [113]:
# Polarity scores split by annotation label, drawn as overlaid histograms.
x0 = df.loc[df['choose_one'] == 'Relevant', 'polarity']
x1 = df.loc[df['choose_one'] == 'Not Relevant', 'polarity']
x2 = df.loc[df['choose_one'] == "Can't Decide", 'polarity']

trace1 = go.Histogram(x=x0, name='Relevant', opacity=1.00)
trace2 = go.Histogram(x=x1, name='Not Relevant', opacity=0.75)
trace3 = go.Histogram(x=x2, name="Can't Decide", opacity=0.75)

layout = go.Layout(
    title='Distribution of sentiment polarity of tweets based on their disaster relevancy',
    barmode='overlay',
)
data = [trace1, trace2, trace3]
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='overlay-hist')

## Distribution of review length based on type of tweet

In [34]:
# Tweet length (characters) split by annotation label, drawn as overlaid histograms.
x0 = df[df['choose_one'] == 'Relevant']['review_len']
x1 = df[df['choose_one'] == 'Not Relevant']['review_len']
x2 = df[df['choose_one'] == "Can't Decide"]['review_len']

trace1 = go.Histogram(
    x=x0,
    name='Relevant',  # legend label; previously misspelled as 'Relvant'
    opacity=0.75
)

trace2 = go.Histogram(
    x=x1,
    name='Not Relevant',
    opacity=0.75
)

trace3 = go.Histogram(
    x=x2,
    name="Can't Decide",
    opacity=0.75
)

data=[trace1, trace2, trace3]

layout = go.Layout(
    barmode='overlay',
    title='Distribution of review length of tweets based on their disaster relevancy'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='overlay-hist')

## Distribution of word counts based on tweet type

In [35]:
# Words per tweet split by annotation label, drawn as overlaid histograms.
x0 = df[df['choose_one'] == 'Relevant']['word_count']
x1 = df[df['choose_one'] == 'Not Relevant']['word_count']
x2 = df[df['choose_one'] == "Can't Decide"]['word_count']

trace1 = go.Histogram(
    x=x0,
    name='Relevant',  # legend label; previously misspelled as 'Relvant'
    opacity=0.75
)

trace2 = go.Histogram(
    x=x1,
    name='Not Relevant',
    opacity=0.75
)

trace3 = go.Histogram(
    x=x2,
    name="Can't Decide",
    opacity=0.75
)

data=[trace1, trace2, trace3]

layout = go.Layout(
    barmode='overlay',
    title='Distribution of word counts of tweets based on their disaster relevancy'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='overlay-hist')

## Top unigrams without stopword removal

In [108]:
def get_top_n_words(corpus, n, ngram_range=(1, 1), stop_words=None):
    """Return the `n` most frequent terms in `corpus` with their total counts.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count terms in.
    n : int
        Number of top entries to return.
    ngram_range : tuple of (int, int), optional
        Passed to ``CountVectorizer``; the default ``(1, 1)`` counts unigrams,
        which is what the original two-argument call did.
    stop_words : str or list, optional
        Passed to ``CountVectorizer``; the default ``None`` keeps all words.

    Returns
    -------
    list of (str, count)
        Terms paired with their corpus-wide count, highest count first.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)
    # Learn the vocabulary and build the document-term matrix in one pass.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give each term's total count; the result is a 1 x vocab row.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [93]:
top_unigrams = get_top_n_words(df['tidy_tweet'], 20)
top_unigrams

[('the', 4621),
 ('to', 2837),
 ('in', 2811),
 ('of', 2610),
 ('and', 2024),
 ('is', 1392),
 ('you', 1288),
 ('for', 1246),
 ('on', 1239),
 ('it', 1142),
 ('my', 975),
 ('that', 854),
 ('with', 799),
 ('by', 777),
 ('at', 749),
 ('this', 705),
 ('from', 615),
 ('are', 603),
 ('be', 596),
 ('was', 554)]

In [106]:
# Bar chart of the top unigrams (stopwords still included).
df2 = pd.DataFrame(top_unigrams, columns=['word', 'freq'])

layout = go.Layout(title='Top unigrams without stopwords removal')
data = [
    go.Bar(
        x=df2.word,
        y=df2.freq,
        text='Count',
        marker={'colorscale': 'Blackbody', 'color': df2['freq']},
    )
]
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

## Top unigrams after stopwords removal

In [109]:
def get_top_n_words_sw_removed(corpus, n):
    """Return the `n` most frequent unigrams in `corpus`, skipping English stopwords.

    The result is a list of ``(term, count)`` tuples ordered by descending count.
    """
    vectorizer = CountVectorizer(stop_words='english')
    vectorizer.fit(corpus)
    # Summing the document-term matrix over rows yields one total per term.
    term_totals = vectorizer.transform(corpus).sum(axis=0)

    ranked = sorted(
        ((term, term_totals[0, col]) for term, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

In [110]:
top_unigrams = get_top_n_words_sw_removed(df['tidy_tweet'], 20)
top_unigrams

[('amp', 510),
 ('like', 493),
 ('just', 459),
 ('new', 330),
 ('news', 290),
 ('people', 284),
 ('don', 256),
 ('emergency', 229),
 ('video', 228),
 ('disaster', 220),
 ('police', 199),
 ('body', 178),
 ('suicide', 177),
 ('burning', 171),
 ('storm', 171),
 ('rt', 168),
 ('crash', 167),
 ('time', 165),
 ('attack', 164),
 ('got', 161)]

In [111]:
# Bar chart of the top unigrams once English stopwords are excluded.
df2 = pd.DataFrame(top_unigrams, columns=['word', 'freq'])

data = [go.Bar(x=df2.word, y=df2.freq, marker={'colorscale': 'Blackbody', 'color': df2['freq']}, text='Count')]
layout = go.Layout(title='Top unigrams after stopwords removal')

fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='basic-bar')

## Top bigrams before stopword removal

In [114]:
def get_top_n_bigrams(corpus, n):
    """Return the `n` most frequent bigrams in `corpus` as ``(bigram, count)`` tuples.

    Stopwords are kept. Entries are ordered from highest to lowest count.
    """
    counter = CountVectorizer(ngram_range=(2,2)).fit(corpus)
    totals = counter.transform(corpus).sum(axis=0)  # per-bigram totals over all documents

    pairs = []
    for bigram, column in counter.vocabulary_.items():
        pairs.append((bigram, totals[0, column]))

    pairs.sort(key=lambda item: item[1], reverse=True)
    return pairs[:n]

In [115]:
top_bigrams = get_top_n_bigrams(df['tidy_tweet'], 20)
top_bigrams

[('in the', 422),
 ('of the', 367),
 ('on the', 187),
 ('to the', 177),
 ('to be', 158),
 ('for the', 129),
 ('at the', 123),
 ('and the', 116),
 ('going to', 96),
 ('by the', 95),
 ('you re', 93),
 ('suicide bomber', 91),
 ('if you', 89),
 ('is the', 88),
 ('it was', 87),
 ('this is', 85),
 ('to get', 80),
 ('will be', 76),
 ('more than', 73),
 ('have been', 71)]

In [125]:
# Bar chart of the top bigrams (stopwords still included).
df2 = pd.DataFrame(top_bigrams, columns=['word', 'freq'])

layout = go.Layout(title='Top bigrams without stopwords removal')
data = [
    go.Bar(
        x=df2.word,
        y=df2.freq,
        text='Count',
        marker={'colorscale': 'Viridis', 'color': df2['freq']},
    )
]
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

## Top bigrams after stopword removal

In [128]:
def get_top_n_bigrams_sw_removed(corpus, n):
    """Return the `n` most frequent bigrams in `corpus`, skipping English stopwords.

    Each entry is a ``(bigram, count)`` tuple; the list is sorted by descending count.
    """
    vectorizer = CountVectorizer(ngram_range=(2,2), stop_words='english')
    vectorizer.fit(corpus)
    doc_term = vectorizer.transform(corpus)
    column_sums = doc_term.sum(axis=0)

    def total_of(entry):
        # Sort key: the count stored in the second slot of the tuple.
        return entry[1]

    scored = [
        (bigram, column_sums[0, position])
        for bigram, position in vectorizer.vocabulary_.items()
    ]
    return sorted(scored, key=total_of, reverse=True)[:n]

In [129]:
top_bigrams = get_top_n_bigrams_sw_removed(df['tidy_tweet'], 20)
top_bigrams

[('suicide bomber', 91),
 ('burning buildings', 81),
 ('liked video', 60),
 ('northern california', 57),
 ('cross body', 55),
 ('suicide bombing', 53),
 ('oil spill', 52),
 ('year old', 51),
 ('mass murder', 48),
 ('heat wave', 46),
 ('looks like', 46),
 ('california wildfire', 46),
 ('natural disaster', 45),
 ('mass murderer', 45),
 ('bomber detonated', 44),
 ('wild fires', 44),
 ('pkk suicide', 43),
 ('70 years', 42),
 ('16yr old', 42),
 ('detonated bomb', 42)]

In [130]:
# Bar chart of the top bigrams once English stopwords are excluded.
df2 = pd.DataFrame(top_bigrams, columns=['word', 'freq'])

data = [go.Bar(x=df2.word, y=df2.freq, marker={'colorscale': 'Viridis', 'color': df2['freq']}, text='Count')]
layout = go.Layout(title='Top bigrams after stopwords removal')

fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='basic-bar')

## Top Trigrams before stopword removal

In [132]:
def get_top_n_trigrams(corpus, n):
    """Return the `n` most frequent trigrams in `corpus` as ``(trigram, count)`` tuples.

    Stopwords are kept. Entries are ordered from highest to lowest count.
    """
    counter = CountVectorizer(ngram_range=(3,3)).fit(corpus)
    totals = counter.transform(corpus).sum(axis=0)  # per-trigram totals over all documents

    ranked = [(trigram, totals[0, column]) for trigram, column in counter.vocabulary_.items()]
    ranked.sort(key=lambda item: item[1], reverse=True)
    return ranked[:n]

In [133]:
top_trigrams = get_top_n_trigrams(df['tidy_tweet'], 20)
top_trigrams

[('pkk suicide bomber', 42),
 ('suicide bomber who', 42),
 ('bomber who detonated', 42),
 ('who detonated bomb', 42),
 ('detonated bomb in', 42),
 ('northern california wildfire', 41),
 ('16yr old pkk', 41),
 ('old pkk suicide', 41),
 ('more homes razed', 40),
 ('homes razed by', 40),
 ('the latest more', 39),
 ('latest more homes', 39),
 ('razed by northern', 39),
 ('by northern california', 38),
 ('china stock market', 36),
 ('stock market crash', 36),
 ('affected by the', 35),
 ('from mh370 malaysia', 34),
 ('more than 40', 34),
 ('than 40 families', 34)]

In [138]:
# Bar chart of the top trigrams (stopwords still included).
df2 = pd.DataFrame(top_trigrams, columns=['word', 'freq'])

layout = go.Layout(title='Top trigrams without stopwords removal')
data = [
    go.Bar(
        x=df2.word,
        y=df2.freq,
        text='Count',
        marker={'colorscale': 'Reds', 'color': df2['freq']},
    )
]
fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

## Top trigrams after stopwords removal

In [139]:
def get_top_n_trigrams_sw_removed(corpus, n):
    """Return the `n` most frequent trigrams in `corpus`, skipping English stopwords.

    Each entry is a ``(trigram, count)`` tuple; the list is sorted by descending count.
    """
    vectorizer = CountVectorizer(ngram_range=(3,3), stop_words='english')
    vectorizer.fit(corpus)
    column_sums = vectorizer.transform(corpus).sum(axis=0)

    scored = []
    for trigram, position in vectorizer.vocabulary_.items():
        scored.append((trigram, column_sums[0, position]))

    return sorted(scored, key=lambda entry: entry[1], reverse=True)[:n]

In [140]:
top_trigrams = get_top_n_trigrams_sw_removed(df['tidy_tweet'], 20)
top_trigrams

[('suicide bomber detonated', 44),
 ('pkk suicide bomber', 42),
 ('bomber detonated bomb', 42),
 ('northern california wildfire', 41),
 ('16yr old pkk', 41),
 ('old pkk suicide', 41),
 ('latest homes razed', 39),
 ('homes razed northern', 39),
 ('razed northern california', 38),
 ('china stock market', 36),
 ('stock market crash', 36),
 ('40 families affected', 34),
 ('families affected fatal', 34),
 ('affected fatal outbreak', 34),
 ('watch airport swallowed', 34),
 ('airport swallowed sandstorm', 34),
 ('swallowed sandstorm minute', 34),
 ('detonated bomb turkey', 34),
 ('bomb turkey army', 34),
 ('turkey army trench', 34)]

In [141]:
# Bar chart of the top trigrams once English stopwords are excluded.
df2 = pd.DataFrame(top_trigrams, columns=['word', 'freq'])

data = [go.Bar(x=df2.word, y=df2.freq, marker={'colorscale': 'Reds', 'color': df2['freq']}, text='Count')]
layout = go.Layout(title='Top trigrams after stopwords removal')

fig = go.Figure(layout=layout, data=data)
py.iplot(fig, filename='basic-bar')