In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pprint
import re
from textblob import TextBlob

# plotly imports
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff

import nltk

In [2]:
disaster = pd.read_csv('socialmedia-disaster-tweets-DFE.csv', encoding='ISO-8859-1')
disaster.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_one,choose_one:confidence,choose_one_gold,keyword,location,text,tweetid,userid
0,778243823,True,golden,156,,Relevant,1.0,Relevant,,,Just happened a terrible car crash,1.0,
1,778243824,True,golden,152,,Relevant,1.0,Relevant,,,Our Deeds are the Reason of this #earthquake M...,13.0,
2,778243825,True,golden,137,,Relevant,1.0,Relevant,,,"Heard about #earthquake is different cities, s...",14.0,
3,778243826,True,golden,136,,Relevant,0.9603,Relevant,,,"there is a forest fire at spot pond, geese are...",15.0,
4,778243827,True,golden,138,,Relevant,1.0,Relevant,,,Forest fire near La Ronge Sask. Canada,16.0,


In [3]:
disaster.shape

(10876, 13)

In [4]:
df = disaster[['text', 'choose_one']]

In [5]:
df.head()

Unnamed: 0,text,choose_one
0,Just happened a terrible car crash,Relevant
1,Our Deeds are the Reason of this #earthquake M...,Relevant
2,"Heard about #earthquake is different cities, s...",Relevant
3,"there is a forest fire at spot pond, geese are...",Relevant
4,Forest fire near La Ronge Sask. Canada,Relevant


In [6]:
# Glimpse of text
def glimpse_text(target, a):
    res =  list(df[df['choose_one'] == target]['text'][:a])
    return res

In [7]:
print(glimpse_text('Relevant', 5))

['Just happened a terrible car crash', 'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all', 'Heard about #earthquake is different cities, stay safe everyone.', 'there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all', 'Forest fire near La Ronge Sask. Canada']


In [8]:
print(glimpse_text('Not Relevant', 5))

["They'd probably still show more life than Arsenal did yesterday, eh? EH?", 'Hey! How are you?', "What's up man?", 'I love fruits', 'Summer is lovely']


In [9]:
print(glimpse_text("Can't Decide", 5))

['Why is there an ambulance right outside my work', '@MisfitRarity misfit got bombed', '@RockBottomRadFM Is one of the challenges on Tough Enough rescuing people from burning buildings?', '? High Skies - Burning Buildings ? http://t.co/uVq41i3Kx2 #nowplaying', 'What if we used drones to help firefighters lead people out of burning buildings/ help put the fire out?']


In [10]:
# Target variable distribution
data = [go.Bar(
        x = df.choose_one.unique(),
        y = df.choose_one.value_counts().values,
        marker = dict(colorscale = 'Jet', color = df.choose_one.value_counts().values),
        text = 'Target Count'
)]

layout = go.Layout(
    title = 'Target Variable Distribution'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

In [11]:
# remove pattern function
def remove_pattern(input_text, pattern):
    r = re.findall(pattern=pattern, string=input_text)
    
    for i in r:
        input_text = re.sub(i, '', input_text)
    
    return input_text

In [12]:
# Removing all words starting with @
df['tidy_tweet'] = np.vectorize(remove_pattern)(df['text'], '@[\w]*')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [13]:
# Removing special characters, punctuations, numbers, links
def clean_tweet(text_field):
    df[text_field] = df[text_field].str.replace(r'http\S+', '')
    df[text_field] = df[text_field].str.replace(r'http', '')
    df[text_field] = df[text_field].str.replace(r'https\S+', '')
    df[text_field] = df[text_field].str.replace(r'https', '')
    df[text_field] = df[text_field].str.replace("[^a-zA-Z0-9@\'\"]", " ")
    df[text_field] = df[text_field].str.replace(r'@', 'at')
    df[text_field] = df[text_field].str.lower()
    return df

In [14]:
df = clean_tweet('tidy_tweet')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A 

In [15]:
# Sentiment polarity, length, word count
df['polarity'] = df['tidy_tweet'].map(lambda text: TextBlob(text).sentiment.polarity)
df['review_len'] = df['tidy_tweet'].astype(str).apply(len)
df['word_count'] = df['tidy_tweet'].apply(lambda x: len(str(x).split()))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [16]:
df.head()

Unnamed: 0,text,choose_one,tidy_tweet,polarity,review_len,word_count
0,Just happened a terrible car crash,Relevant,just happened a terrible car crash,-1.0,34,6
1,Our Deeds are the Reason of this #earthquake M...,Relevant,our deeds are the reason of this earthquake m...,0.0,69,13
2,"Heard about #earthquake is different cities, s...",Relevant,heard about earthquake is different cities s...,0.25,64,9
3,"there is a forest fire at spot pond, geese are...",Relevant,there is a forest fire at spot pond geese are...,0.0,96,19
4,Forest fire near La Ronge Sask. Canada,Relevant,forest fire near la ronge sask canada,0.1,38,7


In [17]:
# random tweets with highest positive sentiment polarity
print(df[df['polarity'] == 1.00]['tidy_tweet'].sample(5).values)

['you r a wonderful person         thomasistrash '
 " honey you ain't no angel  you like to scream these words as a weapon  well go ahead take your best shot woman  i wanna leave you it's"
 '  thank you again    ir truly wonderful'
 "i would rather dwell in the land of famine and be in god's perfect will than to rest in the confines of egypt    "
 '  and if your best evidence is the word of a guy who encouraged suicide bombing as a means to get to heaven     well  ']


In [18]:
# random tweets with neutral sentiment polarity
print(df[df['polarity'] == 0.00]['tidy_tweet'].sample(5).values)

["that exploded  amp  brought about the beginning of universe matches what's mentioned in the versethe heaven and earth  thus the universe "
 "just woke up to the loudest thunderstorm i've ever heard"
 'woman sneaks into airplane cockpit  terrorism not suspected  '
 'man goes into airplane engine accident   via '
 'the hobbit  the desolation of smaug   dvd 2014 2 disc set digital copy   ']


In [19]:
# random tweets with highest negative sentiment polarity
print(df[df['polarity'] == -1.00]['tidy_tweet'].sample(5).values)

['horrific attack on wife by muslim in italy  liveleak  news'
 'yay the evil is being destroyed  '
 'ashes 2015  australia   s collapse at trent bridge among worst in history  england bundled out australia for 60     '
 'remembering  hiroshima 70 years on  hundreds of thousands of lives obliterated 70 years of pain devastating    '
 'my license picture blown up is absolutely terrifying  ']


## Dist. plot of polarity score

In [67]:
data = [go.Histogram(
        x = df.polarity,
        marker=dict(color='rgba(12, 98, 12, 0.6)')
)]

layout = go.Layout(
    title='Polarity Score Distribution',
    xaxis=dict(title='Polarity Score'),
    yaxis=dict(title='Count')
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-hist')

- Vast majority of tweets have a polarity score of 0.

## Review length distribution

In [69]:
data = [go.Histogram(
        x = df.review_len,
        marker=dict(color='rgba(52, 98, 52, 0.6)')
)]

layout = go.Layout(
    title='Review Length Distribution',
    xaxis=dict(title='Review Length'),
    yaxis=dict(title='Count')
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-hist')

## Word Length Distribution

In [70]:
data = [go.Histogram(
        x = df.word_count,
        marker=dict(color='rgba(52, 48, 10, 0.6)')
)]

layout = go.Layout(
    title='Word Length Distribution',
    xaxis=dict(title='Word Count'),
    yaxis=dict(title='Count')
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-hist')

In [71]:
df.head()

Unnamed: 0,text,choose_one,tidy_tweet,polarity,review_len,word_count
0,Just happened a terrible car crash,Relevant,just happened a terrible car crash,-1.0,34,6
1,Our Deeds are the Reason of this #earthquake M...,Relevant,our deeds are the reason of this earthquake m...,0.0,69,13
2,"Heard about #earthquake is different cities, s...",Relevant,heard about earthquake is different cities s...,0.25,64,9
3,"there is a forest fire at spot pond, geese are...",Relevant,there is a forest fire at spot pond geese are...,0.0,96,19
4,Forest fire near La Ronge Sask. Canada,Relevant,forest fire near la ronge sask canada,0.1,38,7
