In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pprint
import re
from textblob import TextBlob

# plotly imports
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import nltk

In [24]:
disaster = pd.read_csv('socialmedia-disaster-tweets-DFE.csv', encoding='ISO-8859-1')
disaster.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,choose_one,choose_one:confidence,choose_one_gold,keyword,location,text,tweetid,userid
0,778243823,True,golden,156,,Relevant,1.0,Relevant,,,Just happened a terrible car crash,1.0,
1,778243824,True,golden,152,,Relevant,1.0,Relevant,,,Our Deeds are the Reason of this #earthquake M...,13.0,
2,778243825,True,golden,137,,Relevant,1.0,Relevant,,,"Heard about #earthquake is different cities, s...",14.0,
3,778243826,True,golden,136,,Relevant,0.9603,Relevant,,,"there is a forest fire at spot pond, geese are...",15.0,
4,778243827,True,golden,138,,Relevant,1.0,Relevant,,,Forest fire near La Ronge Sask. Canada,16.0,


In [25]:
disaster.shape

(10876, 13)

In [26]:
df = disaster[['text', 'choose_one']]

In [27]:
df.head()

Unnamed: 0,text,choose_one
0,Just happened a terrible car crash,Relevant
1,Our Deeds are the Reason of this #earthquake M...,Relevant
2,"Heard about #earthquake is different cities, s...",Relevant
3,"there is a forest fire at spot pond, geese are...",Relevant
4,Forest fire near La Ronge Sask. Canada,Relevant


In [28]:
# Glimpse of text
def glimpse_text(target, a):
    res =  list(df[df['choose_one'] == target]['text'][:a])
    return res

In [29]:
print(glimpse_text('Relevant', 5))

['Just happened a terrible car crash', 'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all', 'Heard about #earthquake is different cities, stay safe everyone.', 'there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all', 'Forest fire near La Ronge Sask. Canada']


In [30]:
print(glimpse_text('Not Relevant', 5))

["They'd probably still show more life than Arsenal did yesterday, eh? EH?", 'Hey! How are you?', "What's up man?", 'I love fruits', 'Summer is lovely']


In [31]:
print(glimpse_text("Can't Decide", 5))

['Why is there an ambulance right outside my work', '@MisfitRarity misfit got bombed', '@RockBottomRadFM Is one of the challenges on Tough Enough rescuing people from burning buildings?', '? High Skies - Burning Buildings ? http://t.co/uVq41i3Kx2 #nowplaying', 'What if we used drones to help firefighters lead people out of burning buildings/ help put the fire out?']


In [32]:
# Target variable distribution
data = [go.Bar(
        x = df.choose_one.unique(),
        y = df.choose_one.value_counts().values,
        marker = dict(colorscale = 'Jet', color = df.choose_one.value_counts().values),
        text = 'Target Count'
)]

layout = go.Layout(
    title = 'Target Variable Distribution'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

In [33]:
# remove pattern function
def remove_pattern(input_text, pattern):
    r = re.findall(pattern=pattern, string=input_text)
    
    for i in r:
        input_text = re.sub(i, '', input_text)
    
    return input_text

In [34]:
# Removing all words starting with @
df['tidy_tweet'] = np.vectorize(remove_pattern)(df['text'], '@[\w]*')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [35]:
# Removing special characters, punctuations, numbers, links
def clean_tweet(text_field):
    df[text_field] = df[text_field].str.replace(r'http\S+', '')
    df[text_field] = df[text_field].str.replace(r'http', '')
    df[text_field] = df[text_field].str.replace(r'https\S+', '')
    df[text_field] = df[text_field].str.replace(r'https', '')
    df[text_field] = df[text_field].str.replace("[^a-zA-Z0-9@\'\"]", " ")
    df[text_field] = df[text_field].str.replace(r'@', 'at')
    df[text_field] = df[text_field].str.lower()
    return df

In [36]:
df = clean_tweet('tidy_tweet')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A 

In [37]:
# Sentiment polarity, length, word count
df['polarity'] = df['tidy_tweet'].map(lambda text: TextBlob(text).sentiment.polarity)
df['review_len'] = df['tidy_tweet'].astype(str).apply(len)
df['word_count'] = df['tidy_tweet'].apply(lambda x: len(str(x).split()))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [38]:
df.head()

Unnamed: 0,text,choose_one,tidy_tweet,polarity,review_len,word_count
0,Just happened a terrible car crash,Relevant,just happened a terrible car crash,-1.0,34,6
1,Our Deeds are the Reason of this #earthquake M...,Relevant,our deeds are the reason of this earthquake m...,0.0,69,13
2,"Heard about #earthquake is different cities, s...",Relevant,heard about earthquake is different cities s...,0.25,64,9
3,"there is a forest fire at spot pond, geese are...",Relevant,there is a forest fire at spot pond geese are...,0.0,96,19
4,Forest fire near La Ronge Sask. Canada,Relevant,forest fire near la ronge sask canada,0.1,38,7


In [45]:
# random tweets with highest positive sentiment polarity
print(df[df['polarity'] == 1.00]['tidy_tweet'].sample(5).values)

["check out this awesome profile on  ge's swimming  robot used in  nuclear reactors    innovation "
 "'cause you play me like a symphony play me till your fingers bleed  i'm your greatest masterpiece  you ruin me  "
 "  d  what  that's a tragedy  you have a wonderful nose"
 ' fatloss  diet how can you find the best ways to reduce weight    thunder  health'
 '  fettilootch is  slanglucci oppressions greatest danger coming soon the album  ']


In [46]:
# random tweets with neutral sentiment polarity
print(df[df['polarity'] == 0.00]['tidy_tweet'].sample(5).values)

['sinkhole swallows brooklyn intersection     video  '
 'alabama home quarantined over possible ebola case  officials say a quarantine is in place at       bluehand  pjnet'
 'valley building evacuated after fire   queensland'
 'do you feel like you are sinking in unhappiness  take the quiz   '
 'twister hits 4 villages in quezon province      quezon  news']


In [44]:
# random tweets with highest negative sentiment polarity
print(df[df['polarity'] == -1.00]['tidy_tweet'].sample(5).values)

['remembering  hiroshima 70 years on  hundreds of thousands of lives obliterated 70 years of pain devastating    '
 'omg horrible accident man died in wings of airplane  '
 'the worst part is seeing lightning and trying to guess when the thunder will crack'
 ' horrible  accident man died in wings airplane  29 07 2015   watchthevideo '
 'horrible accident  man died in wings of airplane   29 07 2015  ']
