# Import Libraries

In [1]:
## Import Libraries and Dependencies ##
import pandas as pd
import numpy as np

# Defined Emoticons Symbols

In [2]:
# Happy Emoticons
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])
 
# Sad Emoticons
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])

# all emoticons (happy + sad)
emoticons = emoticons_happy.union(emoticons_sad)

# Data Cleansing

In [3]:
INPUT_PATH = "D:/OneDrive - National University of Singapore/NUS MTech KE/MTech KE - FYP - InsureSense/Kang Jiang/Phase 3/System Implementation/scripts/Data Mining & Machine Learning/dataset/disaster_phase_text_classification/model/"

In [4]:
OUTPUT_PATH = "D:/OneDrive - National University of Singapore/NUS MTech KE/MTech KE - FYP - InsureSense/Kang Jiang/Phase 3/System Implementation/scripts/Data Mining & Machine Learning/dataset/sentiment_analysis/"

In [5]:
df_Japan_Floods_full = pd.read_csv(INPUT_PATH + "df_Japan_floods_phase_labelled.csv", index_col=0, encoding = "ISO-8859-1")
# df_Typhoon_Jebi_full = pd.read_csv(INPUT_PATH + 'df_Typhoon_Jebi_phase_labelled.csv', index_col=0)
# df_Typhoon_Mangkhut_full = pd.read_csv(INPUT_PATH + 'df_Typhoon_Mangkhut_phase_labelled.csv', index_col=0)

In [6]:
df_disaster_full = df_Japan_Floods_full
df_disaster_full = df_disaster_full[:10]

In [7]:
df_disaster_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 13 columns):
tweet_id          10 non-null float64
user              10 non-null object
timestamp         10 non-null object
date              10 non-null object
events            10 non-null object
text              10 non-null object
processed_text    10 non-null object
likes             10 non-null int64
replies           10 non-null int64
retweets          10 non-null int64
url               10 non-null object
disaster_flag     10 non-null int64
disaster_phase    10 non-null int64
dtypes: float64(1), int64(5), object(7)
memory usage: 1.1+ KB


In [8]:
df_disaster_full.head()

Unnamed: 0,tweet_id,user,timestamp,date,events,text,processed_text,likes,replies,retweets,url,disaster_flag,disaster_phase
0,1.00536e+18,@jjwalsh,6/9/2018 7:47,6/9/2018,Japan Floods,Nice day by the river in #Kobe on this beautif...,nice day river kobe beautiful sunny weather sa...,26,3,3,/jjwalsh/status/1005355731665629184,1,3
1,1.00537e+18,@metalheadbazaar,6/9/2018 8:29,6/9/2018,Japan Floods,Marduk - To Tour Japan In November - Metal Sto...,marduk tour japan november metal storm,0,0,0,/metalheadbazaar/status/1005366203618156546,1,3
2,1.00537e+18,@wordwidetroll,6/9/2018 8:45,6/9/2018,Japan Floods,Yestarday storm give me http://www.irvinakatec...,yestarday storm give ad adsense money moneygur...,0,0,0,/wordwidetroll/status/1005370364757725184,1,3
3,1.00537e+18,@kazuotamakashi,6/9/2018 9:00,6/9/2018,Japan Floods,"Rain tomorrow, at Nara City, Japan! With a hig...",rain tomorrow nara city japan high 22c low 18c,0,0,0,/kazuotamakashi/status/1005373968650571778,1,1
4,1.00537e+18,@shukyudo_travel,6/9/2018 9:00,6/9/2018,Japan Floods,Rain tomorrow! With a high of 79F and a low of...,rain tomorrow high 79f low 70f japan osaka tra...,0,0,0,/shukyudo_travel/status/1005373973142622210,1,1


# Sentiment Analysis

## Sentiment Score Labelling - Method 1

In [9]:
## Import Libraries and Dependencies ##
from nltk.corpus import twitter_samples
from nltk.tokenize import TweetTokenizer

import nltk
from nltk import word_tokenize

from nltk.corpus import stopwords 
stopwords_english = stopwords.words('english')

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

import re
import string

In [10]:
# tweet cleaning function
def clean_tweets_1(tweet):

    wnlemma = nltk.WordNetLemmatizer()
    printable = set(string.printable)

    # remove non ASCII word
    tweet = ''.join(filter(lambda x: x in printable, tweet))
    
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    tweet = tweet.strip()
    
    tweet = re.sub(' +',' ', tweet)
    
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
 
    tweets_clean = []    
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
              word not in emoticons and # remove emoticons
                word not in string.punctuation): # remove punctuation
            lemma_word = wnlemma.lemmatize(word)
            tweets_clean.append(lemma_word)
            
#     tweets_clean = " ".join(tweets_clean)
    return tweets_clean

In [11]:
# feature extractor function
def bag_of_words(tweet):
    words = clean_tweets_1(tweet)
    words_dictionary = dict([word, True] for word in words)    
    return words_dictionary

## Sentiment model building

In [12]:
pos_tweets = twitter_samples.strings('positive_tweets.json')

In [13]:
neg_tweets = twitter_samples.strings('negative_tweets.json')

In [14]:
# positive tweets feature set
pos_tweets_set = []
for tweet in pos_tweets:
    pos_tweets_set.append((bag_of_words(tweet), 1))

In [15]:
# negative tweets feature set
neg_tweets_set = []
for tweet in neg_tweets:
    neg_tweets_set.append((bag_of_words(tweet), -1))

In [16]:
full_set = pos_tweets_set + neg_tweets_set

## Sentiment model classification

In [17]:
from nltk import classify
from nltk import NaiveBayesClassifier

In [18]:
# Train the sentiment model using open source training dataset
classifier = NaiveBayesClassifier.train(full_set)

In [19]:
# Classify disaster tweets into positive or negative tweets
df_disaster_full['sentiment_1'] = ""
for index, row in df_disaster_full.iterrows():
    
    tweet_text_set = bag_of_words(row['text'])
    df_disaster_full['sentiment_1'].iloc[index] = classifier.classify(tweet_text_set)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Sentiment Score Labelling - Method 2

In [20]:
# tweet cleaning function
def clean_tweets_2(tweet):
    
    wnlemma = nltk.WordNetLemmatizer()
    printable = set(string.printable)

    # remove non ASCII word
    tweet = ''.join(filter(lambda x: x in printable, tweet))
    
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    tweet = tweet.strip()
    
    tweet = re.sub(' +',' ', tweet)
    
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
 
    tweets_clean = []    
    for word in tweet_tokens:
        if (word not in stopwords_english and # remove stopwords
              word not in emoticons and # remove emoticons
                word not in string.punctuation): # remove punctuation
            lemma_word = wnlemma.lemmatize(word)
            tweets_clean.append(lemma_word)
            
    tweets_clean = " ".join(tweets_clean)
    return tweets_clean

## Sentiment polarity score labelling

In [21]:
df_disaster_full['processed_text'] = ""
for index, row in df_disaster_full.iterrows():
    df_disaster_full['processed_text'].iloc[index] = clean_tweets_2(row['text'])
df_disaster_full = df_disaster_full[['tweet_id','user','timestamp','date','events','text','processed_text','likes','replies','retweets','url','disaster_flag','disaster_phase','sentiment_1']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [22]:
# Import the library for sentiment analysis
from textblob import TextBlob

# For each of the comment, calculate the sentiment polarity and store in a new collumn as sentiment polarity score
df_disaster_full['sentiment_2'] = ""
for index, row in df_disaster_full.iterrows():
    blob = TextBlob(df_disaster_full['processed_text'].iloc[index])
    for sentence in blob.sentences:
        df_disaster_full['sentiment_2'].iloc[index] = sentence.sentiment.polarity

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


## Sentiment Score Labelling -  Method 3

In [23]:
import json, requests
class StanfordCoreNLP:
    """
    Modified from https://github.com/smilli/py-corenlp
    """
 
    def __init__(self, server_url):
        # TODO: Error handling? More checking on the url?
        if server_url[-1] == '/':
            server_url = server_url[:-1]
        self.server_url = server_url
 
    def annotate(self, text, properties=None):
        assert isinstance(text, str)
        if properties is None:
            properties = {}
        else:
            assert isinstance(properties, dict)
 
        # Checks that the Stanford CoreNLP server is started.
        try:
            requests.get(self.server_url)
        except requests.exceptions.ConnectionError:
            raise Exception('Check whether you have started the CoreNLP server e.g.\n'
                            '$ cd <path_to_core_nlp_folder>/stanford-corenlp-full-2016-10-31/ \n'
                            '$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port <port> -timeout <timeout_in_ms>')
 
        data = text.encode()
        r = requests.post(
            self.server_url, params={
                'properties': str(properties)
            }, data=data, headers={'Connection': 'close'})
        output = r.text
        if ('outputFormat' in properties
            and properties['outputFormat'] == 'json'):
            try:
                output = json.loads(output, encoding='utf-8', strict=True)
            except:
                pass
        return output

    
def sentiment_analysis_on_sentence(sentence):
    # The StanfordCoreNLP server is running on http://127.0.0.1:9000
    nlp = StanfordCoreNLP('http://127.0.0.1:9000')
    # Json response of all the annotations
    output = nlp.annotate(sentence, properties={
        "annotators": "tokenize,ssplit,parse,sentiment",
        "outputFormat": "json",
        # Only split the sentence at End Of Line. We assume that this method only takes in one single sentence.
        "ssplit.eolonly": "true",
        # Setting enforceRequirements to skip some annotators and make the process faster
        "enforceRequirements": "false"
    })
    # Only care about the result of the first sentence because we assume we only annotate a single sentence in this method.
    return int(output['sentences'][0]['sentimentValue'])

In [24]:
# df_disaster_full['sentiment_3'] = ""

# for index, row in df_disaster_full.iterrows():
#     df_disaster_full['processed_text'].iloc[index] = clean_tweets_2(row['text'])
#     df_disaster_full['sentiment_3'].iloc[index] = sentiment_analysis_on_sentence(row['processed_text'])    

## Ensemble Sentiment Model

In [25]:
df_disaster_full.head()

Unnamed: 0,tweet_id,user,timestamp,date,events,text,processed_text,likes,replies,retweets,url,disaster_flag,disaster_phase,sentiment_1,sentiment_2
0,1.00536e+18,@jjwalsh,6/9/2018 7:47,6/9/2018,Japan Floods,Nice day by the river in #Kobe on this beautif...,nice day river kobe beautiful sunny weather sa...,26,3,3,/jjwalsh/status/1005355731665629184,1,3,1,0.683333
1,1.00537e+18,@metalheadbazaar,6/9/2018 8:29,6/9/2018,Japan Floods,Marduk - To Tour Japan In November - Metal Sto...,marduk tour japan november metal storm,0,0,0,/metalheadbazaar/status/1005366203618156546,1,3,-1,0.0
2,1.00537e+18,@wordwidetroll,6/9/2018 8:45,6/9/2018,Japan Floods,Yestarday storm give me http://www.irvinakatec...,yestarday storm give ad adsense money moneygur...,0,0,0,/wordwidetroll/status/1005370364757725184,1,3,-1,0.0
3,1.00537e+18,@kazuotamakashi,6/9/2018 9:00,6/9/2018,Japan Floods,"Rain tomorrow, at Nara City, Japan! With a hig...",rain tomorrow nara city japan high 22c low 18c,0,0,0,/kazuotamakashi/status/1005373968650571778,1,1,-1,0.08
4,1.00537e+18,@shukyudo_travel,6/9/2018 9:00,6/9/2018,Japan Floods,Rain tomorrow! With a high of 79F and a low of...,rain tomorrow high 79f low 70f japan osaka tra...,0,0,0,/shukyudo_travel/status/1005373973142622210,1,1,-1,0.08


In [26]:
# Extract columns
df_disaster_full = df_disaster_full[['tweet_id','user','timestamp','date','events','text', 'processed_text','likes','replies','retweets','url','disaster_flag','disaster_phase','sentiment_1','sentiment_2']]
# df_disaster_full = df_disaster_full[['tweet_id','user','timestamp','date','events','text', 'processed_text','likes','replies','retweets','url','disaster_flag','disaster_phase','sentiment_1','sentiment_2','sentiment_3']]

In [27]:
# Convert sentiment score from method 1 and method 2 to float
df_disaster_full['sentiment_1'] = df_disaster_full['sentiment_1'].astype(float)
df_disaster_full['sentiment_2'] = df_disaster_full['sentiment_2'].astype(float)
# df_disaster_full['sentiment_3'] = df_disaster_full['sentiment_3'].astype(float)

In [28]:
# Normalize sentiment score from method 1 and method 2 to [-1,1]
df_disaster_full['sentiment_1_norm'] = \
    2*((df_disaster_full['sentiment_1']-df_disaster_full['sentiment_1'].min())/(df_disaster_full['sentiment_1'].max()-df_disaster_full['sentiment_1'].min())) - 1

df_disaster_full['sentiment_2_norm'] = \
    2*((df_disaster_full['sentiment_2']-df_disaster_full['sentiment_2'].min())/(df_disaster_full['sentiment_2'].max()-df_disaster_full['sentiment_2'].min())) - 1

# df_disaster_full['sentiment_3_norm'] = \
#     2*((df_disaster_full['sentiment_3']-df_disaster_full['sentiment_3'].min())/(df_disaster_full['sentiment_3'].max()-df_disaster_full['sentiment_3'].min())) - 1

In [29]:
# Calculate final sentiment score by taking teh average of sentiment score from method 1 and 2
df_disaster_full['sentiment_final'] = (df_disaster_full['sentiment_1_norm'] + df_disaster_full['sentiment_2_norm'])/2
# df_disaster_full['sentiment_final'] = (df_disaster_full['sentiment_1_norm'] + df_disaster_full['sentiment_2_norm'] + df_disaster_full['sentiment_3_norm'])/3

In [30]:
df_disaster_full.head()

Unnamed: 0,tweet_id,user,timestamp,date,events,text,processed_text,likes,replies,retweets,url,disaster_flag,disaster_phase,sentiment_1,sentiment_2,sentiment_1_norm,sentiment_2_norm,sentiment_final
0,1.00536e+18,@jjwalsh,6/9/2018 7:47,6/9/2018,Japan Floods,Nice day by the river in #Kobe on this beautif...,nice day river kobe beautiful sunny weather sa...,26,3,3,/jjwalsh/status/1005355731665629184,1,3,1.0,0.683333,1.0,1.0,1.0
1,1.00537e+18,@metalheadbazaar,6/9/2018 8:29,6/9/2018,Japan Floods,Marduk - To Tour Japan In November - Metal Sto...,marduk tour japan november metal storm,0,0,0,/metalheadbazaar/status/1005366203618156546,1,3,-1.0,0.0,-1.0,-1.0,-1.0
2,1.00537e+18,@wordwidetroll,6/9/2018 8:45,6/9/2018,Japan Floods,Yestarday storm give me http://www.irvinakatec...,yestarday storm give ad adsense money moneygur...,0,0,0,/wordwidetroll/status/1005370364757725184,1,3,-1.0,0.0,-1.0,-1.0,-1.0
3,1.00537e+18,@kazuotamakashi,6/9/2018 9:00,6/9/2018,Japan Floods,"Rain tomorrow, at Nara City, Japan! With a hig...",rain tomorrow nara city japan high 22c low 18c,0,0,0,/kazuotamakashi/status/1005373968650571778,1,1,-1.0,0.08,-1.0,-0.765854,-0.882927
4,1.00537e+18,@shukyudo_travel,6/9/2018 9:00,6/9/2018,Japan Floods,Rain tomorrow! With a high of 79F and a low of...,rain tomorrow high 79f low 70f japan osaka tra...,0,0,0,/shukyudo_travel/status/1005373973142622210,1,1,-1.0,0.08,-1.0,-0.765854,-0.882927


In [31]:
# Remove unused column
df_disaster_full = df_disaster_full.drop(columns=['sentiment_1', 'sentiment_2', 'sentiment_1_norm', 'sentiment_2_norm'])
# df_disaster_full = df_disaster_full.drop(columns=['sentiment_1', 'sentiment_2', 'sentiment_3', 'sentiment_1_norm', 'sentiment_2_norm', 'sentiment_3_norm'])

# Data Storage

In [32]:
# Store the sentiment label dataset
# df_disaster_full.to_csv(OUTPUT_PATH + "df_Typhoon_Jebi_full_sentiment_labelled.csv")