# Information Retrieval Final Project
## TweetQuake
Detection of earthquakes using Twitter data.
Twitter users are everywhere: compare the Twitter usage map with the earthquake frequency map below.
<img src="img/TwitterMap.png" alt="Drawing" style="width: 475px;float:left; margin-top: 30px" title = "Twitter Map"/>
<img src="img/EarthQuakeMap.png" alt="Drawing" style="width: 475px;float:right" title = "Earthquake Map"/>
<img src="img/EarthQuakeTweet.png" alt="Drawing" style="width: 600px;" title = "Earthquake Tweet Frequency"/>

In [0]:
%matplotlib inline
import re
import csv
import nltk
import operator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from collections import defaultdict
from IPython.core.interactiveshell import InteractiveShell
from sklearn.feature_extraction.text import TfidfVectorizer
InteractiveShell.ast_node_interactivity = "all"
from nltk.stem.porter import PorterStemmer
import warnings
warnings.filterwarnings('ignore')

def get_data(filename):
    """Read the CSV at ``filename`` and return it as a pandas DataFrame."""
    return pd.read_csv(filename)

def length(df):
    """Return the number of entries in the ``Tweet_Text`` column of ``df``."""
    tweet_column = df['Tweet_Text']
    return len(tweet_column)


# Load the labelled Bohol-earthquake tweets.
data = get_data('2013_Bohol_earthquake-tweets_labeled.csv')

# Collapse the annotation into a binary label: every row is 'related' unless
# its Informativeness value is exactly 'Not related'.
data['Info'] = 'related'
# Use .loc so the assignment lands on `data` itself; the chained form
# `data.Info[mask] = ...` may write to a temporary object instead.
data.loc[data.Informativeness == 'Not related', 'Info'] = 'not-related'

# Normalise the tweet text: interpret escape sequences, drop non-ASCII
# characters, trim surrounding whitespace and lowercase.
# NOTE: this relies on Python 2 byte strings (str.decode).
data['Tweet_Text'] = data['Tweet_Text'].apply(lambda x: x.decode('unicode_escape').\
                                          encode('ascii', 'ignore').\
                                          strip().lower())

# Take an explicit copy: the preprocessing helpers add columns to X1 in place,
# which must not be attempted on a selection of `data`.
X1 = data[['Tweet_ID','Tweet_Text','Info']].copy()
y1 = data.Info
porter_stemmer = PorterStemmer()

# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')
# term -> occurrence count, filled by update_wordmap()
wordmap = defaultdict(int)

def Stopword(tweet):
    """Drop stopwords from a tokenised tweet.

    ``tweet`` is an iterable of tokens. Tokens found in the module-level
    ``stopwords`` list are removed and the remaining ones are returned as a
    single space-separated string.
    """
    kept = [token for token in tweet if token not in stopwords]
    return ' '.join(kept)

def remove_stopword (X):
    """Add a ``nostopword`` column to ``X`` and return ``X``.

    Each ``Tweet_Text`` value is split on whitespace and passed through
    ``Stopword``; ``X`` is modified in place.
    """
    tokenised = X.Tweet_Text.str.split()
    X['nostopword'] = tokenised.apply(Stopword)
    return X

def Porter_Stem(tweet):
    """Stem every token of a tokenised tweet with the Porter stemmer.

    ``tweet`` is an iterable of tokens. Byte-string tokens are decoded as
    UTF-8 first; tokens that are already text are stemmed as they are.
    Returns the stems joined by single spaces.
    """
    stemmed_word = []
    for word in tweet:
        # Only byte strings need decoding. Calling .decode() on text that is
        # already decoded fails (non-ASCII unicode on Python 2, any str on
        # Python 3), so leave such tokens untouched.
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        stemmed_word.append(porter_stemmer.stem(word))
    return ' '.join(stemmed_word)
            
def stemming (X):
    """Add a ``stem`` column to ``X`` and return ``X``.

    Each ``nostopword`` value is split on whitespace and passed through
    ``Porter_Stem``; ``X`` is modified in place.
    """
    tokenised = X.nostopword.str.split()
    X['stem'] = tokenised.apply(Porter_Stem)
    return X

def update_wordmap(tweet):
    """Count every token of ``tweet`` in the module-level ``wordmap``."""
    for token in tweet:
        wordmap[token] = wordmap[token] + 1

def term_frequency_plot(X, top_n=10):
    """Plot a horizontal bar chart of the most frequent stemmed terms.

    The tokens of ``X.stem`` are counted into the module-level ``wordmap``
    and the ``top_n`` highest counts are plotted.

    NOTE: ``wordmap`` is shared module state and is not reset here, so
    calling this function again adds to the counts of earlier calls.

    Parameters
    ----------
    X : DataFrame with a ``stem`` column of space-separated stems.
    top_n : maximum number of terms to show (default 10).
    """
    X.stem.str.split().apply(update_wordmap)
    ranked = sorted(wordmap.items(), key=operator.itemgetter(1), reverse=True)

    # Slicing tolerates a vocabulary smaller than top_n, where indexing
    # ranked[i] for i in range(10) would raise IndexError.
    top_terms = ranked[:top_n]
    objects = [term for term, _ in top_terms]
    freq = [count for _, count in top_terms]

    x_pos = np.arange(len(objects))
    plt.barh(x_pos, freq, align='center', alpha=0.5)
    plt.xlabel("Frequency")
    plt.yticks(x_pos, objects)
    plt.title('Term Frequency usage')
    plt.show()
    
def Feature_extraction_A(X):
    """Add the statistical features ``total_words`` and ``position_query_word``.

    Both features are computed from the same whitespace tokenisation of
    ``X.stem``; ``X`` is modified in place and returned.

    * ``total_words`` - number of tokens in the stemmed tweet.
    * ``position_query_word`` - value returned by ``find_position`` for the
      token list.
    """
    # Tokenise once with split(): unlike split(' '), it yields an empty list
    # for an empty stem, so a tweet with no words is counted as 0, not 1.
    tokens = X.stem.str.split()
    X['total_words'] = tokens.apply(len)
    X['position_query_word'] = tokens.apply(find_position)
    return X

def Feature_extraction_BnC(X):
    """Append binary TF-IDF n-gram features (uni-, bi- and trigrams) to ``X``.

    The vectorizer is fitted on ``X['Tweet_Text']`` with at most 5000
    features; tokens are word runs or single punctuation characters.
    Returns a new DataFrame: ``X`` joined with one integer-named column per
    n-gram feature.
    """
    token_pattern = r'\w+|[%s]' % string.punctuation

    vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                                 token_pattern=token_pattern,
                                 binary=True,
                                 max_features=5000)

    # Vectorise the tweets in their original row order. Regrouping them into
    # positives-then-negatives would leave the matrix rows out of step with
    # the rows of X.
    word_vector = vectorizer.fit_transform(X['Tweet_Text'])

    # Reuse X's index so the join pairs every tweet with its own features,
    # even when X does not carry a default 0..n-1 index.
    features = pd.DataFrame(word_vector.toarray(), index=X.index)
    return X.join(features)

def find_position(val):
    """Return the index of the first token containing the stem of 'earthquake'.

    ``val`` is a sequence of string tokens; each is lowercased before the
    substring test. Returns -1 when no token matches.
    """
    # The query stem does not depend on the token, so compute it once.
    query_stem = porter_stemmer.stem('earthquake')
    for position, token in enumerate(val):
        if query_stem in token.lower():
            return position
    return -1


# Text preprocessing: strip stopwords, then stem what is left.
X1 = stemming(remove_stopword(X1))
term_frequency_plot(X1)
X1 = Feature_extraction_A(X1)
# X1 = Feature_extraction_BnC(X1)

y1 = data['Info'].values
# Drop the identifier, text and label columns so that only the extracted
# feature columns remain, then convert to a plain array.
X1 = X1.drop(['Tweet_ID', 'Tweet_Text', 'nostopword', 'stem', 'Info'], axis=1).values
# X

In [0]:
# Sanity check: show the container types of the feature matrix and the labels.
type(X1)
type(y1)

## SVM Model Implementation

In [0]:
from sklearn import preprocessing
from sklearn.svm import SVC
# GridSearchCV is imported from sklearn.model_selection, the same module that
# provides train_test_split; the old sklearn.grid_search module is gone from
# current scikit-learn releases.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import sklearn.metrics


# Split first, then fit the scaler on the training part only, so that no
# statistics of the held-out tweets leak into training.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.20, random_state=42)

scaler1 = preprocessing.StandardScaler().fit(X1_train)
X1_train = scaler1.transform(X1_train)
X1_test = scaler1.transform(X1_test)

In [0]:
# Log-spaced (base 2) candidate values for C and gamma.
C_range = np.logspace(-2, 10, 5, base=2)
gamma_range = np.logspace(-9, 1, 5, base=2)
# k_options = ['linear','poly','rbf']
params_grids = dict(gamma=gamma_range, C=C_range)
# Pass the kernel by keyword: SVC's first positional parameter is C, so
# SVC('rbf') would set C='rbf' rather than select the kernel.
grid1 = GridSearchCV(SVC(kernel='rbf'), param_grid=params_grids, cv=10)
grid1.fit(X1_train,y1_train)

# Call form of print works on both Python 2 and Python 3.
print(grid1.best_params_)
# grid1.grid_scores_

In [0]:
# Retrain an RBF-kernel SVM with the C and gamma chosen by the grid search
# (the only two keys in grid1.best_params_) and predict the held-out tweets.
clf1 = SVC(kernel='rbf', **grid1.best_params_)

clf1.fit(X1_train, y1_train)
y1_pred = clf1.predict(X1_test)

# Held-out metrics; 'related' is treated as the positive class.
sklearn.metrics.accuracy_score(y1_test, y1_pred)
sklearn.metrics.precision_score(y1_test, y1_pred, pos_label='related')
sklearn.metrics.recall_score(y1_test, y1_pred, pos_label='related')
sklearn.metrics.f1_score(y1_test, y1_pred, pos_label='related')

## Naive-Bayes SVM Implementation

The Naive-Bayes SVM (NBSVM) is a simple but novel SVM variant using NB log-count ratios as feature values and is supposed to be a robust performer. This model is an interpolation between MNB and SVM, which can be seen as a form of regularization: trust NB unless the SVM is very confident. 

The concept of NBSVM has been obtained from this research paper : Wang, Sida, and Christopher D. Manning. "Baselines and bigrams: Simple, good sentiment and topic classification." Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics: Short Papers-Volume 2. Association for Computational Linguistics, 2012.

The class implementation of the NBSVM has been obtained from this repository: https://github.com/Joshua-Chin/nbsvm.git

For the first classification, where we separate earthquake-relevant tweets from earthquake-irrelevant tweets, we use the NBSVM classifier because it gives us a higher accuracy than the SVM classifier discussed above. 

In [0]:
from scipy.sparse import spmatrix, coo_matrix
from sklearn.base import BaseEstimator
from sklearn.linear_model.base import LinearClassifierMixin, SparseCoefMixin
from sklearn.svm import LinearSVC

class NBSVM(BaseEstimator, LinearClassifierMixin, SparseCoefMixin):
    """Naive-Bayes SVM: a linear SVM trained on NB log-count-ratio features.

    Parameters
    ----------
    alpha : smoothing constant added to every per-class feature sum.
    C : regularisation parameter passed to ``LinearSVC``.
    beta : interpolation weight given to the SVM coefficients; the remaining
        ``1 - beta`` goes to the log-count ratios scaled by the mean SVM
        coefficient magnitude.
    fit_intercept : passed to ``LinearSVC``.
    """

    def __init__(self, alpha=1, C=1, beta=0.25, fit_intercept=False):
        self.alpha = alpha
        self.C = C
        self.beta = beta
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """Fit the model on ``X`` / ``y`` and return ``self``.

        With exactly two distinct labels a single binary model is fitted on
        ``y`` as given; otherwise one binary model is fitted per class
        (that class against the rest) and the results are stacked.
        """
        self.classes_ = np.unique(y)
        if len(self.classes_) == 2:
            # NOTE: y is passed through unchanged, so the labels must already
            # be 0/1 for the `y == 1` / `y == 0` masks in _fit_binary.
            coef_, intercept_ = self._fit_binary(X, y)
            self.coef_ = coef_
            self.intercept_ = intercept_
        else:
            coef_, intercept_ = zip(*[
                self._fit_binary(X, y == class_)
                for class_ in self.classes_
            ])
            self.coef_ = np.concatenate(coef_)
            self.intercept_ = np.array(intercept_).flatten()
        return self

    def _fit_binary(self, X, y):
        """Fit one binary NBSVM and return ``(coef_, intercept_)``.

        ``y`` selects the positive rows with ``y == 1`` and the negative rows
        with ``y == 0`` (a boolean mask works as well).
        """
        # Smoothed per-feature sums for the positive (p) and negative (q) rows.
        p = np.asarray(self.alpha + X[y == 1].sum(axis=0)).flatten()
        q = np.asarray(self.alpha + X[y == 0].sum(axis=0)).flatten()
        # The builtin float is used as dtype: the np.float alias no longer
        # exists in current NumPy releases.
        p = np.asarray(p, dtype=float)
        q = np.asarray(q, dtype=float)
        # r: log-count ratio per feature; b: log ratio of the class sizes.
        r = np.log(p/np.abs(p).sum()) - np.log(q/np.abs(q).sum())
        b = np.log((y == 1).sum()) - np.log((y == 0).sum())

        # Scale every feature column by its log-count ratio.
        if isinstance(X, spmatrix):
            # For sparse input, multiply by a diagonal matrix holding r.
            indices = np.arange(len(r))
            r_sparse = coo_matrix(
                (r, (indices, indices)),
                shape=(len(r), len(r))
            )
            X_scaled = X * r_sparse
        else:
            X_scaled = X * r

        lsvc = LinearSVC(
            C=self.C,
            fit_intercept=self.fit_intercept,
            max_iter=10000
        ).fit(X_scaled, y)

        mean_mag =  np.abs(lsvc.coef_).mean()

        # Interpolate between the NB term and the SVM term with weight beta.
        coef_ = (1 - self.beta) * mean_mag * r + \
                self.beta * (r * lsvc.coef_)

        intercept_ = (1 - self.beta) * mean_mag * b + \
                     self.beta * lsvc.intercept_

        return coef_, intercept_

Here we are dividing the data into a train-test split of 80-20. We are building our vocabulary based on the training data using scikit-learn's TfidfVectorizer. We are using unigram, bigram and trigram features and the Tf-Idf weighting scheme on the word vector. We are setting the tf term in tf-idf to be binary, because this increased our accuracy.

In [0]:
import string
from sklearn.feature_extraction.text import TfidfVectorizer

print("Vectorizing Training Text")

X2 = data[['Tweet_ID','Tweet_Text','Info']].values.tolist()

# Partition the tweet texts by label: 'related' tweets are the positives,
# everything else the negatives.
data_pos = [row[1] for row in X2 if row[2] == 'related']
data_neg = [row[1] for row in X2 if row[2] != 'related']

# The first 1100 tweets of each class are used for training, the rest for testing.
train_pos, test_pos = data_pos[:1100], data_pos[1100:]
train_neg, test_neg = data_neg[:1100], data_neg[1100:]

# A token is a run of word characters or a single punctuation character.
token_pattern = r'\w+|[%s]' % string.punctuation

vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    token_pattern=token_pattern,
    binary=True,
)

X2_train = vectorizer.fit_transform(train_pos + train_neg)
y2_train = np.array([1] * len(train_pos) + [0] * len(train_neg))

print("Vocabulary Size: %s" % len(vectorizer.vocabulary_))
print("Vectorizing Testing Text")

X2_test = vectorizer.transform(test_pos + test_neg)
y2_test = np.array([1] * len(test_pos) + [0] * len(test_neg))

print("Fitting Model")

mnbsvm = NBSVM()
mnbsvm.fit(X2_train, y2_train)
print('Test Accuracy: %s' % mnbsvm.score(X2_test, y2_test))

## Classification Part 2

In [0]:
# Load the tweets used for the second classifier; the first row of the
# comma-separated file is the header.
tweet_sensing_data = pd.read_csv('Earthquake_sensing_tweets.csv', header=0, sep=',')

In [0]:
def find_url(tweet):
    """Return 1 if ``tweet`` contains an http:// or https:// URL, else 0.

    At least one URL character must follow the ``://`` for a match.
    """
    # Raw string: '\(' and '\)' are not valid escapes in a normal literal.
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # search() stops at the first match; only presence matters here.
    return 1 if re.search(url_pattern, tweet) else 0
    
def has_magnitude(tweet):
    """Return 1 if ``tweet`` contains a number with exactly one decimal digit.

    Such numbers (e.g. ``7.2``) are taken as an earthquake magnitude.
    Numbers with more decimal digits (``7.25``) and integers do not count.
    """
    # The number only has to be followed by a non-digit (or nothing), so a
    # magnitude at the end of the tweet or before punctuation ("8.1?") is
    # detected as well, not just one followed by whitespace.
    magnitude_pattern = r"\d+\.\d(?!\d)"
    return 1 if re.search(magnitude_pattern, tweet) else 0

In [0]:
# Regex-based indicator features computed from the raw tweet text.
tweet_sensing_data['has_magn'] = tweet_sensing_data['Tweet_Text'].apply(has_magnitude)
tweet_sensing_data['has_url'] = tweet_sensing_data['Tweet_Text'].apply(find_url)

# Stopword removal -> stemming -> word-count and query-position features.
tweet_sensing_data = Feature_extraction_A(stemming(remove_stopword(tweet_sensing_data)))

# The intermediate text columns are not features; remove them.
tweet_sensing_data = tweet_sensing_data.drop(['nostopword', 'stem'], axis=1)
# tweet_sensing_data

In [0]:
dataArray = tweet_sensing_data.values
# Column 2 is read as the label and columns 3 onwards as the features
# (presumably the four features added above -- confirm against the CSV layout).
X3 = np.array(dataArray[:, 3:], dtype='float')
y3 = np.array(dataArray[:, 2], dtype='float')

# Split first, then fit the scaler on the training part only, so that no
# statistics of the held-out tweets leak into training.
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.20, random_state=42)

scaler2 = preprocessing.StandardScaler().fit(X3_train)
X3_train = scaler2.transform(X3_train)
X3_test = scaler2.transform(X3_test)

In [0]:
# Log-spaced (base 2) candidate values for C and gamma, searched together
# with three kernel types.
C_range = np.logspace(-2, 10, 5, base=2)
gamma_range = np.logspace(-9, 1, 5, base=2)
k_options = ['linear','poly','rbf']
params_grids = dict(gamma=gamma_range, C=C_range, kernel=k_options)
grid2 = GridSearchCV(SVC(), param_grid=params_grids, cv=10)
grid2.fit(X3_train,y3_train)

# Call form of print works on both Python 2 and Python 3.
print(grid2.best_params_)

In [0]:
# Retrain an SVM with the kernel, C and gamma chosen by the grid search
# (the three keys in grid2.best_params_) and predict the held-out tweets.
clf2 = SVC(**grid2.best_params_)
clf2.fit(X3_train, y3_train)
y3_pred = clf2.predict(X3_test)

# Held-out metrics with the scorers' default positive label.
sklearn.metrics.accuracy_score(y3_test, y3_pred)
sklearn.metrics.precision_score(y3_test, y3_pred)
sklearn.metrics.recall_score(y3_test, y3_pred)
sklearn.metrics.f1_score(y3_test, y3_pred)

## Querying twitter real-time

For querying Twitter in real time, we are using the Python 'tweepy' module. We are extracting 50 live streaming tweets from Twitter and saving them in a 'live_data.csv' file for later classification. Before the live tweets, we write a set of 10 hand-written tweets to this 'live_data.csv' file: 5 of them indicate a current earthquake and the other 5, though related to earthquakes, do not. Our classifier needs to detect only the tweets which indicate a current earthquake.

In [0]:
#Import the necessary methods from tweepy library
import os
import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json

# User credentials for the Twitter API, read from the environment so that
# secrets are not stored in the notebook. A missing variable raises KeyError.
# SECURITY: credentials that were ever committed in plain text should be
# revoked and regenerated.
access_token = os.environ['TWITTER_ACCESS_TOKEN']
access_token_secret = os.environ['TWITTER_ACCESS_TOKEN_SECRET']
consumer_key = os.environ['TWITTER_CONSUMER_KEY']
consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']


# Listener that saves English tweets from the stream into live_data.csv.
class StdOutListener(StreamListener):
    """Write ten seed tweets plus up to MAX_TWEETS live English tweets to CSV.

    The output file ``live_data.csv`` has a single ``Tweet_Text`` column.
    """

    # Number of live tweets to collect before the stream is stopped.
    MAX_TWEETS = 50

    # Hand-written examples written ahead of the live tweets.
    SEED_TWEETS = (
        "Are we having an earthquake?",
        "EARTHQUAKE?",
        "It shook like crazy #earthquake",
        "WOAHHHHH that was my first earthquake!!!!!",
        "Is it just me or was that an earthquake?",
        "An earthquake of mag 8.2 shook Delhi yesterday!!",
        "RT biggest earthquake in last ten years!! Mag 9.1 richter reported",
        "Attending an earthquake conference today.",
        "Japan has frequent earthquakes.",
        "Which is worse? Earthquake of 7.2 or 8.1?",
    )

    def __init__(self, api=None):
        # Forward `api` instead of silently dropping it.
        super(StdOutListener, self).__init__(api)
        self.num_tweets = 0
        # Keep the file handle so it can be flushed and closed explicitly.
        # NOTE: binary mode is what the Python 2 csv module expects.
        self._file = open("live_data.csv", "wb+")
        self.f = csv.writer(self._file)
        self.f.writerow(["Tweet_Text"])
        for text in self.SEED_TWEETS:
            self.f.writerow([text])
        self._file.flush()

    def _close(self):
        """Close the output file if it is still open."""
        if not self._file.closed:
            self._file.close()

    def on_data(self, data):
        """Store one streamed message; return False once enough are collected."""
        if self.num_tweets >= self.MAX_TWEETS:
            # Everything has been written: release the file before stopping.
            self._close()
            print('Done extracting')
            return False

        tweet_data = json.loads(data)
        # .get(): a message without a 'lang' field is skipped instead of
        # raising KeyError.
        if 'text' in tweet_data and tweet_data.get('lang') == 'en':
            self.f.writerow([tweet_data['text'].encode('utf-8')])
            # Flush per row so the data is on disk even if the stream ends
            # through an error rather than through the branch above.
            self._file.flush()
            self.num_tweets += 1
        return True

    def on_error(self, status):
        """Print the error status reported by the stream."""
        print(status)


def get_tweets():
    """Open a Twitter stream and let ``StdOutListener`` collect tweets.

    Authentication uses the module-level credential variables. The stream is
    filtered by location with a bounding box of -180..180 degrees longitude
    and -90..90 degrees latitude.
    """
    # The listener receives every message delivered by the stream.
    listener = StdOutListener()

    # Authenticate against the Twitter Streaming API.
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, listener)

    # Filter by location only: [west, south, east, north] bounding box.
    stream.filter(locations=[-180,-90,180,90])

get_tweets()

In [0]:
live_data = pd.read_csv('live_data.csv',header=0)

# Two-stage classification of every collected tweet:
#   1. the NBSVM decides whether the tweet is earthquake-related;
#   2. for related tweets, the second SVM decides from the hand-crafted
#      features whether to print the tweet.
for tweet in live_data.values.tolist():
    temp_tweet = tweet[0]
    temp_tweet_vector = vectorizer.transform([temp_tweet])
    if mnbsvm.predict(temp_tweet_vector)[0] == 1:
        # Build a one-row frame and extract the same features, in the same
        # order, as were extracted for tweet_sensing_data.
        tweet_df = pd.DataFrame([temp_tweet],columns=['Tweet_Text'])
        tweet_df['has_magn']= tweet_df.Tweet_Text.apply(has_magnitude)
        tweet_df['has_url']= tweet_df.Tweet_Text.apply(find_url)

        tweet_df = remove_stopword(tweet_df)
        tweet_df = stemming (tweet_df)
        tweet_df = Feature_extraction_A(tweet_df)

        tweet_df = tweet_df.drop('nostopword',axis=1)
        tweet_df = tweet_df.drop('stem',axis=1)

        # Column 0 is the text; the remaining columns are the features.
        tempArray = tweet_df.values
        temp_X = tempArray[:,1:]
        temp_X = np.array(temp_X, dtype='float')

        temp_X = scaler2.transform(temp_X)
        temp_y = clf2.predict(temp_X)
        # predict() returns an array; test its single element explicitly.
        if temp_y[0] == 1:
            # Call form of print works on both Python 2 and Python 3.
            print(temp_tweet)