# Information Retrieval Final Project
## TweetQuake
Detection of earthquakes using Twitter data.
Twitter users are everywhere: compare the Twitter user map with the earthquake frequency map below.
<img src="img/TwitterMap.png" alt="Drawing" style="width: 475px;float:left; margin-top: 30px" title = "Twitter Map"/>
<img src="img/EarthQuakeMap.png" alt="Drawing" style="width: 475px;float:right" title = "Earthquake Map"/>
<img src="img/EarthQuakeTweet.png" alt="Drawing" style="width: 600px;" title = "Earthquake Tweet Frequency"/>

In [0]:
import re
import csv
import nltk
import operator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from collections import defaultdict
from IPython.core.interactiveshell import InteractiveShell
from sklearn.feature_extraction.text import TfidfVectorizer
InteractiveShell.ast_node_interactivity = "all"
from nltk.stem.porter import PorterStemmer
%matplotlib inline

def get_data(filename):
    """Read a CSV file into a pandas DataFrame.

    filename -- path of the CSV file (anything ``pd.read_csv`` accepts).
    Returns the parsed DataFrame.
    """
    return pd.read_csv(filename)
def length(df):
    """Return the number of entries in the ``Tweet_Text`` column of *df*."""
    tweet_column = df['Tweet_Text']
    return len(tweet_column)


# Load the labelled Bohol earthquake tweets.
data = get_data('2013_Bohol_earthquake-tweets_labeled.csv')

# Binary label: every tweet is 'related' unless its Informativeness is
# 'Not related'.  The assignment goes through .loc because the chained form
# ``data.Info[mask] = ...`` may write to a temporary copy and leave `data`
# unchanged.
data['Info'] = 'related'
data.loc[data.Informativeness == 'Not related', 'Info'] = 'not-related'

# Normalise the text: interpret escape sequences, drop non-ASCII characters,
# trim surrounding whitespace and lower-case.
# NOTE(review): calling .decode() on the raw value only works for byte
# strings (Python 2 `str`).
data['Tweet_Text'] = data['Tweet_Text'].apply(lambda x: x.decode('unicode_escape').\
                                          encode('ascii', 'ignore').\
                                          strip().lower())

# data.head()
# Work on an independent copy: the helpers below add columns to X, which
# would otherwise be done on a slice of `data` (SettingWithCopyWarning).
X = data[['Tweet_ID','Tweet_Text','Info']].copy()
y = data.Info
porter_stemmer = PorterStemmer()

# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')
# Global term -> occurrence count, filled by update_wordmap().
wordmap = defaultdict(int)

def Stopword(tweet):
    """Remove stopwords from a tokenised tweet.

    tweet -- iterable of word tokens.
    Returns the tokens that are not in the module-level ``stopwords``
    collection, joined by single spaces.
    """
    kept = [token for token in tweet if token not in stopwords]
    return ' '.join(kept)

def remove_stopword(X):
    """Add a ``nostopword`` column containing each tweet without stopwords.

    X -- DataFrame with a ``Tweet_Text`` column; it is modified in place.
    Returns the same DataFrame.
    """
    tokenised = X.Tweet_Text.str.split()
    X['nostopword'] = tokenised.apply(Stopword)
    return X

def Porter_Stem(tweet):
    """Stem every token of a tokenised tweet with the Porter stemmer.

    tweet -- iterable of word tokens (byte strings or text strings).
    Returns the stemmed tokens joined by single spaces.
    """
    stemmed_word = []
    for word in tweet:
        # Only byte strings need decoding.  Text strings have no usable
        # .decode(): it is missing on Python 3 and, on Python 2, raises for
        # non-ASCII unicode input.
        if isinstance(word, bytes):
            word = word.decode('utf-8')
        stemmed_word.append(porter_stemmer.stem(word))
    return ' '.join(stemmed_word)
            
def stemming(X):
    """Add a ``stem`` column with the Porter-stemmed form of ``nostopword``.

    X -- DataFrame with a ``nostopword`` column; it is modified in place.
    Returns the same DataFrame.
    """
    tokenised = X.nostopword.str.split()
    X['stem'] = tokenised.apply(Porter_Stem)
    return X

def update_wordmap(tweet):
    """Count every token of *tweet* in the module-level ``wordmap``."""
    for token in tweet:
        wordmap[token] = wordmap[token] + 1

def term_frequency_plot(X, top_n=10):
    """Plot a horizontal bar chart of the most frequent stemmed terms.

    X     -- DataFrame with a ``stem`` column of space-separated stems.
    top_n -- maximum number of terms to show (default 10).

    The counts are accumulated in the module-level ``wordmap``, which is not
    reset here, so calling this function again adds to the earlier counts.
    """
    X.stem.str.split().apply(update_wordmap)
    ranked = sorted(wordmap.items(), key=operator.itemgetter(1), reverse=True)

    # Slice rather than index a fixed range: with fewer than `top_n` distinct
    # terms the former ``for i in range(10)`` lookup raised IndexError.
    top_terms = ranked[:top_n]
    objects = [term for term, _ in top_terms]
    freq = [count for _, count in top_terms]

    x_pos = np.arange(len(objects))
    plt.barh(x_pos, freq, align='center', alpha=0.5)
    plt.xlabel("Frequency")
    plt.yticks(x_pos, objects)
    plt.title('Term Frequency usage')
    plt.show()
    
def Feature_extraction_A(X):
    """Add the statistical features ``total_words`` and ``position_query_word``.

    X -- DataFrame with a ``Tweet_Text`` column; it is modified in place.

    total_words         -- number of whitespace-separated tokens in the tweet.
    position_query_word -- value returned by find_position() for the tokens.

    Returns the same DataFrame.
    """
    # Tokenise once on any whitespace.  The word count used to split on a
    # single space, which counted empty strings between consecutive spaces
    # and disagreed with the tokenisation used for the position feature.
    tokens = X.Tweet_Text.str.split()
    X['total_words'] = tokens.apply(len)
    X['position_query_word'] = tokens.apply(find_position)
    return X

def Feature_extraction_B(X):
    """Append TF-IDF n-gram features of ``Tweet_Text`` to *X*.

    X -- DataFrame with a ``Tweet_Text`` column.

    Returns a new DataFrame: X joined with one integer-named column per
    vocabulary entry (at most 5000 word/punctuation 1- to 3-grams).
    """
    # Tokens are runs of word characters or single punctuation characters.
    token_pattern = r'\w+|[%s]' % string.punctuation

    vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                                 token_pattern=token_pattern,
                                 binary=True,
                                 max_features=5000)

    # Vectorise the tweets in X's own row order and give the feature frame
    # X's index.  Previously the tweets were regrouped as related-then-other
    # before vectorising, so the index-based join attached feature rows to
    # the wrong tweets.
    word_vector = vectorizer.fit_transform(X.Tweet_Text)
    features = pd.DataFrame(word_vector.toarray(), index=X.index)

    return X.join(features)

def Feature_extraction_C(X):
    """Placeholder for the third feature set; returns *X* unchanged.

    The neighbouring-word feature below is currently disabled.
    """
    # X['feature_c'] = X.Tweet_Text.str.split().apply(find_before_after_query_word)
    return X

def remove_punc(value):
    """Strip selected punctuation from every item of *value*.

    The characters . ? ! , " : ; # - are deleted from each item and the
    cleaned items are returned joined by commas.
    """
    cleaned = [re.sub(r'[.?!,":;#-]', '', item) for item in value]
    return ','.join(cleaned)

def find_position(val):
    """Return the index of the first token containing the stemmed query word.

    val -- sequence of word tokens.
    Returns the position of the first token whose lower-cased form contains
    the Porter stem of 'earthquake', or -1 when no token matches.
    """
    # The stem does not depend on the token, so compute it once instead of
    # once per loop iteration.
    query_stem = porter_stemmer.stem('earthquake')
    for position, token in enumerate(val):
        if query_stem in token.lower():
            return position
    return -1

def find_before_after_query_word(val):
    """Return the words directly before and after the first 'earthquake' token.

    val -- sequence of word tokens.

    The result is ``'<before>,<after>'``; a side with no neighbour is left
    empty (``',next'`` for a leading match, ``'prev,'`` for a trailing one,
    ``','`` when the match is the only token).  When no token contains
    'earthquake' the string ``', '`` is returned.
    """
    for i, token in enumerate(val):
        if 'earthquake' in token.lower():
            # Look up each neighbour only if it exists.  A single-token list
            # used to fall through to val[i+1] and raise IndexError.
            before = val[i - 1] if i > 0 else ''
            after = val[i + 1] if i + 1 < len(val) else ''
            return before + ',' + after
    return ', '

# Text clean-up: strip stopwords, stem, then plot the most frequent stems.
X = remove_stopword(X)
X = stemming(X)
term_frequency_plot(X)

# Feature sets; B and C are currently disabled.
X = Feature_extraction_A(X)
# X = Feature_extraction_B(X)
# X = Feature_extraction_C(X)

# Labels as a plain array, and X reduced to the extracted feature columns
# by dropping the identifier, raw/intermediate text and label columns.
y = data['Info'].values
X = X.drop(['Tweet_ID', 'Tweet_Text', 'nostopword', 'stem', 'Info'], axis=1)
# X

## Integration of Twitter API
The API credentials in the next cell are for reference only.
We can use the Twitter API to collect a new data set,
or to enrich the existing data set with additional information.

In [0]:
#Import the necessary methods from tweepy library
import tweepy
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json

# Twitter API credentials, read from the environment so that no secret is
# stored in the source.
# NOTE(security): the keys that were previously hard-coded here have been
# published with this file and should be revoked and regenerated.
# An unset variable yields an empty string; authentication is then expected
# to fail.
import os

access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')


# Basic listener that prints the id of each received tweet to stdout and stops after five.
class StdOutListener(StreamListener):
    """Stream listener that prints the ids of the first few tweets.

    Each callback returns True to keep listening or False to stop; tweepy
    disconnects the stream when a callback returns False.
    """

    # Number of tweets to print before the stream is stopped.
    MAX_TWEETS = 5

    def __init__(self, api=None):
        # Forward `api` to the base class; it used to be accepted but dropped.
        super(StdOutListener, self).__init__(api)
        self.num_tweets = 0

    def on_data(self, data):
        """Print the running count and the id of the received tweet.

        data -- raw JSON text of one stream message.
        Returns False once MAX_TWEETS tweets have been printed, else True.
        """
        print(self.num_tweets)
        if self.num_tweets >= self.MAX_TWEETS:
            return False

        tweet_id = json.loads(data).get('id')
        if tweet_id is None:
            # Presumably a non-tweet stream message (e.g. a limit or delete
            # notice) without a top-level 'id'; skip it instead of raising
            # KeyError, and do not count it.
            return True

        print(tweet_id)
        self.num_tweets += 1
        return True

    def on_error(self, status):
        """Print the HTTP status; disconnect when rate limited (420)."""
        print(status)
        if status == 420:
            # Returning False disconnects the stream instead of retrying
            # while rate limited.
            return False


def get_tweets(keywords=None):
    """Authenticate against Twitter and stream tweets matching *keywords*.

    keywords -- list of phrases to track; defaults to ['night'].

    Prints the screen name of the 'twitter' account and the name of the
    authenticated user, then starts a filtered stream handled by
    StdOutListener.
    """
    if keywords is None:
        keywords = ['night']

    # Twitter authentication and the connection to the Streaming API.
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    # One API client is enough for both look-ups.
    api = tweepy.API(auth)
    print(api.get_user('twitter').screen_name)
    print(api.me().name)

    stream = Stream(auth, l)

    # Capture only tweets that contain one of the tracked keywords.
    stream.filter(track=keywords)
    
# Run the Twitter streaming demo defined above.
get_tweets()
# print tweepy.API(auth).get_user('twitter').screen_name

## SVM Model Implementation

In [0]:
## SVM model implementation
import math
from sklearn import svm
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction import DictVectorizer


# vec = DictVectorizer()
# df = vec.fit_transform(X.feature_a.str).toarray()
# X = pd.get_dummies(X.feature_b.str.split())

def standardizedX(X):
    """Scale *X* with a StandardScaler fitted on *X* itself.

    Returns the transformed data.
    """
    fitted_scaler = StandardScaler().fit(X)
    return fitted_scaler.transform(X)
# Tune hyperparameter gamma and choose best gamma for model training
def hyperparameter_tuning(X, y):
    """Grid-search the SVM ``gamma`` parameter with 10-fold cross-validation.

    X -- feature matrix.
    y -- class labels.

    Returns the fitted GridSearchCV object (``best_estimator_`` holds the
    model trained with the selected gamma).
    """
    # Candidate values for gamma.
    gammas = [2**-1, 2**-3, 2**-5, 2**-7, 2**-9]

    # The labels are classes, so tune a classifier (SVC); the regressor SVR
    # used before cannot be fitted on class labels such as 'related'.
    classifier = GridSearchCV(estimator=svm.SVC(), cv=10, param_grid=dict(gamma=gammas))

    # GridSearchCV already runs the 10-fold cross-validation internally.  The
    # former outer KFold loop refitted the whole search once per fold and
    # kept only the last fit.
    classifier.fit(X, y)
    return classifier

# 10-fold cross-validation and error evaluation.
# A loop of 30 repetitions gives 300 validations, with shuffling on or off.
# A loop of 10 repetitions is used to check the confidence interval; the paper
# does not say much about the confidence interval, and this only works with
# shuffling on.
def cross_validation_evaluation(X, y):
    """Evaluate a linear SVM with shuffled 10-fold cross-validation.

    X -- feature array, indexable by an array of row positions.
    y -- label array aligned with X; 'related' is the positive class.

    Prints the F1-score and the accuracy averaged over the folds.
    Returns None.
    """
    # NOTE: gamma is a coefficient of the rbf/poly/sigmoid kernels and has no
    # effect with kernel='linear'; it is kept so the estimator is unchanged.
    model = svm.SVC(kernel='linear', gamma=0.001)
    kf = KFold(n_splits=10, random_state=None, shuffle=True)

    f1_sco, accuracy_sco = 0, 0
    count = 0
    for train_index, test_index in kf.split(X):
        count += 1
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)

        # Predict once per fold and reuse the result for both metrics.
        predicted = model.predict(X_test)
        f1_sco += f1_score(y_test, predicted, pos_label='related')
        accuracy_sco += accuracy_score(y_test, predicted, normalize=True)

    print('F1-score: ' + str(f1_sco/count))
    print('Accuracy: ' + str(accuracy_sco/count))
    

# X = standardizedX(X)
# print X.tail()
# Evaluate the SVM on the feature columns left in X, passed as a plain array.
cross_validation_evaluation(X.values, y)
# RMSE, MAD, mean_min, mean_max, mad_min, mad_max = cross_validation_evaluation(X.values, y)
# print "For feature: "
# print 'RMSE +- Confidence Interval: '+"{0:.2f}".format(RMSE)+' +- '+"{0:.2f}".format(max(RMSE-mean_min, mean_max-RMSE))
# print 'MAD +- Confidence Interval: '+"{0:.2f}".format(MAD)+' +- '+"{0:.2f}".format(max(MAD-mad_min, mad_max-MAD))

## Naive-Bayes SVM Implementation

In [0]:
import numpy as np

from scipy.sparse import spmatrix, coo_matrix

from sklearn.base import BaseEstimator
from sklearn.linear_model.base import LinearClassifierMixin, SparseCoefMixin
from sklearn.svm import LinearSVC

class NBSVM(BaseEstimator, LinearClassifierMixin, SparseCoefMixin):
    """Linear classifier mixing Naive Bayes log-count ratios with a LinearSVC.

    Features are scaled by the per-feature log-count ratio of the two
    classes, a LinearSVC is fitted on the scaled features, and the final
    weights interpolate between the ratio alone and the SVM weights.

    Parameters
    ----------
    alpha : smoothing term added to every per-class feature sum.
    C : regularisation parameter passed to LinearSVC.
    beta : interpolation weight of the SVM solution (0 = ratio only,
        1 = ratio times SVM weights only).
    fit_intercept : passed to LinearSVC.
    """

    def __init__(self, alpha=1, C=1, beta=0.25, fit_intercept=False):
        self.alpha = alpha
        self.C = C
        self.beta = beta
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        """Fit the model; one binary problem, or one-vs-rest for >2 classes.

        Sets ``classes_``, ``coef_`` and ``intercept_`` and returns self.
        """
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        if len(self.classes_) == 2:
            # Mark the second (larger) label as the positive class so that
            # any pair of labels works, not only 0/1.  For 0/1 labels this
            # selects exactly the rows that ``y == 1`` selected before.
            coef_, intercept_ = self._fit_binary(X, y == self.classes_[1])
            self.coef_ = coef_
            self.intercept_ = intercept_
        else:
            coef_, intercept_ = zip(*[
                self._fit_binary(X, y == class_)
                for class_ in self.classes_
            ])
            self.coef_ = np.concatenate(coef_)
            self.intercept_ = np.array(intercept_).flatten()
        return self

    def _fit_binary(self, X, y):
        """Fit one positive-vs-negative problem.

        y -- array whose entries equal to 1/True mark positive rows and
             0/False negative rows.
        Returns ``(coef_, intercept_)`` for this problem.
        """
        # Smoothed per-feature sums of each class.  The builtin float is used
        # because the np.float alias no longer exists in current NumPy.
        p = np.asarray(self.alpha + X[y == 1].sum(axis=0), dtype=float).flatten()
        q = np.asarray(self.alpha + X[y == 0].sum(axis=0), dtype=float).flatten()

        # Log-count ratio per feature and log ratio of the class sizes.
        r = np.log(p/np.abs(p).sum()) - np.log(q/np.abs(q).sum())
        b = np.log((y == 1).sum()) - np.log((y == 0).sum())

        if isinstance(X, spmatrix):
            # Scale the columns of a sparse matrix by multiplying with diag(r).
            indices = np.arange(len(r))
            r_sparse = coo_matrix(
                (r, (indices, indices)),
                shape=(len(r), len(r))
            )
            X_scaled = X * r_sparse
        else:
            X_scaled = X * r

        lsvc = LinearSVC(
            C=self.C,
            fit_intercept=self.fit_intercept,
            max_iter=10000
        ).fit(X_scaled, y)

        mean_mag = np.abs(lsvc.coef_).mean()

        # Interpolate between the ratio-only model and the SVM solution.
        coef_ = (1 - self.beta) * mean_mag * r + \
                self.beta * (r * lsvc.coef_)

        intercept_ = (1 - self.beta) * mean_mag * b + \
                     self.beta * lsvc.intercept_

        return coef_, intercept_

In [0]:
import glob
import os
import string
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def load_imdb(filename="2013_Bohol_earthquake-tweets_labeled.csv", train_per_class=1100):
    """Load the labelled tweets and return TF-IDF train/test matrices.

    Despite its name this function reads the Bohol earthquake tweet CSV.

    filename        -- CSV file with ``Tweet_ID``, ``Tweet_Text`` and
                       ``Informativeness`` columns.
    train_per_class -- number of leading tweets of each class used for
                       training; the remaining tweets form the test set.

    Returns ``(X_train, y_train, X_test, y_test)`` where the labels are 1 for
    related tweets and 0 for tweets marked 'Not related'.
    """
    print("Vectorizing Training Text")
    data = pd.read_csv(filename)

    # Assign through .loc: the chained form ``data.Info[mask] = ...`` may
    # write to a temporary copy and leave `data` unchanged.
    data['Info'] = 'related'
    data.loc[data.Informativeness == 'Not related', 'Info'] = 'not-related'

    # NOTE(review): calling .decode() on the raw value only works for byte
    # strings (Python 2 `str`).
    data['Tweet_Text'] = data['Tweet_Text'].apply(lambda x: x.decode('unicode_escape').\
                                              encode('ascii', 'ignore').\
                                              strip())

    # Split the tweet texts by class, keeping the file order within a class.
    related = data['Info'] == 'related'
    data_pos = data.loc[related, 'Tweet_Text'].tolist()
    data_neg = data.loc[~related, 'Tweet_Text'].tolist()

    train_pos = data_pos[:train_per_class]
    train_neg = data_neg[:train_per_class]

    # Tokens are runs of word characters or single punctuation characters.
    token_pattern = r'\w+|[%s]' % string.punctuation

    vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                                 token_pattern=token_pattern,
                                 binary=True)
    X_train = vectorizer.fit_transform(train_pos+train_neg)
    y_train = np.array([1]*len(train_pos)+[0]*len(train_neg))

    print("Vocabulary Size: %s" % len(vectorizer.vocabulary_))
    print("Vectorizing Testing Text")

    test_pos = data_pos[train_per_class:]
    test_neg = data_neg[train_per_class:]

    # Reuse the vocabulary fitted on the training tweets.
    X_test = vectorizer.transform(test_pos + test_neg)
    y_test = np.array([1]*len(test_pos)+[0]*len(test_neg))

    return X_train, y_train, X_test, y_test

def main():
    """Train NBSVM on the split from load_imdb() and print its test accuracy."""
    train_features, train_labels, test_features, test_labels = load_imdb()

    print("Fitting Model")

    classifier = NBSVM().fit(train_features, train_labels)
    accuracy = classifier.score(test_features, test_labels)
    print('Test Accuracy: %s' % accuracy)

# Run the experiment only when executed as a script or notebook cell, where
# __name__ is '__main__'; importing this code no longer starts training.
if __name__ == '__main__':
    # Allow up to 2500 rows when pandas displays a frame.
    pd.set_option('display.max_rows', 2500)
    main()
