In [128]:
# %load process_tweet
import importlib

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from symspellpy.symspellpy import SymSpell, Verbosity

import process_tweet

pd.set_option('display.max_rows', 100)

# Fetch every NLTK corpus this cell reads. The stopword list is loaded right
# below, so it has to be downloaded too; otherwise a fresh environment fails
# with a LookupError on stopwords.words().
nltk.download('wordnet')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Reload the local module so edits to process_tweet.py are picked up without
# restarting the kernel.
importlib.reload(process_tweet)

[nltk_data] Downloading package wordnet to /home/ashwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ashwin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<module 'process_tweet' from '/home/ashwin/Projects/Harvey_Machine_Learning/process_tweet.py'>

In [130]:
# Load the hand-labeled tweets in one chain: discard rows with any missing
# value, store both label columns as 32-bit integers, and renumber the rows
# from zero so the index is contiguous again.
df = (
    pd.read_csv('data/labeled_prelim.csv')
    .dropna()
    .astype({'Relevancy': np.int32, 'Urgency': np.int32})
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Id,Text,Relevancy,Urgency
0,247434,More millions in #Afghanistan even with ZERO a...,0,0
1,294115,These are the last post my brother made on soc...,2,1
2,24622,In @cityofcc listening to local officials abou...,0,0
3,37807,So so so damn proud of @5ugarcane who is tirel...,3,0
4,37386,How can you help with #Harvey disaster respons...,0,0


In [131]:
# Helpers consumed by the tweet preprocessing step.
# The tokenizer strips @handles and shortens exaggerated character repetitions.
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

# NOTE(review): create_symspell is not defined or imported in the visible
# cells; presumably it lives in process_tweet -- confirm it is in scope on a
# fresh kernel. The meaning of the arguments (2, 7) is not visible here either.
sym_spell = create_symspell(2, 7, 'data/frequency_dictionary_en_82_765.txt')

In [44]:
# Run every raw tweet through process_tweet.process_tweet; the tokenizer and
# the spell-checker are forwarded as the extra positional arguments.
text = df['Text'].apply(process_tweet.process_tweet, args=(tknzr, sym_spell))
text.loc[0]

'millions afghanistan even zero attack isis sympathizers invest texas nation build harvey texas flood'

Our next goal is to extract a set of features from the preprocessed tweets and then apply feature selection methods to reduce their dimensionality. This work is based on the research of Krouska et al. The following features will be extracted:
* tf-idf
* n-gram

In [150]:
# Fit a tf-idf model over unigrams, bigrams and trigrams of the cleaned tweets
# and materialise the result as a dense array (one column per n-gram).
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
vecs = vectorizer.fit_transform(text).toarray()

# Columns are labelled by position ("tf-idf: 0", "tf-idf: 1", ...).
labels = ['tf-idf: ' + str(i) for i in range(vecs.shape[1])]

# Assemble the feature frame and append both label columns from df
# (pandas aligns them on the row index).
df2 = pd.DataFrame(vecs, columns=labels)
df2['Relevancy'], df2['Urgency'] = df['Relevancy'], df['Urgency']
df2.head()

Unnamed: 0,tf-idf: 0,tf-idf: 1,tf-idf: 2,tf-idf: 3,tf-idf: 4,tf-idf: 5,tf-idf: 6,tf-idf: 7,tf-idf: 8,tf-idf: 9,...,tf-idf: 25373,tf-idf: 25374,tf-idf: 25375,tf-idf: 25376,tf-idf: 25377,tf-idf: 25378,tf-idf: 25379,tf-idf: 25380,Relevancy,Urgency
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.173237,0.173237,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0


In [153]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import * 
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import *
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

def get_stats(model, X, y, cv, verbose=False, scoring=None):
    """Cross-validate ``model`` and return the mean of every reported value.

    Parameters
    ----------
    model : estimator
        Estimator handed unchanged to ``cross_validate``.
    X, y : array-like
        Feature matrix and target handed unchanged to ``cross_validate``.
    cv : int or cross-validation splitter
        Forwarded as the ``cv`` argument of ``cross_validate``.
    verbose : bool, default False
        When true, print the averaged results before returning them.
    scoring : list of str, optional
        Scorer names to evaluate. When omitted, accuracy, precision, recall,
        F1 and ROC AUC are used. In scikit-learn the default precision,
        recall and F1 scorers are the binary variants, so the default set
        expects a binary target.

    Returns
    -------
    dict
        One entry per key returned by ``cross_validate`` (timings and one
        ``test_<metric>`` entry per scorer), each reduced to its mean over
        the folds.
    """
    if scoring is None:
        scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

    cv_results = cross_validate(model, X, y, scoring=scoring, cv=cv,
                                return_train_score=False)

    # Collapse each per-fold array into a single average.
    new_results = {key: np.mean(values) for key, values in cv_results.items()}

    if verbose:
        print(new_results)

    return new_results

# Feature matrix: every tf-idf column of df2 with both label columns removed
# (drop returns a new frame, so df2 itself is left untouched).
df3 = df2.drop(columns=['Relevancy', 'Urgency'])

# Binary target: 1 where Urgency equals 2, 0 for every other value.
labels = (df2['Urgency'] == 2).astype(int)

# A fixed random_state makes the cross-validation scores reproducible
# from one run to the next.
model = AdaBoostClassifier(random_state=0)
get_stats(model, df3, labels, cv=5)

{'fit_time': 5.779833889007568,
 'score_time': 0.6838139057159424,
 'test_accuracy': 0.965751323323544,
 'test_precision': 0.33333333333333337,
 'test_recall': 0.15333333333333332,
 'test_f1': 0.1904812834224599,
 'test_roc_auc': 0.6524034292366184}