In [237]:
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
# plt.style.use('ggplot')
# % matplotlib inline

# Load the labeled tweets of the 2012 Colorado wildfires (CrisisLexT26).
df = pd.read_csv('CrisisLexT26/2012_Colorado_wildfires/2012_Colorado_wildfires-tweets_labeled.csv') # change the file location if needed

# Remove spaces from the column names so the columns can be read as
# attributes (the code below uses df.TweetText, df.InformationSource, ...).
df = df.rename(columns=lambda name: name.replace(' ', ''))

# Discard tweets whose source is 'Government' and tweets whose
# informativeness is 'Not applicable', then renumber the rows 0..n-1.
keep_rows = (df.InformationSource != 'Government') & (df.Informativeness != 'Not applicable')
df = df[keep_rows].reset_index(drop=True)

# Binary target: 'Related and informative' stays as is, every other
# Informativeness value becomes 'Not related or not informative'.
# np.where works row by row, so the result does not depend on the index values.
df['label'] = np.where(df.Informativeness == 'Related and informative',
                       'Related and informative',
                       'Not related or not informative')

from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import HashingVectorizer
import preprocessor as p

# NOTE(review): inside a regex character class the quote and `|` characters are
# literal members, so this pattern also matches apostrophes and pipes, and on
# Python 2 the \U / \u escapes are not expanded in a non-unicode literal.
# `emojis` is not read anywhere in the visible cells; the pattern is left
# unchanged until the intended ranges are confirmed.
emoji_re = "['\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF']"
emojis = [regexp_tokenize(t, emoji_re) for t in df.TweetText]

# Clean every tweet with the URL, EMOJI and SMILEY options enabled, then
# lowercase the result.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY)
all_tweets = [p.clean(t).lower() for t in df.TweetText]

tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in all_tweets]
# en_stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Lemmatize each token and join the tokens of a tweet back into one
# space-separated string.
processed_texts = [
    ' '.join(lemmatizer.lemmatize(token) for token in tokens)
    for tokens in all_tokens
]

# Assign the list directly (positional) rather than through pd.Series, which
# would align on index labels and produce NaN for any label not in 0..n-1.
df['ProcessedText'] = processed_texts

# Hold out 25% of the tweets for testing; the split is seeded for repeatability.
split_percentage = 0.25
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    df.ProcessedText, df.label, test_size=split_percentage, random_state=42)

# NOTE(review): scikit-learn deprecated `non_negative` in 0.19 in favour of
# `alternate_sign=False`; kept as is because the installed version is not
# visible here -- confirm before upgrading scikit-learn.
hash_pp_vec = HashingVectorizer(analyzer='word', non_negative=True)
X_hash_pp_train = hash_pp_vec.fit_transform(X_train2)
X_hash_pp_test = hash_pp_vec.transform(X_test2)

# Keyword arguments make the two settings explicit (alpha, then fit_prior).
mnb_hash_pp = MultinomialNB(alpha=0.1, fit_prior=False)
# Grid for the optional search below. fit_prior must be real booleans: the
# strings 'True' and 'False' are both truthy, so neither would disable the prior.
parameters = {'fit_prior': (True, False), 'alpha': list(np.arange(0.1, 2, 0.1))}
# clf = GridSearchCV(mnb_hash_pp, parameters)
# clf.fit(X_hash_pp_train, y_train2)
mnb_hash_pp.fit(X_hash_pp_train, y_train2)
# mnb_hash_pp = clf.best_estimator_
# print(clf.best_params_)

from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem

def evaluate_cross_validation(clf, X, y, K):
    """Run shuffled K-fold cross-validation and print the scores.

    Parameters
    ----------
    clf : estimator passed unchanged to ``cross_val_score``.
    X : feature matrix passed unchanged to ``cross_val_score``.
    y : target labels passed unchanged to ``cross_val_score``.
    K : int, number of folds.

    Prints the per-fold scores followed by their mean and the standard error
    of the mean (``scipy.stats.sem``). Returns None.
    """
    # The folds are shuffled with a fixed seed so repeated runs use the same splits.
    cv = KFold(n_splits=K, shuffle=True, random_state=0)
    scores = cross_val_score(clf, X, y, cv=cv)
    print(scores)
    # Format first, then print: valid on both Python 2 and Python 3.
    print("Mean score: {0:.4f} (+/-{1:.4f})".format(np.mean(scores), sem(scores)))

# 10-fold cross-validation on the training split.
evaluate_cross_validation(mnb_hash_pp, X_hash_pp_train, y_train2, 10)

# Accuracy on the training split and on the held-out test split.
# Single-argument print(...) produces the same output on Python 2 and 3.
print('Accuracy for training: {}'.format(mnb_hash_pp.score(X_hash_pp_train, y_train2)))
print('Accuracy for testing: {}'.format(mnb_hash_pp.score(X_hash_pp_test, y_test2)))

# Per-class precision/recall/F1 and the confusion matrix on the test split.
y_hash_pp_predict = mnb_hash_pp.predict(X_hash_pp_test)
print("Classification Report:")
print(metrics.classification_report(y_test2, y_hash_pp_predict, digits=4))
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test2, y_hash_pp_predict))



[0.78571429 0.85714286 0.73809524 0.8452381  0.85714286 0.72619048
 0.82142857 0.77108434 0.71084337 0.84337349]
Mean score: 0.7956 (+/-0.0179)
Accuracy for training: 0.972520908005
Accuracy for testing: 0.789285714286
Classification Report:
                                precision    recall  f1-score   support

Not related or not informative     0.9333    0.6131    0.7401       137
       Related and informative     0.7211    0.9580    0.8228       143

                   avg / total     0.8249    0.7893    0.7823       280

Confusion Matrix:
[[ 84  53]
 [  6 137]]


## All three other datasets

In [238]:
# Load three further labeled CrisisLexT26 events.
df1 = pd.read_csv('CrisisLexT26/2013_Colorado_floods/2013_Colorado_floods-tweets_labeled.csv') # change the file location if needed
df2 = pd.read_csv('CrisisLexT26/2013_Australia_bushfire/2013_Australia_bushfire-tweets_labeled.csv') # change the file location if needed
df3 = pd.read_csv('CrisisLexT26/2013_Queensland_floods/2013_Queensland_floods-tweets_labeled.csv') # change the file location if needed

# Stack the three frames; ignore_index gives the result a fresh 0..n-1 index
# instead of repeating each file's own row numbers.
df = pd.concat([df1, df2, df3], ignore_index=True)

# Remove spaces from the column names so the columns can be read as attributes.
df = df.rename(columns=lambda name: name.replace(' ', ''))

df.Informativeness.value_counts()

Related and informative          2200
Related - but not informative     593
Not related                       573
Not applicable                     33
Name: Informativeness, dtype: int64

In [239]:
# Remove tweets from a 'Government' source and tweets whose informativeness is
# 'Not applicable', then renumber the remaining rows from 0.
from_government = df.InformationSource == 'Government'
not_applicable = df.Informativeness == 'Not applicable'
df = df[~from_government & ~not_applicable].reset_index(drop=True)
df.Informativeness.value_counts()

Related and informative          1958
Related - but not informative     577
Not related                       573
Name: Informativeness, dtype: int64

In [240]:
# Binary target: 'Related and informative' stays as is, every other
# Informativeness value becomes 'Not related or not informative'.
# np.where works row by row, so the result does not depend on the index values.
label = np.where(df.Informativeness == 'Related and informative',
                 'Related and informative',
                 'Not related or not informative')
df['label'] = label

df.head()

Unnamed: 0,TweetID,TweetText,InformationSource,InformationType,Informativeness,label
0,376843697943769088,#Longmont #CO The Tiny Tim Center is now #hiri...,Not labeled,Not labeled,Not related,Not related or not informative
1,378011169883037697,"RT @dlfluegge: Crazy Flooding in Boulder, Colo...",Media,Sympathy and support,Related - but not informative,Not related or not informative
2,378020179214491649,Here's the #boulderflood video that's circulat...,Outsiders,Other Useful Information,Related and informative,Related and informative
3,378026101588496385,RT @passantino: Video: Severe flooding hits ne...,Media,Other Useful Information,Related and informative,Related and informative
4,378029784204206080,"Crazy Flooding in Boulder, Colorado http://t.c...",Media,Other Useful Information,Related and informative,Related and informative


In [241]:
# Clean every tweet with the URL, EMOJI and SMILEY options enabled, then
# lowercase the result.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY)
all_tweets = [p.clean(t).lower() for t in df.TweetText]

tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in all_tweets]
#en_stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Lemmatize each token and join the tokens of a tweet back into one
# space-separated string.
processed_texts = [
    ' '.join(lemmatizer.lemmatize(token) for token in tokens)
    for tokens in all_tokens
]

# Assign the list directly (positional) rather than through pd.Series, which
# would align on index labels.
df['ProcessedText'] = processed_texts

# Shuffle the rows; the seed makes the order repeatable across runs.
df = df.sample(frac=1, random_state=42)
df.head()

Unnamed: 0,TweetID,TweetText,InformationSource,InformationType,Informativeness,label,ProcessedText
711,381102510322499584,RT @TotalTrafficDEN: #Longmont closed on Hwy 2...,Media,Caution and advice,Related and informative,Related and informative,rt @totaltrafficden : #longmont closed on hwy ...
1599,392530239693139968,RT @702sydney: Incredible photo of @NSWRFS fir...,Media,Affected individuals,Related and informative,Related and informative,rt @702sydney : incredible photo of @nswrfs fi...
1510,392160092373528577,RT @MarshallThomasB: Abbott using the NSW bush...,Outsiders,Other Useful Information,Related - but not informative,Not related or not informative,rt @marshallthomasb : abbott using the nsw bus...
191,379140385554964480,Pray for the certain parts in Colorado that ar...,Outsiders,Sympathy and support,Related - but not informative,Not related or not informative,pray for the certain part in colorado that are...
1647,392702000640430082,RT @byers_brian: We are giving away World Seri...,Not labeled,Not labeled,Not related,Not related or not informative,rt @byers_brian : we are giving away world ser...


In [242]:
# Self-training: feed the already-fitted classifier its own confident
# predictions on the new tweets, one chunk at a time.
step = 1          # number of consecutive chunks the frame is split into
threshold = 0.75  # minimum predicted class probability for a tweet to be used

n_rows = df.shape[0]
sublen = n_rows // step  # floor division keeps the chunk bounds integral
marks = [i * sublen for i in range(step)]
marks.append(n_rows)     # slice ends are exclusive, so n_rows includes the last row
for i in range(step):
    # Positional slice of the current chunk.
    test = df.ProcessedText.iloc[marks[i]:marks[i + 1]]
    if test.empty:
        continue
    test_hash = hash_pp_vec.transform(test)
    predict_hash = mnb_hash_pp.predict(test_hash)
    probability = mnb_hash_pp.predict_proba(test_hash)

    # Rows where some class reaches the threshold.
    confident = np.flatnonzero(probability.max(axis=1) >= threshold)
    new_text = test.values[confident]
    new_label = predict_hash[confident]
    if len(confident) == 0:
        # Nothing confident in this chunk: skip the update instead of fitting on no samples.
        continue
    # Reuse the rows hashed above rather than hashing the same text again.
    new_text_hash = test_hash[confident]
    mnb_hash_pp.partial_fit(new_text_hash, new_label)



In [243]:
# Score the classifier on every tweet of the combined frame against the true labels.
# Note: the preceding cell already updated the model with partial_fit on its
# own predictions for tweets from this same frame, so this is not a held-out set.
X_hash_pp_test = hash_pp_vec.transform(df.ProcessedText)

y_hash_pp_predict = mnb_hash_pp.predict(X_hash_pp_test)

# Single-argument print(...) produces the same output on Python 2 and 3.
print("Classification Report:")
print(metrics.classification_report(df.label, y_hash_pp_predict, digits=4))
print("Confusion Matrix:")
print(metrics.confusion_matrix(df.label, y_hash_pp_predict))
print("Accuracy:")
print(metrics.accuracy_score(df.label, y_hash_pp_predict))



Classification Report:
                                precision    recall  f1-score   support

Not related or not informative     0.9030    0.3643    0.5192      1150
       Related and informative     0.7235    0.9770    0.8314      1958

                   avg / total     0.7899    0.7503    0.7159      3108

Confusion Matrix:
[[ 419  731]
 [  45 1913]]
Accuracy:
0.7503217503217503
