# Crisis-data-processing

In [14]:
import re
import numpy as np
import pandas as pd

In [15]:
# Read the gold-label file and keep only the rows whose `_golden` flag is False.
gold_raw = pd.read_csv('data/data-gold.csv')
df_gt = gold_raw.loc[gold_raw['_golden'] == False]

In [16]:
# Preview the first three gold rows.
df_gt.head(n=3)

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event,the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event:confidence,type_of_message,type_of_message:confidence,nil,the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event_gold,tweet,tweet_no,tweet_no_rt,type_of_message_gold,user
0,238841781,False,finalized,4,1/2/2013 13:37:11,,,"Informative: offers/gives donations of money, ...",0.2689,,,important --&gt; @JebBush suggests federal gov...,11899,important --&gt; @JebBush suggests federal gov...,,danholler
1,238841782,False,finalized,4,12/24/2012 14:05:56,True,1.0,Not informative: personal only,0.7772,,,@ChrisMara816: Screw #sandy we skipped right a...,116293,@ChrisMara816: Screw #sandy we skipped right a...,,kaatteexo
2,238841783,False,finalized,4,12/24/2012 14:05:56,True,1.0,Informative: information source with extensive...,0.2554,,,On The Learning Network Sandy as a Teaching To...,1091,On The Learning Network Sandy as a Teaching To...,,LotsToLearn


### Selectivity of categories

In [17]:
# Fraction of gold tweets per message type (relative to all gold rows).
n_gold = len(df_gt)
type_shares = df_gt['type_of_message'].value_counts() / n_gold
print(type_shares)
print('------------------------------------')

# Fraction of gold tweets whose author is flagged as an eye witness.
witness_rows = df_gt[df_gt['the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event'] == True]
witness_share = len(witness_rows) / n_gold
print('the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event:      {}'.format(witness_share))

Not informative: personal only                                                        0.374356
Can not judge (not in English, too short, etc.)                                       0.126674
Informative: damage (building, road, lines, etc.)                                     0.109681
Informative: caution or advice                                                        0.064882
Informative: other type of photos/videos (not in the above classes)                   0.063852
Informative: information source with extensive coverage (radio, tv, website, etc.)    0.063337
Informative: other                                                                    0.059732
Not informative: unrelated to the disaster                                            0.043769
Informative: offers/gives donations of money, goods, or free services                 0.027291
Informative: requests donations of money, goods, or free services                     0.024717
Informative: celebrities or authorities react to t

### % of IN tweets for (3 predicates):
author_is_eye_witness_of_the_event ^ Informative ^ damage (building, road, lines, etc.) = 3%

In [18]:
# Gold tweets flagged as eye-witness reports AND labelled as damage reports.
is_witness = df_gt['the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event'] == True
is_damage = df_gt['type_of_message'] == 'Informative: damage (building, road, lines, etc.)'
df_relevant = df_gt[is_witness & is_damage]

# Share of such tweets among all gold tweets.
len(df_relevant) / len(df_gt)

0.03038105046343975

### % of IN tweets for (3 predicates):
not author_is_eye_witness_of_the_event ^ Informative ^ damage (building, road, lines, etc.) = 8%

In [19]:
# Gold tweets NOT flagged as eye-witness reports (missing values included)
# AND labelled as damage reports.
is_not_witness = df_gt['the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event'] != True
is_damage = df_gt['type_of_message'] == 'Informative: damage (building, road, lines, etc.)'
df_relevant = df_gt[is_not_witness & is_damage]

# Share of such tweets among all gold tweets.
len(df_relevant) / len(df_gt)

0.07929969104016478

### % of IN tweets for (2 predicates):
not author_is_eye_witness_of_the_event ^ Informative = 37%

In [20]:
# Gold tweets NOT flagged as eye-witness reports (missing values included)
# AND whose message type is any of the 'Informative:' categories.
is_not_witness = df_gt['the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event'] != True
is_informative = df_gt['type_of_message'].str.startswith('Informative:')
df_relevant = df_gt[is_not_witness & is_informative]

# Share of such tweets among all gold tweets.
len(df_relevant) / len(df_gt)

0.3717816683831102

# Crisis-data transformation

In [21]:
# Read the individual crowd judgments and keep only the rows whose `_golden`
# flag is False, then preview the first three.
crowd_raw = pd.read_csv('data/data-crowdsourced.csv')
df = crowd_raw.loc[crowd_raw['_golden'] == False]
df.head(n=3)

Unnamed: 0,_unit_id,_created_at,_golden,_id,_missed,_started_at,_tainted,_channel,_trust,_worker_id,...,_ip,the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event,type_of_message,nil,the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event_gold,tweet,tweet_no,tweet_no_rt,type_of_message_gold,user
0,238841781,12/24/2012 13:41:45,False,787060207,,12/24/2012 13:37:36,False,instagc,0.7333,14425455,...,69.136.129.135,,Informative: other,,,important --&gt; @JebBush suggests federal gov...,11899,important --&gt; @JebBush suggests federal gov...,,danholler
1,238841781,12/24/2012 13:44:17,False,787063467,,12/24/2012 13:39:45,False,instagc,0.8333,13441146,...,98.18.108.46,,"Informative: offers/gives donations of money, ...",,,important --&gt; @JebBush suggests federal gov...,11899,important --&gt; @JebBush suggests federal gov...,,danholler
2,238841781,12/24/2012 13:54:14,False,787076220,,12/24/2012 13:51:24,False,golddiggergpt,0.8182,11092052,...,108.92.226.94,,Informative: celebrities or authorities react ...,,,important --&gt; @JebBush suggests federal gov...,11899,important --&gt; @JebBush suggests federal gov...,,danholler


# Transform data

### not author_is_eye_witness_of_the_event ^ Informative = 37%

In [22]:
# Aggregate the individual crowd judgments in `df` into one row per gold tweet
# for the conjunction  (NOT eye witness) AND (Informative).
#
# Per tweet we store the number of crowd votes for ("in") and against ("out")
# each predicate and the conjunction, the gold label `y` of the conjunction,
# and the tweet text. While iterating we also count how many crowd votes agree
# with the gold label of each single predicate (used for crowd accuracy).
WITNESS_COL = 'the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event'


def _is_informative(message):
    """Return True when `message` is a string containing 'Informative'.

    The match is case-sensitive, so 'Not informative: ...' labels do not count.
    Missing (non-string) answers are treated as not informative.
    """
    return isinstance(message, str) and 'Informative' in message


data = []
item_id = 0
# Crowd votes that agree with the gold label, summed over all tweets.
informative_true_votes = 0
not_eye_witness_true_votes = 0

# Split the crowd judgments by unit once instead of re-filtering `df` per tweet.
votes_by_unit = dict(iter(df.groupby('_unit_id')))
no_votes = df.iloc[0:0]

# First gold row of every unit, in order of first appearance
# (the same order `df_gt['_unit_id'].unique()` yields).
gt_rows = df_gt.drop_duplicates(subset='_unit_id').set_index('_unit_id')

for unit_id, gt_row in gt_rows.iterrows():
    item_df = votes_by_unit.get(unit_id, no_votes)

    gt_witness = bool(gt_row[WITNESS_COL] == True)
    gt_inf = _is_informative(gt_row['type_of_message'])
    # Gold label of the conjunction this table is built for:
    # NOT eye witness AND informative (matches how y_in / y_out are counted).
    y = 1 if (not gt_witness) and gt_inf else 0

    # One boolean per crowd judgment; a missing witness answer counts as "not witness".
    vote_witness = (item_df[WITNESS_COL] == True).to_numpy(dtype=bool)
    vote_inf = np.array([_is_informative(msg) for msg in item_df['type_of_message']], dtype=bool)

    # Crowd accuracy bookkeeping: votes matching the gold value of each predicate.
    not_eye_witness_true_votes += int((vote_witness == gt_witness).sum())
    informative_true_votes += int((vote_inf == gt_inf).sum())

    # Votes for / against each predicate and for the conjunction.
    not_eye_witness_out = int(vote_witness.sum())
    not_eye_witness_in = int((~vote_witness).sum())
    informative_in = int(vote_inf.sum())
    informative_out = int((~vote_inf).sum())
    y_in = int((~vote_witness & vote_inf).sum())
    y_out = len(item_df) - y_in

    text = gt_row['tweet']
    data.append([item_id, not_eye_witness_in, not_eye_witness_out, informative_in, informative_out,
                 y_in, y_out, y, text])
    item_id += 1

df_tr = pd.DataFrame(data, columns=['item_id', 'not_eye_witness_in', 'not_eye_witness_out',
                                    'informative_in', 'informative_out',
                                    'y_in', 'y_out', 'y', 'text'])

# Uncomment to (re)generate the CSV that the text-cleaning cell reads.
# df_tr.to_csv('data/crisis_transformed_notwitness_inf.csv', index=False)

In [23]:
# Crowd accuracy = votes agreeing with the gold label / number of judgments in `df`.
n_votes = len(df)
informative_accuracy = informative_true_votes / n_votes
not_witness_accuracy = not_eye_witness_true_votes / n_votes
print("Crowd Accuracy on 'informative' pred: {}".format(informative_accuracy))
print("Crowd Accuracy on 'not witness' pred: {}".format(not_witness_accuracy))

Crowd Accuracy on 'informative' pred: 0.855
Crowd Accuracy on 'not witness' pred: 0.8741666666666666


### Stemming, Lemmatising and cleaning text

In [32]:
from gensim.models import Phrases
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
import string
import re
from collections import Counter

# Lemmatise the tweet text, merge frequent bi-grams into phrases and strip
# residual noise characters from the result.
text_cleaned = []

# NOTE(review): this reads the CSV written by the `df_tr.to_csv(...)` call that
# is currently commented out in the transform cell, so the file must already
# exist on disk. It also rebinds `df` (previously the crowd judgments).
df = pd.read_csv('data/crisis_transformed_notwitness_inf.csv')

# Not used by the active (lemmatizer) variant below; kept so that any other
# cell referencing these names keeps working.
regx = re.compile(r"\b[\d.]+\b")
porter = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# Build the stop-word lookup once: calling stopwords.words("english") for every
# token re-creates the list and scans it linearly each time.
english_stopwords = set(stopwords.words("english"))

for raw_text in df['text']:
    tokens = nltk.word_tokenize(raw_text.lower())
    # `word not in string.punctuation` is a substring test against the
    # punctuation string (not a per-character set lookup); keep it as is so
    # the same tokens are dropped as before.
    # Alternatives tried earlier: porter.stem(...) instead of lemmatize(...),
    # or no normalisation at all.
    text = [wordnet_lemmatizer.lemmatize(word.strip())
            for word in tokens
            if (word not in string.punctuation) and (word not in english_stopwords)]
    text_cleaned.append(text)

# Finding phrases (i.e. bi-grams): train the phrase model on the token lists.
bigram = Phrases()
bigram.add_vocab(text_cleaned)

# Apply the phrase model to every document and join the tokens back into text.
text_cleaned_phrases = [bigram[tokens] for tokens in text_cleaned]
text_cleaned_phrases_joined = [' '.join(tokens) for tokens in text_cleaned_phrases]
df['text'] = pd.Series(text_cleaned_phrases_joined, index=df.index)

# Remove @mentions, every character that is not alphanumeric/space/tab, URLs
# and a leading "rt".
# NOTE(review): the trailing `http.+?` is lazy and therefore matches only
# "http" plus ONE following character, not a whole URL. This cleanup also runs
# after tokenization, so the mention/URL alternatives presumably rarely match
# intact tokens -- consider cleaning the raw tweet before tokenizing. Left
# unchanged here because altering it changes the generated dataset.
df['text'] = df['text'].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))

# df.to_csv('data/crisis-lemmatized_notwitness_inf.csv', index=False)

