# Crisis-data-processing

In [2]:
import re
import numpy as np
import pandas as pd

In [4]:
# Read data-gold.csv and keep only the rows whose `_golden` flag is False.
df_gt = pd.read_csv('data/data-gold.csv').loc[lambda frame: frame['_golden'].eq(False)]

### Selectivity of categories

In [5]:
# Relative frequency of every message type, then the fraction of rows whose
# eye-witness flag is True (both relative to the number of rows in df_gt).
type_shares = df_gt['type_of_message'].value_counts() / len(df_gt)
print(type_shares)
print('------------------------------------')
witness_rows = df_gt[df_gt['the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event'] == True]
witness_share = len(witness_rows) / len(df_gt)
print('the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event:      {}'.format(witness_share))

Not informative: personal only                                                        0.374356
Can not judge (not in English, too short, etc.)                                       0.126674
Informative: damage (building, road, lines, etc.)                                     0.109681
Informative: caution or advice                                                        0.064882
Informative: other type of photos/videos (not in the above classes)                   0.063852
Informative: information source with extensive coverage (radio, tv, website, etc.)    0.063337
Informative: other                                                                    0.059732
Not informative: unrelated to the disaster                                            0.043769
Informative: offers/gives donations of money, goods, or free services                 0.027291
Informative: requests donations of money, goods, or free services                     0.024717
Informative: celebrities or authorities react to t

### % of IN tweets for (3 predicates):
author_is_eye_witness_of_the_event ^ Informative ^ damage (building, road, lines, etc.) = 3%

In [6]:
# Rows flagged as eye-witness reports AND labelled with the "damage" message type.
witness_mask = df_gt['the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event'].eq(True)
damage_mask = df_gt['type_of_message'].eq('Informative: damage (building, road, lines, etc.)')
df_relevant = df_gt.loc[witness_mask & damage_mask]

# Fraction of all df_gt rows that satisfy both conditions (displayed as the cell output).
len(df_relevant) / len(df_gt)

0.03038105046343975

### % of IN tweets for (3 predicates):
not author_is_eye_witness_of_the_event ^ Informative ^ damage (building, road, lines, etc.) = 8%

In [7]:
# Rows NOT flagged as eye-witness reports but labelled with the "damage" message type.
not_witness_mask = df_gt['the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event'].ne(True)
damage_mask = df_gt['type_of_message'].eq('Informative: damage (building, road, lines, etc.)')
df_relevant = df_gt.loc[not_witness_mask & damage_mask]

# Fraction of all df_gt rows that satisfy both conditions (displayed as the cell output).
len(df_relevant) / len(df_gt)

0.07929969104016478

## Make 3 predicates - Data Transformation

In [28]:
# Read data-crowdsourced.csv and keep only the rows whose `_golden` flag is False.
df = pd.read_csv('data/data-crowdsourced.csv').loc[lambda frame: frame['_golden'].eq(False)]

### author_is_eye_witness_of_the_event ^ Informative ^ damage (building, road, lines, etc.) = 3%

In [31]:
# Build one output row per unit in df_gt holding, for each of the three
# predicates (eye_witness, informative, damage) and for their conjunction Y:
#   * the label derived from df_gt, and
#   * how many rows of df for that unit voted "in" / "out".
# While doing so, count how many individual votes in df agree with the df_gt
# label; those counters feed the crowd-accuracy report in the next cell.
WITNESS_COL = 'the_author_of_the_tweet_seems_to_be_an_eye_witness_of_the_event'

data = []
item_id = 0
## Compute workers' accuracy
informative_true_votes = 0
eye_witness_true_votes = 0
damage_true_votes = 0
y_true_votes = 0
for unit_id in df_gt['_unit_id'].unique():
    # Filter df_gt once per unit and reuse the slice for every field; the first
    # matching row is the one used, exactly as before.
    gt_rows = df_gt[df_gt['_unit_id'] == unit_id]
    gt_type = gt_rows['type_of_message'].values[0]
    gt_witness = bool(gt_rows[WITNESS_COL].values[0] == True)
    gt_inf = 'Informative' in gt_type
    gt_damage = 'damage' in gt_type
    y = 1 if gt_witness and gt_inf and gt_damage else 0
    text = gt_rows['tweet'].values[0]

    eye_witness_in = eye_witness_out = 0
    informative_in = informative_out = 0
    damage_in = damage_out = 0
    y_in = y_out = 0

    item_df = df[df['_unit_id'] == unit_id]
    # Only two columns are read per vote, so zip them instead of using
    # iterrows(), which materialises a full Series for every row.
    for witness_vote, type_vote in zip(item_df[WITNESS_COL], item_df['type_of_message']):
        # Evaluate each predicate once and reuse it for both tallies and accuracy.
        vote_witness = bool(witness_vote == True)
        vote_inf = 'Informative' in type_vote
        vote_damage = 'damage' in type_vote
        y_crowdsourced = vote_witness and vote_inf and vote_damage

        if vote_witness:
            eye_witness_in += 1
        else:
            eye_witness_out += 1
        if vote_inf:
            informative_in += 1
        else:
            informative_out += 1
        if vote_damage:
            damage_in += 1
        else:
            damage_out += 1
        if y_crowdsourced:
            y_in += 1
        else:
            y_out += 1

        ## Compute workers' accuracy
        if vote_witness == gt_witness:
            eye_witness_true_votes += 1
        if vote_inf == gt_inf:
            informative_true_votes += 1
        if vote_damage == gt_damage:
            damage_true_votes += 1
        if y_crowdsourced == y:
            y_true_votes += 1

    data.append([item_id, int(gt_witness), eye_witness_in, eye_witness_out,
                 int(gt_inf), informative_in, informative_out,
                 int(gt_damage), damage_in, damage_out,
                 y_in, y_out, y, text])
    item_id += 1
df_tr = pd.DataFrame(data, columns=['item_id', 'eye_witness', 'eye_witness_in', 'eye_witness_out', 'informative', 'informative_in', 'informative_out',
                'damage', 'damage_in', 'damage_out', 'Y_in', 'Y_out', 'Y', 'text'])

# NOTE(review): the text-cleaning cell further down reads this CSV from disk;
# with the write commented out the file must already exist — confirm.
# df_tr.to_csv('data/crisis_transformed_witness_inf_damage.csv', index=False)

In [32]:
# Accuracy = votes agreeing with the gold label / votes that were scored.
# The *_true_votes counters were accumulated only over rows of df whose
# `_unit_id` occurs in df_gt, so the denominator must count exactly those rows.
# Dividing by len(df) under-reports accuracy whenever df contains rows for
# units that are absent from df_gt (it is identical when there are none).
n_scored_votes = int(df['_unit_id'].isin(df_gt['_unit_id']).sum())

print("Crowd Accuracy on 'informative' pred: {}".format(informative_true_votes / n_scored_votes))
print("Crowd Accuracy on 'eye witness' pred: {}".format(eye_witness_true_votes / n_scored_votes))
print("Crowd Accuracy on 'damage' pred: {}".format(damage_true_votes / n_scored_votes))
print('----------------------------')
print("Crowd Accuracy on 'informative^witness^damage': {}".format(y_true_votes / n_scored_votes))

Crowd Accuracy on 'informative' pred: 0.855
Crowd Accuracy on 'eye witness' pred: 0.8741666666666666
Crowd Accuracy on 'damage' pred: 0.9035
----------------------------
Crowd Accuracy on 'informative^witness^damage': 0.9691666666666666


### Stemming, Lemmatising and cleaning text

In [34]:
from gensim.models import Phrases
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
import string
import re
from collections import Counter

# Clean the `text` column of the transformed dataset:
#   1. lower-case and tokenise each text, drop punctuation tokens and English
#      stop words, and lemmatise what is left;
#   2. learn bi-gram phrases over all token lists and apply them;
#   3. join the tokens back into one string and strip the patterns matched by
#      the clean-up regex below;
#   4. store the result in a column named `tokens` (replacing `text`).
text_cleaned = []

# NOTE(review): this file is produced by the transformation cell above, whose
# to_csv call is commented out — the CSV must already exist on disk.
df = pd.read_csv('data/crisis_transformed_witness_inf_damage.csv')

# Replace all numbers with special strings
# NOTE(review): `regx` and `porter` are created here but never applied in this
# cell; only the lemmatizer is used. Kept in case later cells rely on them.
regx = re.compile(r"\b[\d.]+\b")
porter = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# Fetch the stop-word list once and keep it as a set. Calling
# stopwords.words("english") inside the token filter repeated the call for
# every single token and tested membership with a linear scan of a list.
english_stopwords = set(stopwords.words("english"))

for raw_text in df['text']:
    # Variants: use porter.stem(word.strip()) to stem instead of lemmatising,
    # or plain word.strip() to keep the surface form.
    tokens = [
        wordnet_lemmatizer.lemmatize(word.strip())
        for word in nltk.word_tokenize(raw_text.lower())
        if (word not in string.punctuation) and (word not in english_stopwords)
    ]
    text_cleaned.append(tokens)

# Finding phrases (i.e. bi-grams)
# train bi-grams
bigram = Phrases()
bigram.add_vocab(text_cleaned)

# create phrases
text_cleaned_phrases = [bigram[tokens] for tokens in text_cleaned]

text_cleaned_phrases_joined = [' '.join(tokens) for tokens in text_cleaned_phrases]
df['text'] = pd.Series(text_cleaned_phrases_joined, index=df.index)

# NOTE(review): the `[^0-9A-Za-z \t]` alternative also deletes the delimiter
# that Phrases puts between merged tokens ('_' by default), so detected
# bi-grams end up fused into a single word — confirm this is intended.
cleanup_pattern = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")
df['text'] = df['text'].apply(lambda elem: cleanup_pattern.sub("", elem))
df = df.rename(columns={'text': 'tokens'})

# df.to_csv('data/crisis-lemmatized_witness_inf_damage.csv', index=False)

