In [1]:
# EDA of the CrisisBench dataset

In [2]:
import os
import random

import numpy as np

import src.aidrtokenize as aidrtokenize

data_path = "../data/data/all_data_en"

# Informativeness

Binary labels: "not_informative", "informative"

In [3]:
def read_stop_words(file_name):
    stop_words =[]
    with open(file_name, newline=None) as f:
        for line in f:
            line = line.strip()
            if line == "":
                continue
            stop_words.append(line)
    return stop_words

stop_words_file="../src/stop_words_english.txt"
stop_words = read_stop_words(stop_words_file)
stop_words

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 "can't",
 'cannot',
 'could',
 "couldn't",
 'did',
 "didn't",
 'do',
 'does',
 "doesn't",
 'doing',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 "hadn't",
 'has',
 "hasn't",
 'have',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 "here's",
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 "how's",
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 "let's",
 'me',
 'more',
 'most',
 "mustn't",
 'my',
 'myself',
 'no',
 'nor',
 'not',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'ought',
 'our',
 'ours\tourselves',
 'out',
 'over',
 'own',
 'same',
 "shan't",
 'she',
 "she'd",
 "she'll",
 "she's",
 'should',
 "shou

In [4]:
from sklearn import preprocessing

def read_data(path, remove_stopwords=False, shuffle=False, delim="\t"):
    texts_raw = []
    labels_raw = []
    c_bad_chars = 0
    total_num_words = 0
    with open(path, "r", newline=None, encoding='utf-8', errors='replace') as f:
        next(f) # skip col titles
        for line in f:

            line = line.strip()
            if line== "":
                continue
            row = line.split(delim)
            txt = row[3].strip() # text col
            txt = txt.replace("'", "")
            txt = aidrtokenize.tokenize(txt)

            label = row[6] # label col
            txt = txt.replace("'", "")
            w_list = []

            for w in txt.split():
                if remove_stopwords:
                    if w not in stop_words:
                        try:
                            #w=str(w.encode('ascii'))
                            w = w.encode('utf-8')
                            w_list.append(w)
                        except Exception as e:
                            print(w)
                            pass
                else:
                    try:
                        w = w.encode('utf-8')
                        total_num_words += 1
                        if any(c in w for c in [b'\xe2', b'\xc3', b'\xef', b'\xc2', b'\xe1']):
                           print("Problematic chars", w.decode('utf-8'), w)
                           c_bad_chars += 1
                        else:
                            w_list.append(w)
                    except Exception as e:
                        print("exception")
                        print(w)
                        pass
            text = b' '.join(w_list)
            text = str(text)

            # if(len(text)<1):
            #     print txt
            #     continue
            #txt=aidrtokenize.tokenize(txt)
            #txt=[w for w in txt if w not in stop_words]
            if isinstance(text, str):
                texts_raw.append(text)
                labels_raw.append(label)
            else:
                print(text)
    print(c_bad_chars, "bad chars")
    print(total_num_words, "words in total")
    if shuffle:
        texts_shuf = []
        labels_shuf = []
        index_shuf = range(len(texts_raw))
        random.shuffle(index_shuf)

        for i in index_shuf:
            texts_shuf.append(texts_raw[i])
            labels_shuf.append(labels_raw[i])

        texts = texts_shuf
        labels = labels_shuf
    else:
        texts = texts_raw
        labels = labels_raw


    label_encoder = preprocessing.LabelEncoder()
    labels_encoded = label_encoder.fit_transform(labels)
    labels_encoded = labels_encoded.tolist()

    classes = list(label_encoder.classes_)
    print(classes)
    num_classes = len(set(labels_encoded))

    if num_classes > 2:
        # one-hot encoding
        num_labels = len(labels_encoded)
        labels_one_hot = np.zeros((num_labels, num_classes))
        labels_one_hot[np.arange(num_labels), labels_encoded] = 1
        labels_one_hot = np.array(labels_one_hot, dtype=np.int32)
        return {"texts": texts, "labels": labels_one_hot}
    else:
        return {"texts": texts, "labels": labels_encoded}



In [6]:
informativeness_dev = read_data(os.path.join(data_path,
                                   "crisis_consolidated_informativeness_filtered_lang_en_dev.tsv"))
informativeness_dev

Problematic chars — b'\xe2\x80\x94'
Problematic chars zeroedge� b'zeroedge\xef\xbf\xbd'
Problematic chars irma‚äôs b'irma\xe2\x80\x9a\xc3\xa4\xc3\xb4s'
Problematic chars âœœácâï¸á™ác b'\xc3\xa2\xc5\x93\xc5\x93\xc3\xa1c\xc3\xa2\xc2\x9d\xc3\xaf\xc2\xb8\xc3\xa1\xe2\x84\xa2\xc3\xa1c'
Problematic chars httpâ€ b'http\xc3\xa2\xe2\x82\xac'
Problematic chars httpâ€ b'http\xc3\xa2\xe2\x82\xac'
Problematic chars â€ b'\xc3\xa2\xe2\x82\xac'
Problematic chars ‚äî b'\xe2\x80\x9a\xc3\xa4\xc3\xae'
Problematic chars b‚ä b'b\xe2\x80\x9a\xc3\xa4'
Problematic chars ahaha�_��_� b'ahaha\xef\xbf\xbd_\xef\xbf\xbd\xef\xbf\xbd_\xef\xbf\xbd'
Problematic chars “ b'\xe2\x80\x9c'
Problematic chars ��_ b'\xef\xbf\xbd\xef\xbf\xbd_'
Problematic chars in¬ b'in\xc2\xac'
Problematic chars in��_ b'in\xef\xbf\xbd\xef\xbf\xbd_'
Problematic chars �_��_�d�_�b b'\xef\xbf\xbd_\xef\xbf\xbd\xef\xbf\xbd_\xef\xbf\xbdd\xef\xbf\xbd_\xef\xbf\xbdb'
Problematic chars áÿáÿ b'\xc3\xa1\xc3\xbf\xc3\xa1\xc3\xbf'
Problematic chars á‰áÿaáÿáÿd 

{'texts': ["b'ways to survive and escape martial law world'",
  "b'back in sunny paris and bumped into these two gorgeous ladies already amp'",
  "b'otherwise picture of our new optical parametric oscillator and amplifier'",
  "b'this is horrible please join us in prayer over dead at texas fertilizer plant explosion'",
  "b'can nxt target of nature what about uttarakhand flood then'",
  "b'imagine room with walls that are lava lamps'",
  "b'beautiful piece by cant even imagine perseverance and grace as nepal is turned upside down'",
  "b'strong winds caused an electrical wire to come loose cutting the power supply to more than residents in ningbo'",
  "b'this kid got knocked out haha fight knockout free prayforchile worldstar'",
  "b'special purrs to nepal amp chile'",
  "b'for all those who are in hurricane sandy path our thoughts and prayers are with you please stay safe and god bless'",
  "b'always had taste mack ged on the level'",
  "b'sending thoughts and prayers to the oklahoma 

In [2]:
## Trying out the NLTK tokenizer
from nltk.tokenize import word_tokenize, TweetTokenizer

!pip install emoji
from emoji import demojize
from nltk.tokenize import TweetTokenizer


tokenizer = TweetTokenizer()


def normalizeToken(token):
    lowercased_token = token.lower()
    if token.startswith("@"):
        return "@USER"
    elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
        return "HTTPURL"
    elif len(token) == 1:
        return demojize(token)
    else:
        if token == "’":
            return "'"
        elif token == "…":
            return "..."
        else:
            return token


def normalizeTweet(tweet):
    tokens = tokenizer.tokenize(tweet.replace("’", "'").replace("…", "..."))
    normTweet = " ".join([normalizeToken(token) for token in tokens])

    normTweet = (
        normTweet.replace("cannot ", "can not ")
            .replace("n't ", " n't ")
            .replace("n 't ", " n't ")
            .replace("ca n't", "can't")
            .replace("ai n't", "ain't")
    )
    normTweet = (
        normTweet.replace("'m ", " 'm ")
            .replace("'re ", " 're ")
            .replace("'s ", " 's ")
            .replace("'ll ", " 'll ")
            .replace("'d ", " 'd ")
            .replace("'ve ", " 've ")
    )
    normTweet = (
        normTweet.replace(" p . m .", "  p.m.")
            .replace(" p . m ", " p.m ")
            .replace(" a . m .", " a.m.")
            .replace(" a . m ", " a.m ")
    )

    return " ".join(normTweet.split())


def read_data(path, delim="\t"):
    texts_raw = []
    labels_raw = []

    with open(path, "r", newline=None, encoding='utf-8', errors='replace') as f:
        next(f) # skip col titles
        for line in f:
            line = line.strip()
            if line== "":
                continue
            row = line.split(delim)

            txt = row[3].strip() # text col
            txt = normalizeTweet(txt)
            print(txt)

            #labels_raw.append(row[6]) # label col


Collecting emoji
  Using cached emoji-1.6.3.tar.gz (174 kB)
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25ldone
[?25h  Created wheel for emoji: filename=emoji-1.6.3-py3-none-any.whl size=170298 sha256=a2cd43711bc7cd82f0e892cd106374d96b8042c5817954b09f15990e14d7f685
  Stored in directory: /home/angelie/.cache/pip/wheels/e5/c7/b3/b62b7809b5ea6d22241a713d9cc05741d7edaec674b9d61cb7
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-1.6.3


In [3]:
import os
data_path = "../data/data/all_data_en"
informativeness_dev = read_data(os.path.join(data_path,
                                   "crisis_consolidated_informativeness_filtered_lang_en_dev.tsv"))
informativeness_dev

10 Ways To Survive and Escape Martial Law | World HTTPURL HTTPURL
Back in #sunny #Paris and bumped into these two gorgeous ladies already ! @USER & @USER � � _ HTTPURL
@USER Otherwise a picture of our new Optical Parametric Oscillator and Amplifier . HTTPURL
This is horrible ; please join us in prayer . Over 70 dead at Texas fertilizer plant explosion . HTTPURL
@USER @USER u can b d nxt target of nature ... what about uttarakhand flood then ?
Imagine a room with walls that are lava lamps .
Beautiful piece by @USER . I can't even imagine . Perseverance and Grace as Nepal Is Turned Upside Down HTTPURL
Strong winds caused an electrical wire to come loose , cutting the power supply to more than 300 residents in Ningbo .
This kid got knocked out haha HTTPURL #fight #knockout #free #PrayForChile #worldstar
Special PURRs to Nepal & Chile -
RT @USER : For all those who are in hurricane Sandy 's path — our thoughts and prayers are with you . Please stay safe and God bless .
@USER always had tas

# Humanitarian

Multiple labels