## Loading Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ujson as json


import string
import re

from textatistic import Textatistic

import spacy

import warnings

warnings.filterwarnings('ignore')

RSEED = 42


  from .autonotebook import tqdm as notebook_tqdm


## Data cleaning

In [2]:
# Load the large English spaCy pipeline once; `preprocess` and
# `remove_entities` both call this shared `nlp` object.
nlp = spacy.load('en_core_web_lg')
# spaCy's English stop-word collection, used by `preprocess` to drop lemmas.
# NOTE(review): `spacy.lang.en` is reached by attribute access without an
# explicit import; presumably it is importable here because the pipeline was
# loaded on the line above — keep this statement order, and confirm.
stopwords = spacy.lang.en.STOP_WORDS

In [3]:
def preprocess(text):
    """Lemmatize ``text`` and keep only alphabetic, non-stop-word lemmas.

    The text is run through the shared spaCy pipeline ``nlp`` with the
    ``ner`` and ``parser`` components disabled for this call.

    Parameters
    ----------
    text : str
        Raw (already HTML-stripped) tweet text.

    Returns
    -------
    list of str
        Lemmas, in document order, for which ``str.isalpha()`` is true and
        which are not contained in ``stopwords``.
    """
    kept = []
    for token in nlp(text, disable=['ner', 'parser']):
        lemma = token.lemma_
        # Drop punctuation, numbers and mixed tokens, then stop words.
        if not lemma.isalpha():
            continue
        if lemma in stopwords:
            continue
        kept.append(lemma)
    return kept

In [4]:
def remove_entities(text):
    """Return ``text`` with every token that belongs to a named entity removed.

    The text is parsed with the shared spaCy pipeline ``nlp``; a token is kept
    only when its ``ent_type_`` is empty. Surviving token texts are joined
    with single spaces, so the original spacing is not preserved.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        Space-joined texts of the tokens that carry no entity label.
    """
    kept_tokens = []
    for token in nlp(text):
        # An empty ent_type_ means the token is not part of any entity.
        if token.ent_type_:
            continue
        kept_tokens.append(token.text)
    return " ".join(kept_tokens)
    

In [5]:
def remove_html(text):
    """Delete angle-bracket tags and HTML character entities from ``text``.

    Two kinds of spans are removed (replaced by the empty string):

    * anything from ``<`` up to the nearest following ``>`` (non-greedy, and
      ``.`` does not cross a newline), which also covers placeholders written
      like tags, e.g. ``<USER>`` or ``<URL>``;
    * entities of the form ``&name;``, ``&#123;`` or ``&#x1f;`` where the name
      and hex digits are lower-case.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with the matched spans removed; surrounding whitespace is
        left as it was.
    """
    tag_or_entity = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    return re.sub(tag_or_entity, r'', text)

In [6]:
def remove_ebola(text):
    """Replace every occurrence of "ebola" in ``text`` with a single space.

    Whitespace directly before and after the keyword is consumed together
    with it, so ``"new ebola case"`` becomes ``"new case"``. Matching is
    case-sensitive and substring-based: the keyword is also stripped from
    inside longer tokens (``"ebolavirus"`` -> ``" virus"``).

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        ``text`` with each match replaced by one space.
    """
    # Raw string: in a plain literal '\s' is an invalid escape sequence, which
    # Python reports as a DeprecationWarning/SyntaxWarning.
    return re.sub(r'(\s*)ebola(\s*)', r" ", text)

In [7]:
def remove_mers(text):
    """Replace the stand-alone word "mers" in ``text`` with a single space.

    Whitespace directly before and after the word is consumed together with
    it, so ``"new mers case"`` becomes ``"new case"``. Matching is
    case-sensitive and limited to whole words: the previous substring match
    also cut "mers" out of unrelated words (``"immerse"`` -> ``"im e"``).
    Longer tokens that merely contain the keyword are therefore left intact.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        ``text`` with each whole-word match replaced by one space.
    """
    # NOTE(review): the lemmatized sample output shows the token "mer"
    # (apparently the lemma produced for "mers"); that form is not matched
    # here — confirm whether it should be removed as well.
    # Raw string keeps '\s' and '\b' as regex escapes rather than (invalid or
    # backspace) string escapes.
    return re.sub(r'\s*\bmers\b\s*', r" ", text)

In [8]:
def remove_helicopter(text):
    """Replace every occurrence of "helicopter" in ``text`` with a single space.

    Whitespace directly before and after the keyword is consumed together
    with it, so ``"glasgow helicopter crash"`` becomes ``"glasgow crash"``.
    Matching is case-sensitive and substring-based: the keyword is also
    stripped from inside longer tokens (``"helicopters"`` -> ``" s"``).

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        ``text`` with each match replaced by one space.
    """
    # Raw string: in a plain literal '\s' is an invalid escape sequence, which
    # Python reports as a DeprecationWarning/SyntaxWarning.
    return re.sub(r'(\s*)helicopter(\s*)', r" ", text)

In [9]:
def remove_train(text):
    """Replace the stand-alone word "train" in ``text`` with a single space.

    Whitespace directly before and after the word is consumed together with
    it, so ``"spain train crash"`` becomes ``"spain crash"``. Matching is
    case-sensitive and limited to whole words: the previous substring match
    also mangled ordinary words that contain the letters, e.g.
    ``"training"`` -> ``" ing"`` and ``"constraint"`` -> ``"cons t"``.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        ``text`` with each whole-word match replaced by one space.
    """
    # Raw string keeps '\s' and '\b' as regex escapes rather than (invalid or
    # backspace) string escapes.
    return re.sub(r'\s*\btrain\b\s*', r" ", text)

In [10]:
import os

# Folder that holds one file of JSON records (one record per line) per incident.
path = "../data_dimbat/incident-tweets/"
# Order is whatever os.listdir() returns. `files[i]` and `df_list[i]` stay
# aligned because frames are appended in this same order, and later cells
# rely on that positional alignment — so the order is deliberately not changed.
files = os.listdir(path)
df_list = list()
for file in files:
    # Parse inside a `with` block so each file handle is closed as soon as the
    # file has been read (a bare open() handed to map() is never closed
    # explicitly).
    with open(os.path.join(path, file), encoding="utf8") as handle:
        records = [json.loads(line) for line in handle]
    df = pd.DataFrame.from_records(records)
    # text -> text_clean: strip tags, placeholders and HTML entities.
    df["text_clean"] = df["text"].apply(remove_html)
    # text_clean -> lemmas: filtered lemma list per row.
    df['lemmas'] = df['text_clean'].apply(preprocess)
    # lemmas -> text_lemma: space-joined lemmas, then entity tokens removed.
    df["text_lemma"] = [' '.join(map(str, x)) for x in df["lemmas"]]
    df['text_lemma'] = df['text_lemma'].apply(remove_entities)
    df_list.append(df)
   

In [12]:
# Number of per-file frames collected in `df_list` (one is appended per entry of `files`).
len(df_list)

48

In [13]:
def findWholeWord(w):
    """Build a case-insensitive whole-word searcher for ``w``.

    ``w`` is inserted into the pattern verbatim (it is not escaped), wrapped
    in a capture group and surrounded by ``\\b`` word boundaries.

    Parameters
    ----------
    w : str
        Word (or regex fragment) to look for.

    Returns
    -------
    callable
        The bound ``search`` method of the compiled pattern: call it with a
        string to get a match object, or ``None`` when the word is absent.
    """
    whole_word = re.compile(rf'\b({w})\b', flags=re.IGNORECASE)
    return whole_word.search

In [15]:
# For files whose name contains one of these keywords as a whole word, strip
# that keyword from the file's `text_lemma` column. Keywords are tried in the
# listed order and only the first match is applied.
keyword_removers = (
    ('ebola', remove_ebola),
    ('mers', remove_mers),
    ('helicopter', remove_helicopter),
    ('train', remove_train),
)
for i, fname in enumerate(files):
    for keyword, remover in keyword_removers:
        if not findWholeWord(keyword)(fname):
            continue
        # Log the affected file, then rewrite its lemma text in place.
        print(fname)
        df_list[i]['text_lemma'] = df_list[i]['text_lemma'].apply(remover)
        break

transportation-glasgow-helicopter-crash-2013.ndjson
biological-mers-2014.ndjson
biological-ebola-2014.ndjson
transportation-spain-train-crash-2013.ndjson
transportation-la-train-crash-2013.ndjson
transportation-ny-train-crash-2013.ndjson


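## Labels

Rows with `relevance == 1` are recoded to a category number chosen from the incident file name (biological files keep the value 1):
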
- biological --- 1
- earthquake --- 2
- flood --- 3
- hurricane & tornado --- 4
- wildfire --- 5
- industrial --- 6
- societal --- 7
- transportation --- 8
- meteor --- 9
- haze --- 10

In [21]:
# Category code written in place of `relevance == 1`, keyed by the whole-word
# keyword found in the file name. Keywords are tried in insertion order and
# only the first match is applied; files matching none keep their values.
label_by_keyword = {
    'earthquake': 2,
    'flood': 3,
    'hurricane': 4,
    'tornado': 4,  # shares a code with hurricane
    'wildfire': 5,
    'industrial': 6,
    'societal': 7,
    'transportation': 8,
    'meteor': 9,
    'haze': 10,
}
for i, fname in enumerate(files):
    for keyword, label in label_by_keyword.items():
        if not findWholeWord(keyword)(fname):
            continue
        if keyword == 'haze':
            # Report the position of the haze file.
            print(i, ": ", fname)
        df_list[i]['relevance'] = df_list[i]['relevance'].replace(1, label)
        break
        

39 :  other-singapore-haze-2013.ndjson


In [22]:
# Preview the haze incident frame. It is located by file name rather than by a
# hard-coded position, because `files` comes from os.listdir(), whose order is
# not guaranteed to be the same on every machine or run.
haze_idx = next(idx for idx, name in enumerate(files) if findWholeWord('haze')(name))
df_list[haze_idx].head()

Unnamed: 0,id,text,relevance,text_clean,lemmas,text_lemma
0,345498158371045378,they should spin the flyer at a very high spee...,10,they should spin the flyer at a very high spee...,"[spin, flyer, high, speed, singapore, cool, bl...",spin flyer high speed cool blow haze away
1,345961847062671360,[st] haze update: clearer skies over singapore...,10,[st] haze update: clearer skies over singapore...,"[st, haze, update, clear, sky, singapore, satu...",st haze update clear sky psi moderate range
2,346194496733708288,<HASHTAG> haze update: <HASHTAG> psi is <NUMBE...,10,haze update: psi is at m. here's s map sh...,"[haze, update, psi, s, map, psi, location, sin...",haze update
3,346417675632787456,haze in singapore continues due to sumatra for...,10,haze in singapore continues due to sumatra for...,"[haze, singapore, continue, sumatra, forest, l...",continue like hill station sans chill
4,346467243934224387,pall of throat-scratching haze enveloping sing...,10,pall of throat-scratching haze enveloping sing...,"[pall, throat, scratch, haze, envelop, singapo...",pall throat scratch haze envelop turn smoke ma...


In [19]:
# Start the combined frame from the first per-file frame.
# Note: this binds the same object as df_list[0]; it is not a copy.
df_sum = df_list[0]

In [17]:
# Number of per-file frames available for combining.
len(df_list)

48

In [20]:
# Log the files that are stacked on top of the first frame (index 0).
for i in range(1, len(df_list)):
    print(i, ": ", files[i])

# Stack all per-file frames in one call with a fresh 0..n-1 index.
# DataFrame.append was deprecated and later removed from pandas, copied the
# whole accumulated frame on every iteration, and duplicated rows whenever
# this cell was run a second time; a single concat over `df_list` avoids all
# three problems and gives the same combined frame.
df_sum = pd.concat(df_list, ignore_index=True)
    

1 :  biological-mers-2014.ndjson
2 :  earthquake-nepal-2015.ndjson
3 :  biological-ebola-2014.ndjson
4 :  earthquake-guatemala-2012.ndjson
5 :  earthquake-chile-2013.ndjson
6 :  tornado-joplin-2011.ndjson
7 :  hurricane-pam-2015.ndjson
8 :  hurricane-irma-2017.ndjson
9 :  societal-boston-bombing-2013.ndjson
10 :  tornado-oklahoma-2013.ndjson
11 :  industrial-texas-explosion-2013.ndjson
12 :  industrial-savar-building-collapse-2013.ndjson
13 :  hurricane-pablo-2012.ndjson
14 :  hurricane-hagupit-2014.ndjson
15 :  earthquake-bohol-2013.ndjson
16 :  transportation-spain-train-crash-2013.ndjson
17 :  hurricane-odile-2014.ndjson
18 :  hurricane-maria-2017.ndjson
19 :  earthquake-pakistan-2013.ndjson
20 :  other-russia-meteor-2013.ndjson
21 :  earthquake-italy-2012.ndjson
22 :  flood-philipinnes-2012.ndjson
23 :  flood-sardinia-2013.ndjson
24 :  earthquake-nepal-2018.ndjson
25 :  flood-colorado-2013.ndjson
26 :  flood-alberta-2013.ndjson
27 :  flood-pakistan-2014.ndjson
28 :  industrial-vene

In [35]:
# Spot-check 20 random rows; seeded with the notebook-wide RSEED so the same
# rows are shown on every run.
df_sum.sample(20, random_state=RSEED)

Unnamed: 0,id,text,relevance,text_clean,lemmas,text_lemma
161551,563173606146859008,working out is calling my name right now,0,working out is calling my name right now,"[work, right]",work right
52748,'324826681556738051',"deadly explosion, fire rip through <HASHTAG> t...",6,"deadly explosion, fire rip through texas fert...","[deadly, explosion, fire, rip, texas, fertiliz...",deadly explosion fire rip fertilizer plant
59788,774238002906296320,brand new <HASHTAG> startrek t-shirts have lan...,0,brand new startrek t-shirts have landed!,"[brand, new, startrek, t, shirt, land]",brand new startrek t shirt land
128333,295843679048986624,live: flood disaster unfolds as weather wreaks...,3,live: flood disaster unfolds as weather wreaks...,"[live, flood, disaster, unfold, weather, wreak...",live flood disaster unfold weather wreak havoc
1835,'460444343091081216',saudi arabia finds another <NUMBER> <HASHTAG> ...,1,saudi arabia finds another mers cases as dis...,"[saudi, arabia, find, mer, case, disease, spread]",find mer case disease spread
40043,424276277257052160,<USER> by introducing her to me 💪👍,0,by introducing her to me 💪👍,"[introduce, I]",introduce I
37638,'324994252218195968',prior to fbi news conference regarding boston ...,7,prior to fbi news conference regarding boston ...,"[prior, fbi, news, conference, regard, boston,...",prior news conference regard marathon explosion
68597,'511445762644140032',my prayers going out to the people injured in ...,4,my prayers going out to the people injured in ...,"[prayer, people, injure, hurricane, odile]",prayer people injure hurricane odile
125323,22710491570765824,crossin my fingers that everything goes good t...,0,crossin my fingers that everything goes good t...,"[crossin, finger, good, month]",crossin finger good month
160138,910658565874085888,[author: jason-duaine-hahn] <URL> mexico was h...,2,[author: jason-duaine-hahn] mexico was hit by...,"[author, jason, duaine, hahn, mexico, hit, dev...",author hit devastating earthquake


In [36]:
# Column dtypes, non-null counts and memory usage of the combined frame.
df_sum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163718 entries, 0 to 163717
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          160958 non-null  object
 1   text        163718 non-null  object
 2   relevance   163718 non-null  int64 
 3   text_clean  163718 non-null  object
 4   lemmas      163718 non-null  object
 5   text_lemma  163718 non-null  object
dtypes: int64(1), object(5)
memory usage: 7.5+ MB


In [23]:
# Number of rows per `relevance` value in the combined frame.
# NOTE(review): 0 is presumably the "not related to any incident" class — confirm.
df_sum["relevance"].value_counts()

0     81859
4     30365
2     16547
3     14210
7      5842
6      4844
5      3220
1      3053
8      2352
9       762
10      664
Name: relevance, dtype: int64

In [24]:
# Persist the combined frame (raw text, cleaned text, lemmas, labels) to disk.
df_sum.to_pickle("../data/preprocess_train_dimbat.pkl")

In [38]:
# Read the saved frame back into a separate variable.
# Pickle files can execute code on load — only read pickles you created or trust.
df_new = pd.read_pickle("../data/preprocess_train_dimbat.pkl")

In [40]:
# Spot-check 30 random rows of the reloaded frame; seeded with the
# notebook-wide RSEED so the same rows are shown on every run.
df_new.sample(30, random_state=RSEED)

Unnamed: 0,id,text,relevance,text_clean,lemmas,text_lemma
138339,875785718249009152,<USER> can't wait homie. have a great day,0,can't wait homie. have a great day,"[wait, homie, great, day]",wait homie great day
74682,851485902077612032,my og really just pissed me off😤,0,my og really just pissed me off😤,"[og, piss, I]",og piss I
160075,910624850703773696,over <NUMBER> children found dead during post ...,2,over children found dead during post earthqua...,"[child, find, dead, post, earthquake, rescue, ...",child find dead post earthquake rescue school
115664,922161026912669698,crews extinguish brush <HASHTAG> fire caused b...,5,crews extinguish brush fire caused by possibl...,"[crew, extinguish, brush, fire, cause, possibl...",crew extinguish brush fire cause possible
25822,905723583892217856,hurricane irma's path shifts: see the new fore...,4,hurricane irma's path shifts: see the new fore...,"[hurricane, irma, path, shift, new, forecast]",path shift new forecast
129633,757466196300009472,why do these people are trying to get on my ne...,0,why do these people are trying to get on my ne...,"[people, try, nerve, test, level, patience, ehh]",people try nerve test level patience ehh
76376,624639049253281792,for the first time ever i'm actually scared to...,0,for the first time ever i'm actually scared to...,"[time, I, actually, scared, happy]",time I actually scared happy
136346,407174543061823488,and now so sad to hear of the metro north dera...,8,and now so sad to hear of the metro north dera...,"[sad, hear, metro, north, derailment, morning,...",sad hear metro north derailment pray
100069,'349277414968393728',"hey windsor-essex, if you know of anyone local...",3,"hey windsor-essex, if you know of anyone local...","[hey, windsor, essex, know, local, affect, alb...",hey windsor essex know local affect alberta fl...
4331,'591902722430873600',"""this is why i love nepal. people here help on...",2,"""this is why i love nepal. people here help on...","[I, love, nepal, people, help, know, govt]",I love nepal people help know govt
