## Loading Data

In [1]:
import pandas as pd
import numpy as np
import ujson as json


import string
import re

from textatistic import Textatistic

import spacy

import warnings

# Suppress every Python warning for the rest of the session. This also hides
# deprecation notices emitted by the libraries used in the cells below.
warnings.filterwarnings('ignore')

# Seed constant; it is not referenced by any of the cells visible here.
RSEED = 42


  from .autonotebook import tqdm as notebook_tqdm


## Data cleaning

In [2]:
# Load the 'en_core_web_lg' spaCy pipeline once; preprocess() and
# remove_entities() both call this shared module-level `nlp` object.
nlp = spacy.load('en_core_web_lg')
# Stop-word collection that preprocess() uses to filter lemmas.
# NOTE(review): attribute access to `spacy.lang.en` presumably works because
# the English language module is imported as part of the load above - confirm.
stopwords = spacy.lang.en.STOP_WORDS

In [3]:
def preprocess(text):
    """Lemmatize ``text`` and keep only alphabetic, non-stop-word lemmas.

    The module-level ``nlp`` pipeline is run with the 'ner' and 'parser'
    components disabled. A lemma is kept when ``str.isalpha()`` is true for
    it and it is not a member of the module-level ``stopwords`` collection.
    The membership test is done on the lemma exactly as produced (no case
    folding).

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        The surviving lemmas, in document order.
    """
    kept = []
    for token in nlp(text, disable=['ner', 'parser']):
        lemma = token.lemma_
        if lemma.isalpha() and lemma not in stopwords:
            kept.append(lemma)
    return kept

In [4]:
def remove_entities(text):
    """Rebuild ``text`` from the tokens that carry no entity type.

    The text is run through the module-level ``nlp`` pipeline; every token
    whose ``ent_type_`` is non-empty is dropped and the remaining token texts
    are joined with single spaces.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        Space-joined texts of the tokens that were kept.
    """
    parsed = nlp(text)
    kept_tokens = []
    for token in parsed:
        if token.ent_type_:
            # Token is part of a recognised entity: skip it.
            continue
        kept_tokens.append(token.text)
    return " ".join(kept_tokens)
    

In [5]:
def remove_html(text):
    """Strip HTML tags and character entities from ``text``.

    Two kinds of spans are deleted:

    * anything between ``<`` and the nearest following ``>`` on the same line;
    * entities of the form ``&name;``, ``&#123;`` or ``&#x1f;`` (lower-case
      names / hex digits only, since the pattern is case-sensitive).

    Parameters
    ----------
    text : str
        Text possibly containing markup.

    Returns
    -------
    str
        ``text`` with every matched span removed (nothing is inserted in its
        place).
    """
    tag_or_entity = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    return re.sub(tag_or_entity, '', text)

In [6]:
def remove_ebola(text):
    """Replace every occurrence of "ebola" in ``text`` with a single space.

    Any whitespace directly before or after the keyword is consumed by the
    match, so ``"the ebola outbreak"`` becomes ``"the outbreak"``.

    The match is case-sensitive and is a plain substring match: the keyword
    is also removed when it appears inside a longer word.
    NOTE(review): confirm whether whole-word matching was intended.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with each match replaced by one space.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a Deprecation/SyntaxWarning on current Python versions.
    words = re.compile(r'(\s*)ebola(\s*)')
    return words.sub(" ", text)

In [7]:
def remove_mers(text):
    """Replace every occurrence of "mers" in ``text`` with a single space.

    Any whitespace directly before or after the keyword is consumed by the
    match, so ``"new mers case"`` becomes ``"new case"``.

    The match is case-sensitive and is a plain substring match: the keyword
    is also removed when it appears inside a longer word (for example
    ``"customers"`` becomes ``"custo "``).
    NOTE(review): confirm whether whole-word matching was intended.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with each match replaced by one space.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a Deprecation/SyntaxWarning on current Python versions.
    words = re.compile(r'(\s*)mers(\s*)')
    return words.sub(" ", text)

In [8]:
def remove_helicopter(text):
    """Replace every occurrence of "helicopter" in ``text`` with a single space.

    Any whitespace directly before or after the keyword is consumed by the
    match, so ``"a helicopter crash"`` becomes ``"a crash"``.

    The match is case-sensitive and is a plain substring match: the keyword
    is also removed when it appears inside a longer word.
    NOTE(review): confirm whether whole-word matching was intended.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with each match replaced by one space.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a Deprecation/SyntaxWarning on current Python versions.
    words = re.compile(r'(\s*)helicopter(\s*)')
    return words.sub(" ", text)

In [9]:
def remove_train(text):
    """Replace every occurrence of "train" in ``text`` with a single space.

    Any whitespace directly before or after the keyword is consumed by the
    match, so ``"a train crash"`` becomes ``"a crash"``.

    The match is case-sensitive and is a plain substring match: the keyword
    is also removed when it appears inside a longer word (for example
    ``"strain"`` becomes ``"s "``).
    NOTE(review): confirm whether whole-word matching was intended.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with each match replaced by one space.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a Deprecation/SyntaxWarning on current Python versions.
    words = re.compile(r'(\s*)train(\s*)')
    return words.sub(" ", text)

In [10]:
import os

path = "../data_dimbat/incident-tweets/"
# os.listdir() returns entries in arbitrary order; sorting makes the
# positions in `files` / `df_list` (which later cells index by position)
# reproducible across machines and runs.
files = sorted(os.listdir(path))
df_list = list()
for file in files:
    # Each file holds one JSON document per line. The context manager
    # closes the handle as soon as the file has been parsed; whitespace-only
    # lines are skipped instead of crashing the JSON parser.
    with open(os.path.join(path, file), encoding="utf8") as handle:
        records = [json.loads(line) for line in handle if line.strip()]
    df = pd.DataFrame.from_records(records)
    # text -> text_clean (markup stripped) -> lemmas (list of str)
    # -> text_lemma (space-joined lemmas with entity tokens removed).
    df["text_clean"] = df["text"].apply(remove_html)
    df['lemmas'] = df['text_clean'].apply(preprocess)
    df["text_lemma"] = [' '.join(map(str, x)) for x in df["lemmas"]]
    df['text_lemma'] = df['text_lemma'].apply(remove_entities)
    df_list.append(df)
   

In [11]:
def findWholeWord(w):
    """Build a case-insensitive whole-word searcher for ``w``.

    ``w`` is inserted verbatim into the pattern ``\\b(w)\\b``, so it is
    interpreted as a regular expression fragment. Because ``\\b`` treats the
    underscore as a word character, ``w`` is not found when it is glued to
    its neighbours with underscores.

    Parameters
    ----------
    w : str
        Word (regex fragment) to look for.

    Returns
    -------
    callable
        The bound ``search`` method of the compiled pattern: call it with a
        string to get a match object (group 1 is the matched word) or
        ``None``.
    """
    pattern = re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE)
    return pattern.search

In [12]:
# For each file whose name contains one of these keywords as a whole word,
# strip that keyword from the file's `text_lemma` column. The pairs are
# tested in this order and only the first matching keyword is applied.
keyword_removers = [
    ('ebola', remove_ebola),
    ('mers', remove_mers),
    ('helicopter', remove_helicopter),
    ('train', remove_train),
]
for position, filename in enumerate(files):
    for keyword, remover in keyword_removers:
        if findWholeWord(keyword)(filename):
            frame = df_list[position]
            frame['text_lemma'] = frame['text_lemma'].apply(remover)
            break

In [13]:
# Recode the `relevance` value 1 into a per-incident-type code, chosen from
# the keyword found (as a whole word) in the file name. The pairs are tested
# in this order and only the first match is applied; files matching none of
# the keywords keep their original values.
# NOTE(review): 'hurricane' and 'tornado' share code 4 - presumably a
# deliberate merge of the two classes; confirm.
relevance_codes = [
    ('earthquake', 2),
    ('flood', 3),
    ('hurricane', 4),
    ('tornado', 4),
    ('wildfire', 5),
    ('industrial', 6),
    ('societal', 7),
    ('transportation', 8),
    ('meteor', 9),
    ('haze', 10),
]
for position, filename in enumerate(files):
    for keyword, code in relevance_codes:
        if findWholeWord(keyword)(filename):
            frame = df_list[position]
            frame['relevance'] = frame['relevance'].replace(1, code)
            break

In [14]:
# One list per year, holding the positions (into `files` / `df_list`) of the
# files whose name contains that year as a whole word.
(index_2011, index_2012, index_2013, index_2014,
 index_2015, index_2017, index_2018) = ([] for _ in range(7))

# Years are tested in this order; a file is assigned to the first year that
# matches its name and to no other.
year_buckets = (
    ('2011', index_2011),
    ('2012', index_2012),
    ('2013', index_2013),
    ('2014', index_2014),
    ('2015', index_2015),
    ('2017', index_2017),
    ('2018', index_2018),
)

for position, filename in enumerate(files):
    for year, bucket in year_buckets:
        if findWholeWord(year)(filename):
            bucket.append(position)
            break

In [15]:
# Stack all 2012 frames into one DataFrame with a fresh 0..n-1 index.
# A single pd.concat replaces the DataFrame.append loop: append is deprecated
# since pandas 1.4 and removed in 2.0, and re-copied the frame on every pass.
df_2012 = pd.concat([df_list[i] for i in index_2012], ignore_index=True)

In [16]:
# Stack all 2013 frames into one DataFrame with a fresh 0..n-1 index.
# A single pd.concat replaces the DataFrame.append loop: append is deprecated
# since pandas 1.4 and removed in 2.0, and re-copied the frame on every pass.
df_2013 = pd.concat([df_list[i] for i in index_2013], ignore_index=True)

In [17]:
# Stack all 2014 frames into one DataFrame with a fresh 0..n-1 index.
# A single pd.concat replaces the DataFrame.append loop: append is deprecated
# since pandas 1.4 and removed in 2.0, and re-copied the frame on every pass.
df_2014 = pd.concat([df_list[i] for i in index_2014], ignore_index=True)

In [18]:
# Stack all 2011 frames into one DataFrame with a fresh 0..n-1 index.
# A single pd.concat replaces the DataFrame.append loop: append is deprecated
# since pandas 1.4 and removed in 2.0, and re-copied the frame on every pass.
df_2011 = pd.concat([df_list[i] for i in index_2011], ignore_index=True)

In [19]:
# Stack all 2015 frames into one DataFrame with a fresh 0..n-1 index.
# A single pd.concat replaces the DataFrame.append loop: append is deprecated
# since pandas 1.4 and removed in 2.0, and re-copied the frame on every pass.
df_2015 = pd.concat([df_list[i] for i in index_2015], ignore_index=True)

In [20]:
# Stack all 2017 frames into one DataFrame with a fresh 0..n-1 index.
# A single pd.concat replaces the DataFrame.append loop: append is deprecated
# since pandas 1.4 and removed in 2.0, and re-copied the frame on every pass.
df_2017 = pd.concat([df_list[i] for i in index_2017], ignore_index=True)

In [21]:
# Stack all 2018 frames into one DataFrame with a fresh 0..n-1 index.
# A single pd.concat replaces the DataFrame.append loop: append is deprecated
# since pandas 1.4 and removed in 2.0, and re-copied the frame on every pass.
df_2018 = pd.concat([df_list[i] for i in index_2018], ignore_index=True)

In [22]:
# Keep only the 2011 rows whose `relevance` is not 0.
# NOTE(review): 0 is presumably the off-topic label - confirm.
df_2011_new = df_2011.loc[df_2011['relevance'] != 0]

In [23]:
# Keep only the 2012 rows whose `relevance` is not 0.
# NOTE(review): 0 is presumably the off-topic label - confirm.
df_2012_new = df_2012.loc[df_2012['relevance'] != 0]

In [24]:
# Number of 2012 rows per relevance code, most frequent first.
df_2012_new['relevance'].value_counts()

4    6978
5     901
3     662
2     613
6      62
Name: relevance, dtype: int64

In [25]:
# Print index range, per-column non-null counts, dtypes and memory usage of
# the filtered 2012 frame.
df_2012_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9216 entries, 0 to 18147
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          8332 non-null   object
 1   text        9216 non-null   object
 2   relevance   9216 non-null   int64 
 3   text_clean  9216 non-null   object
 4   lemmas      9216 non-null   object
 5   text_lemma  9216 non-null   object
dtypes: int64(1), object(5)
memory usage: 504.0+ KB


In [26]:
# Keep only the 2013 rows whose `relevance` is not 0.
# NOTE(review): 0 is presumably the off-topic label - confirm.
df_2013_new = df_2013.loc[df_2013['relevance'] != 0]

In [27]:
# Number of 2013 rows per relevance code, most frequent first.
df_2013_new['relevance'].value_counts()

3     9254
2     5992
7     5842
4     4800
6     4782
8     2352
5      865
9      762
10     664
Name: relevance, dtype: int64

In [28]:
# Print index range, per-column non-null counts, dtypes and memory usage of
# the filtered 2013 frame.
df_2013_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35313 entries, 0 to 68952
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          35313 non-null  object
 1   text        35313 non-null  object
 2   relevance   35313 non-null  int64 
 3   text_clean  35313 non-null  object
 4   lemmas      35313 non-null  object
 5   text_lemma  35313 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.9+ MB


In [29]:
# Keep only the 2014 rows whose `relevance` is not 0.
# NOTE(review): 0 is presumably the off-topic label - confirm.
df_2014_new = df_2014.loc[df_2014['relevance'] != 0]

In [30]:
# Number of 2014 rows per relevance code, most frequent first.
df_2014_new['relevance'].value_counts()

3    3554
4    3187
1    3053
5    1454
Name: relevance, dtype: int64

In [31]:
# Print index range, per-column non-null counts, dtypes and memory usage of
# the filtered 2014 frame.
df_2014_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11248 entries, 0 to 20698
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          11248 non-null  object
 1   text        11248 non-null  object
 2   relevance   11248 non-null  int64 
 3   text_clean  11248 non-null  object
 4   lemmas      11248 non-null  object
 5   text_lemma  11248 non-null  object
dtypes: int64(1), object(5)
memory usage: 615.1+ KB


In [32]:
# Keep only the 2015 rows whose `relevance` is not 0.
# NOTE(review): 0 is presumably the off-topic label - confirm.
df_2015_new = df_2015.loc[df_2015['relevance'] != 0]

In [33]:
# Number of 2015 rows per relevance code, most frequent first.
df_2015_new['relevance'].value_counts()

2    2965
4    1943
Name: relevance, dtype: int64

In [34]:
# Print index range, per-column non-null counts, dtypes and memory usage of
# the filtered 2015 frame.
df_2015_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4908 entries, 0 to 7872
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          4908 non-null   object
 1   text        4908 non-null   object
 2   relevance   4908 non-null   int64 
 3   text_clean  4908 non-null   object
 4   lemmas      4908 non-null   object
 5   text_lemma  4908 non-null   object
dtypes: int64(1), object(5)
memory usage: 268.4+ KB


In [35]:
# Keep only the 2017 rows whose `relevance` is not 0.
# NOTE(review): 0 is presumably the off-topic label - confirm.
df_2017_new = df_2017.loc[df_2017['relevance'] != 0]

In [36]:
# Number of 2017 rows per relevance code, most frequent first.
df_2017_new['relevance'].value_counts()

4    11581
2     1681
3      740
Name: relevance, dtype: int64

In [37]:
# Print index range, per-column non-null counts, dtypes and memory usage of
# the filtered 2017 frame.
df_2017_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14002 entries, 0 to 27520
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          14002 non-null  object
 1   text        14002 non-null  object
 2   relevance   14002 non-null  int64 
 3   text_clean  14002 non-null  object
 4   lemmas      14002 non-null  object
 5   text_lemma  14002 non-null  object
dtypes: int64(1), object(5)
memory usage: 765.7+ KB


In [38]:
# Keep only the 2018 rows whose `relevance` is not 0.
# NOTE(review): 0 is presumably the off-topic label - confirm.
df_2018_new = df_2018.loc[df_2018['relevance'] != 0]

In [39]:
# Number of 2018 rows per relevance code, most frequent first.
df_2018_new['relevance'].value_counts()

2    5296
Name: relevance, dtype: int64

In [40]:
# Print index range, per-column non-null counts, dtypes and memory usage of
# the filtered 2018 frame.
df_2018_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5296 entries, 0 to 5295
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          5296 non-null   object
 1   text        5296 non-null   object
 2   relevance   5296 non-null   int64 
 3   text_clean  5296 non-null   object
 4   lemmas      5296 non-null   object
 5   text_lemma  5296 non-null   object
dtypes: int64(1), object(5)
memory usage: 289.6+ KB


In [41]:
# Persist the filtered 2011 frame as a pickle file under ../data/.
df_2011_new.to_pickle("../data/preprocess_train_dimbat_label_2011.pkl")

In [42]:
# Persist the filtered 2012 frame as a pickle file under ../data/.
df_2012_new.to_pickle("../data/preprocess_train_dimbat_label_2012.pkl")

In [43]:
# Persist the filtered 2013 frame as a pickle file under ../data/.
df_2013_new.to_pickle("../data/preprocess_train_dimbat_label_2013.pkl")

In [44]:
# Persist the filtered 2014 frame as a pickle file under ../data/.
df_2014_new.to_pickle("../data/preprocess_train_dimbat_label_2014.pkl")

In [45]:
# Persist the filtered 2015 frame as a pickle file under ../data/.
df_2015_new.to_pickle("../data/preprocess_train_dimbat_label_2015.pkl")

In [46]:
# Persist the filtered 2017 frame as a pickle file under ../data/.
df_2017_new.to_pickle("../data/preprocess_train_dimbat_label_2017.pkl")

In [47]:
# Persist the filtered 2018 frame as a pickle file under ../data/.
df_2018_new.to_pickle("../data/preprocess_train_dimbat_label_2018.pkl")