## Loading Data

In [26]:
import pandas as pd
import numpy as np
import ujson as json


import string
import re

from textatistic import Textatistic

import spacy

import warnings

# Suppress every warning for the rest of the session; note that this also
# hides deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# Seed constant; it is not referenced in the cells shown in this part of the notebook.
RSEED = 42


## Data cleaning

In [27]:
# Load the large English spaCy pipeline once; the helper functions below reuse it.
nlp = spacy.load('en_core_web_lg')
# spaCy's built-in English stop-word collection, used by preprocess() to filter lemmas.
# NOTE(review): this attribute path presumably resolves because the English
# language module is imported as a side effect of loading the model — confirm.
stopwords = spacy.lang.en.STOP_WORDS

In [28]:
def preprocess(text):
    """Lemmatize ``text`` and keep only alphabetic, non-stop-word lemmas.

    The text is run through the module-level ``nlp`` pipeline with the
    ``ner`` and ``parser`` components disabled.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        Lemmas, in document order, that consist solely of alphabetic
        characters and are not members of ``stopwords``.
    """
    parsed = nlp(text, disable=['ner', 'parser'])
    kept = []
    for token in parsed:
        lemma = token.lemma_
        # Drops punctuation, numbers and mixed tokens such as "covid19".
        if not lemma.isalpha():
            continue
        # Membership test is exact (case-sensitive) against the stop-word set.
        if lemma in stopwords:
            continue
        kept.append(lemma)
    return kept

In [29]:
def remove_entities(text):
    """Return ``text`` without the tokens that belong to a named entity.

    The text is processed with the module-level ``nlp`` pipeline; every token
    whose ``ent_type_`` is non-empty is dropped. The surviving tokens are
    re-joined with single spaces, so the original spacing is not preserved.

    Parameters
    ----------
    text : str
        Text to filter.

    Returns
    -------
    str
        Space-separated text of the tokens that are not part of an entity.
    """
    kept_tokens = []
    for token in nlp(text):
        if token.ent_type_:
            # Token is part of a recognised entity -> leave it out.
            continue
        kept_tokens.append(token.text)
    return " ".join(kept_tokens)
    

In [30]:
def remove_html(text):
    """Strip HTML tags and HTML character references from ``text``.

    Removed spans are deleted outright (replaced by the empty string), not
    unescaped: ``"&amp;"`` disappears rather than becoming ``"&"``.

    Matching is case-insensitive so that named entities containing capitals
    (``&Eacute;``) and hexadecimal references written with upper-case digits
    (``&#x1F60A;``) are removed as well.

    Parameters
    ----------
    text : str
        Text possibly containing markup.

    Returns
    -------
    str
        ``text`` with tags and character references removed.

    Notes
    -----
    A tag is anything between ``<`` and the next ``>`` on the same line
    (``.`` does not match newlines).
    """
    html = re.compile(
        r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
        flags=re.IGNORECASE,
    )
    return html.sub(r'', text)

In [31]:
def remove_ebola(text):
    """Remove the standalone word "ebola" from ``text``.

    Each occurrence, together with the whitespace around it, is replaced by a
    single space. Word boundaries ensure that only the whole word is removed,
    so longer words that merely contain "ebola" are left intact. Matching is
    case-insensitive.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with every whole-word "ebola" replaced by one space.
    """
    # Raw string: "\s" and "\b" must reach the regex engine unchanged.
    words = re.compile(r'\s*\bebola\b\s*', flags=re.IGNORECASE)
    return words.sub(r" ", text)

In [32]:
def remove_mers(text):
    """Remove the standalone word "mers" from ``text``.

    Each occurrence, together with the whitespace around it, is replaced by a
    single space. Word boundaries ensure that only the whole word is removed,
    so words such as "customers" or "farmers" are left intact. Matching is
    case-insensitive.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with every whole-word "mers" replaced by one space.
    """
    # Raw string: "\s" and "\b" must reach the regex engine unchanged.
    words = re.compile(r'\s*\bmers\b\s*', flags=re.IGNORECASE)
    return words.sub(r" ", text)

In [33]:
def remove_helicopter(text):
    """Remove the standalone word "helicopter" from ``text``.

    Each occurrence, together with the whitespace around it, is replaced by a
    single space. Word boundaries ensure that only the whole word is removed,
    so longer words containing "helicopter" are not cut apart. Matching is
    case-insensitive.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with every whole-word "helicopter" replaced by one space.
    """
    # Raw string: "\s" and "\b" must reach the regex engine unchanged.
    words = re.compile(r'\s*\bhelicopter\b\s*', flags=re.IGNORECASE)
    return words.sub(r" ", text)

In [34]:
def remove_train(text):
    """Remove the standalone word "train" from ``text``.

    Each occurrence, together with the whitespace around it, is replaced by a
    single space. Word boundaries ensure that only the whole word is removed,
    so words such as "training" or "strain" are left intact. Matching is
    case-insensitive.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` with every whole-word "train" replaced by one space.
    """
    # Raw string: "\s" and "\b" must reach the regex engine unchanged.
    words = re.compile(r'\s*\btrain\b\s*', flags=re.IGNORECASE)
    return words.sub(r" ", text)

In [35]:
import os
path = "../data_dimbat/incident-tweets/"
# os.listdir() returns entries in arbitrary order; sorting makes the positions
# in `files` / `df_list` (and therefore the row order of the frames built
# from them later) reproducible from run to run.
files = sorted(os.listdir(path))
df_list = list()
for file in files:
    # Each input file holds one JSON record per line. Reading inside a
    # context manager guarantees the handle is closed; blank lines are
    # skipped instead of crashing the JSON parser.
    with open(os.path.join(path, file), encoding="utf8") as handle:
        records = [json.loads(line) for line in handle if line.strip()]
    df = pd.DataFrame.from_records(records)
    # text -> text_clean: markup and character references stripped.
    df["text_clean"] = df["text"].apply(remove_html)
    # text_clean -> lemmas: list of alphabetic, non-stop-word lemmas.
    df['lemmas'] = df['text_clean'].apply(preprocess)
    # lemmas -> text_lemma: lemmas joined back into one string ...
    df["text_lemma"] = [' '.join(map(str, x)) for x in df["lemmas"]]
    # ... from which tokens tagged as named entities are then removed.
    df['text_lemma'] = df['text_lemma'].apply(remove_entities)
    # df_list[k] always corresponds to files[k].
    df_list.append(df)
   

In [36]:
def findWholeWord(w):
    """Build a case-insensitive whole-word searcher for ``w``.

    Parameters
    ----------
    w : str
        Word to look for. It is interpolated into the pattern as-is, so any
        regex metacharacters it contains keep their special meaning.

    Returns
    -------
    callable
        The ``search`` method of the compiled pattern: call it with a string
        to get a match object (group 1 is the matched word) or ``None``.
    """
    pattern = r'\b({0})\b'.format(w)
    matcher = re.compile(pattern, flags=re.IGNORECASE)
    return matcher.search

In [37]:
# Incident keyword -> function that strips that keyword from the lemma text.
# Keywords are tried in this order and only the first one found (as a whole
# word, case-insensitively) in a file name is applied to that file's frame.
keyword_removers = (
    ('ebola', remove_ebola),
    ('mers', remove_mers),
    ('helicopter', remove_helicopter),
    ('train', remove_train),
)

for i, file_name in enumerate(files):
    for keyword, remover in keyword_removers:
        if findWholeWord(keyword)(file_name):
            df_list[i]['text_lemma'] = df_list[i]['text_lemma'].apply(remover)
            break

In [38]:
# One list per year; each collects the positions (in `files` / `df_list`)
# of the files whose name contains that year as a whole word.
index_2011, index_2012, index_2013, index_2014 = [], [], [], []
index_2015, index_2016, index_2017, index_2018 = [], [], [], []

# Years are tested in this order; a file is assigned to the first year that
# matches its name and to no other.
year_buckets = (
    ('2011', index_2011),
    ('2012', index_2012),
    ('2013', index_2013),
    ('2014', index_2014),
    ('2015', index_2015),
    ('2016', index_2016),
    ('2017', index_2017),
    ('2018', index_2018),
)

for i in range(len(files)):
    for year, bucket in year_buckets:
        if findWholeWord(year)(files[i]):
            bucket.append(i)
            break

In [39]:
# Stack every 2012 frame in a single pass with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# calling it in a loop re-copies the accumulated rows on every iteration.
df_2012 = pd.concat([df_list[i] for i in index_2012], ignore_index=True)

In [40]:
# Summarise the combined 2012 frame: row count, columns, non-null counts, dtypes.
# NOTE(review): the recorded output shows `id` non-null in only 17548 of
# 18432 rows — confirm whether the missing ids are expected.
df_2012.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18432 entries, 0 to 18431
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          17548 non-null  object
 1   text        18432 non-null  object
 2   relevance   18432 non-null  int64 
 3   text_clean  18432 non-null  object
 4   lemmas      18432 non-null  object
 5   text_lemma  18432 non-null  object
dtypes: int64(1), object(5)
memory usage: 864.1+ KB


In [41]:
# Stack every 2013 frame in a single pass with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# calling it in a loop re-copies the accumulated rows on every iteration.
df_2013 = pd.concat([df_list[i] for i in index_2013], ignore_index=True)

In [42]:
# Summarise the combined 2013 frame: row count, columns, non-null counts, dtypes.
df_2013.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70626 entries, 0 to 70625
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          70626 non-null  object
 1   text        70626 non-null  object
 2   relevance   70626 non-null  int64 
 3   text_clean  70626 non-null  object
 4   lemmas      70626 non-null  object
 5   text_lemma  70626 non-null  object
dtypes: int64(1), object(5)
memory usage: 3.2+ MB


In [43]:
# Stack every 2014 frame in a single pass with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# calling it in a loop re-copies the accumulated rows on every iteration.
df_2014 = pd.concat([df_list[i] for i in index_2014], ignore_index=True)

In [44]:
# Summarise the combined 2014 frame: row count, columns, non-null counts, dtypes.
df_2014.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22496 entries, 0 to 22495
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          22496 non-null  object
 1   text        22496 non-null  object
 2   relevance   22496 non-null  int64 
 3   text_clean  22496 non-null  object
 4   lemmas      22496 non-null  object
 5   text_lemma  22496 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.0+ MB


In [45]:
# Stack every 2011 frame in a single pass with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# calling it in a loop re-copies the accumulated rows on every iteration.
df_2011 = pd.concat([df_list[i] for i in index_2011], ignore_index=True)

In [46]:
# Summarise the combined 2011 frame: row count, columns, non-null counts, dtypes.
# NOTE(review): the recorded output shows `id` non-null in only 1876 of
# 3752 rows (exactly half) — confirm whether some source files lack an `id`.
df_2011.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3752 entries, 0 to 3751
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          1876 non-null   object
 1   text        3752 non-null   object
 2   relevance   3752 non-null   int64 
 3   text_clean  3752 non-null   object
 4   lemmas      3752 non-null   object
 5   text_lemma  3752 non-null   object
dtypes: int64(1), object(5)
memory usage: 176.0+ KB


In [47]:
# Stack every 2015 frame in a single pass with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# calling it in a loop re-copies the accumulated rows on every iteration.
df_2015 = pd.concat([df_list[i] for i in index_2015], ignore_index=True)

In [48]:
# Summarise the combined 2015 frame: row count, columns, non-null counts, dtypes.
df_2015.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9816 entries, 0 to 9815
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          9816 non-null   object
 1   text        9816 non-null   object
 2   relevance   9816 non-null   int64 
 3   text_clean  9816 non-null   object
 4   lemmas      9816 non-null   object
 5   text_lemma  9816 non-null   object
dtypes: int64(1), object(5)
memory usage: 460.2+ KB


In [50]:
# Stack every 2017 frame in a single pass with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# calling it in a loop re-copies the accumulated rows on every iteration.
df_2017 = pd.concat([df_list[i] for i in index_2017], ignore_index=True)

In [51]:
# Summarise the combined 2017 frame: row count, columns, non-null counts, dtypes.
df_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28004 entries, 0 to 28003
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          28004 non-null  object
 1   text        28004 non-null  object
 2   relevance   28004 non-null  int64 
 3   text_clean  28004 non-null  object
 4   lemmas      28004 non-null  object
 5   text_lemma  28004 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.3+ MB


In [52]:
# Stack every 2018 frame in a single pass with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# calling it in a loop re-copies the accumulated rows on every iteration.
df_2018 = pd.concat([df_list[i] for i in index_2018], ignore_index=True)

In [53]:
# Summarise the combined 2018 frame: row count, columns, non-null counts, dtypes.
df_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10592 entries, 0 to 10591
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          10592 non-null  object
 1   text        10592 non-null  object
 2   relevance   10592 non-null  int64 
 3   text_clean  10592 non-null  object
 4   lemmas      10592 non-null  object
 5   text_lemma  10592 non-null  object
dtypes: int64(1), object(5)
memory usage: 496.6+ KB


In [54]:
# Write the combined 2011 frame (including the derived text columns) to disk as a pickle.
df_2011.to_pickle("../data/preprocess_train_dimbat_2011.pkl")

In [55]:
# Write the combined 2012 frame (including the derived text columns) to disk as a pickle.
df_2012.to_pickle("../data/preprocess_train_dimbat_2012.pkl")

In [56]:
# Write the combined 2013 frame (including the derived text columns) to disk as a pickle.
df_2013.to_pickle("../data/preprocess_train_dimbat_2013.pkl")

In [57]:
# Write the combined 2014 frame (including the derived text columns) to disk as a pickle.
df_2014.to_pickle("../data/preprocess_train_dimbat_2014.pkl")

In [58]:
# Write the combined 2015 frame (including the derived text columns) to disk as a pickle.
df_2015.to_pickle("../data/preprocess_train_dimbat_2015.pkl")

In [59]:
# Write the combined 2017 frame (including the derived text columns) to disk as a pickle.
df_2017.to_pickle("../data/preprocess_train_dimbat_2017.pkl")

In [60]:
# Write the combined 2018 frame (including the derived text columns) to disk as a pickle.
df_2018.to_pickle("../data/preprocess_train_dimbat_2018.pkl")