## Loading Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ujson as json


import string
import re

from textatistic import Textatistic

import spacy

import warnings

warnings.filterwarnings('ignore')

RSEED = 42


## Data cleaning

In [2]:
nlp = spacy.load('en_core_web_lg')
stopwords = spacy.lang.en.STOP_WORDS

In [3]:
def preprocess(text):
    doc = nlp(text, disable=['ner', 'parser'])
    lemmas = [token.lemma_ for token in doc]
    a_lemmas = [lemma for lemma in lemmas 
              if lemma.isalpha() and lemma not in stopwords]
    return a_lemmas

In [4]:
def remove_entities(text):
    doc = nlp(text)
    return(" ".join([ent.text for ent in doc if not ent.ent_type_]))
    

In [5]:
def remove_html(text):
    html = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return html.sub(r'', text)

In [6]:
def remove_ebola(text):
    words = re.compile('(\s*)ebola(\s*)')
    return words.sub(r" ", text)

In [7]:
def remove_mers(text):
    words = re.compile('(\s*)mers(\s*)')
    return words.sub(r" ", text)

In [8]:
def remove_helicopter(text):
    words = re.compile('(\s*)helicopter(\s*)')
    return words.sub(r" ", text)

In [9]:
def remove_train(text):
    words = re.compile('(\s*)train(\s*)')
    return words.sub(r" ", text)

In [10]:
import os
path = "../data_dimbat/incident-tweets/"
files = os.listdir(path)
df_list = list()
for file in files:
    records = map(json.loads, open(os.path.join(path, file), encoding="utf8"))
    df = pd.DataFrame.from_records(records)
    df["text_clean"] = df["text"].apply(lambda x: remove_html(x))
    df['lemmas'] = df['text_clean'].apply(preprocess)   
    df["text_lemma"] = [' '.join(map(str, x)) for x in df["lemmas"]]
    df['text_lemma'] = df['text_lemma'].apply(remove_entities)
    df_list.append(df)
   

In [11]:
def findWholeWord(w):
    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search

In [12]:
for i in range(len(files)):
    if findWholeWord('ebola')(files[i]):
        df_list[i]['text_lemma'] = df_list[i]['text_lemma'].apply(remove_ebola)
    elif findWholeWord('mers')(files[i]):
        df_list[i]['text_lemma'] = df_list[i]['text_lemma'].apply(remove_mers)
    elif findWholeWord('helicopter')(files[i]):
        df_list[i]['text_lemma'] = df_list[i]['text_lemma'].apply(remove_helicopter)
    elif findWholeWord('train')(files[i]):
        df_list[i]['text_lemma'] = df_list[i]['text_lemma'].apply(remove_train)

In [13]:
df_sum = df_list[0]

In [14]:
for i in range(1, len(df_list)):
   df_sum = df_sum.append(df_list[i], ignore_index=True)
    

In [15]:
df_sum.sample(20)

Unnamed: 0,id,text,relevance,text_clean,lemmas,text_lemma
93001,690722254292979712,"district 'aware, outraged' by racist photo all...",0,"district 'aware, outraged' by racist photo all...","[district, aware, outraged, racist, photo, all...",district aware outraged racist photo allegedly...
68827,'511452413757108224',=net news today malta ship sinks <NUMBER> dead...,1,=net news today malta ship sinks dead=logic ...,"[net, news, today, malta, ship, sink, dead, lo...",net news ship sink dead logic webdic nd c star...
97056,'347945641789497344',looks like there's a lot of power outages. one...,1,looks like there's a lot of power outages. one...,"[look, like, lot, power, outage, inglewood, ar...",look like lot power outage inglewood area blac...
10701,'497087009736126464',i've been telling peole for <NUMBER> years tha...,1,i've been telling peole for years that afric...,"[I, tell, peole, year, africans, evolve, ebola...",I tell peole year evolve dyingindessert starva...
66693,442165443109720064,<USER> hi sun extra.quick q-y has the ind elec...,0,hi sun extra.quick q-y has the ind elect medi...,"[hi, sun, q, y, ind, elect, medium, carry, sto...",hi sun q y ind elect medium carry story tata t...
90964,844247183230668800,<USER> @nicolasturgeon hear that? that's the s...,0,@nicolasturgeon hear that? that's the silent ...,"[hear, silent, majority, bide, time]",hear silent majority bide time
143306,'262593874428571649',i wish i would've been the one to create the <...,1,i wish i would've been the one to create the ...,"[I, wish, I, create, sandy, novelty, account]",I wish I create sandy novelty account
80127,788503477899169792,been dancing around to the <HASHTAG> tejanoque...,0,been dancing around to the tejanoqueen all ni...,"[dance, tejanoqueen, night, emotion, right, lo...",dance emotion right love
121614,907943262174040064,<HASHTAG> foxalgeria 'the nation' compares cli...,1,foxalgeria 'the nation' compares climate chan...,"[foxalgeria, nation, compare, climate, change,...",foxalgeria nation compare climate change denie...
15046,'451189143637610496',my prayers goes out to u <HASHTAG> prayforchile,1,my prayers goes out to u prayforchile,"[prayer, u, prayforchile]",prayer u prayforchile


In [16]:
df_sum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163718 entries, 0 to 163717
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          160958 non-null  object
 1   text        163718 non-null  object
 2   relevance   163718 non-null  int64 
 3   text_clean  163718 non-null  object
 4   lemmas      163718 non-null  object
 5   text_lemma  163718 non-null  object
dtypes: int64(1), object(5)
memory usage: 7.5+ MB


In [17]:
df_sum.to_pickle("../data/preprocess_train_dimbat_1.pkl")

In [18]:
df_new = pd.read_pickle("../data/preprocess_train_dimbat_1.pkl")

In [19]:
df_new.sample(30)

Unnamed: 0,id,text,relevance,text_clean,lemmas,text_lemma
60037,943302861240356864,rt &amp; follow <USER> for a chance to win a <...,0,rt follow for a chance to win a exclusive t...,"[rt, follow, chance, win, exclusive, joker, bi...",rt follow chance win exclusive joker bit pop c...
83114,301914160558641152,<USER> are you stalking my twitter? 😳,0,are you stalking my twitter? 😳,"[stalk, twitter]",stalk twitter
96207,386188532437352448,<NUMBER> t one to fix deal <NUMBER> come out o...,0,t one to fix deal come out on bail d to foo...,"[t, fix, deal, come, bail, d, fool, seemandhra...",t fix deal come bail d fool seemandhra people ...
58335,693253611577577472,no need to worry about harvard in the ivy race...,0,no need to worry about harvard in the ivy race...,"[need, worry, harvard, ivy, race, season]",need worry harvard ivy race
1064,766063840480137216,<NUMBER> marleen still can't even take a compl...,0,marleen still can't even take a compliment,"[marleen, compliment]",compliment
160122,910650995415195648,"yesterday, mexico was hit by a powerful earthq...",1,"yesterday, mexico was hit by a powerful earthq...","[yesterday, mexico, hit, powerful, earthquake]",hit powerful earthquake
89702,591912766279012353,"<NUMBER> dead so far, nepalearthquake. many in...",1,"dead so far, nepalearthquake. many injured. o...","[dead, far, nepalearthquake, injure, bad, time...",dead far nepalearthquake injure bad time nation
140646,869988473419309059,misery in the <HASHTAG> rohingya <HASHTAG> ref...,1,misery in the rohingya refugee camps after c...,"[misery, rohingya, refugee, camp, cyclone, mor...",misery rohingya refugee camp cyclone mora wall...
118098,128515670081146880,it's so brisk out  <REPEAT> sheesh  <REPEAT>,0,it's so brisk out  sheesh ,"[brisk, sheesh]",brisk sheesh
99630,'350234461981519874',fire department is looking at disconnecting so...,1,fire department is looking at disconnecting so...,"[fire, department, look, disconnect, car, end,...",fire department look disconnect car end yycflo...
