## Typical Transformer Preprocessing

In [1]:
import numpy as np 
import pandas as pd 
import re
import string

In [2]:
def remove_URL(text):
    """Replace every http(s) URL in *text* with the token ' httpsmark '.

    A URL is ``http://`` or ``https://`` followed by the longest run of
    non-whitespace characters. Despite the name, the URL is not dropped:
    it is substituted by a placeholder word padded with spaces.
    """
    return re.sub(r'https?://\S+', r' httpsmark ', text)


def remove_html(text):
    """Strip HTML-like tags from *text*.

    Anything between a ``<`` and the nearest following ``>`` is deleted
    (non-greedy match). ``.`` does not match newlines here, so a tag that
    spans several lines is left untouched.
    """
    return re.sub(r'<.*?>', r'', text)


def remove_atsymbol(text):
    """Replace each @-mention in *text* with the token ' atsymbol '.

    A mention is ``@`` followed by at least one non-whitespace character;
    the whole run (including any trailing punctuation) is replaced. A lone
    ``@`` followed by whitespace is left as is.
    """
    return re.sub(r'@\S+', r' atsymbol ', text)


def remove_hashtag(text):
    """Replace every ``#`` character in *text* with the token ' hashtag '.

    Only the ``#`` sign itself is substituted; the tag word that follows it
    is kept.
    """
    return re.sub(r'#', r' hashtag ', text)


def remove_exclamation(text):
    """Replace every ``!`` in *text* with the token ' exclamation '.

    Each exclamation mark is substituted individually, so ``!!`` yields the
    token twice.
    """
    return re.sub(r'!', r' exclamation ', text)


def remove_question(text):
    """Replace every ``?`` in *text* with the token ' question '.

    Each question mark is substituted individually, so ``??`` yields the
    token twice.
    """
    # '?' is a regex quantifier; it must be escaped to match a literal
    # question mark (an unescaped r'?' fails to compile: "nothing to repeat").
    question = re.compile(r'\?')
    return question.sub(r' question ', text)


def remove_punc(text):
    """Delete every ASCII punctuation character from *text*.

    The characters removed are exactly those in ``string.punctuation``;
    nothing is inserted in their place.
    """
    # Translation table with no replacements, only deletions.
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)


def remove_number(text):
    """Replace each run of digits in *text* with the token ' number '.

    Consecutive digits count as one number, so ``2020`` produces a single
    token rather than four.
    """
    return re.sub(r'\d+', r' number ', text)


def remove_emoji(string):
    """Replace each run of emoji-like characters with the token ' emoji '.

    Adjacent matching characters are collapsed into a single token.

    NOTE(review): the character class is much wider than "emoji". The range
    U+24C2-U+1F251 covers most code points above U+24C2, including CJK
    ideographs and Hangul, and U+10000-U+10FFFF covers every supplementary
    plane, so text in those scripts is replaced as well. Most of the other
    entries are already contained in these two ranges.

    The parameter name shadows the ``string`` module inside this function;
    that is harmless here because the module is not used in the body.
    """
    code_points = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        u"\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
        u"\U00002702-\U000027B0"  # dingbats
        u"\U00002702-\U000027B0"  # (same range listed twice)
        u"\U000024C2-\U0001F251"  # very broad: see NOTE(review) above
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"  # all supplementary planes
        u"\u2640-\u2642"          # gender signs
        u"\u2600-\u2B55"
        u"\u200d"                 # zero-width joiner
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"                 # variation selector-16
        u"\u3030"
    )
    matcher = re.compile("[" + code_points + "]+", flags=re.UNICODE)
    return matcher.sub(r' emoji ', string)


In [3]:
# Load the raw Subtask-1 train and test splits (paths are relative to the
# notebook's working directory).
train_data = pd.read_csv(
    '../data/raw/SemEval-2020-Task5-Dataset/Subtask-1/subtask1_train.csv'
)
test_data = pd.read_csv(
    '../data/raw/SemEval-2020-Task5-Dataset/Subtask-1/subtask1_test.csv'
)

In [None]:
# Cleaning steps applied, in this exact order, to the lower-cased 'sentence'
# column of both splits. Order matters: the placeholder substitutions for
# URLs, mentions, '#' and '!' must run before remove_punc strips the
# punctuation they key on.
# NOTE(review): remove_question is not part of this pipeline, so '?' is
# simply deleted by remove_punc rather than turned into a token.
cleaning_steps = (
    remove_URL,
    remove_html,
    remove_atsymbol,
    remove_hashtag,
    remove_exclamation,
    remove_punc,
    remove_number,
    remove_emoji,
)

for frame in (train_data, test_data):
    cleaned = frame['sentence'].str.lower()
    for step in cleaning_steps:
        cleaned = cleaned.apply(step)
    frame['sentence'] = cleaned

In [None]:
# Persist each cleaned split to its own file, without the DataFrame index.
# The test split goes to subtask1_test.csv; writing it to the train path
# would overwrite the train output saved on the line above.
train_data.to_csv('../data/processed/subtask1_train.csv', index=False)
test_data.to_csv('../data/processed/subtask1_test.csv', index=False)