In [1]:
# Data files must be placed in the 'data' directory of the workspace
# Run `pip install emoji` in a terminal to install the emoji package used in this project

In [3]:
import pandas as pd
import os.path
import emoji
import nltk

In [4]:
# Download the stopwords corpus from nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Shamah M
[nltk_data]     Zoha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Disable pandas' chained-assignment warning: the dataframes are altered during
# processing (e.g. dropping empty rows, adding spaces around emoji)
pd.options.mode.chained_assignment = None

In [6]:
# Load the English stopwords into the module-level `words` set,
# which remove_stopwords() reads when filtering tokens

from nltk.corpus import stopwords

words = set(stopwords.words('english'))

In [7]:
# Define functions for data cleanup
# process_dataset takes the dataframe and the name of the column whose text needs to be processed
# It removes empty rows, adds spacing around emojis (via the encode_emoji function), removes stopwords, removes unwanted characters, and drops duplicates

def process_dataset(data, attr):
    """Run the full text-cleaning pipeline on one column of a dataframe.

    Steps, in order: report and drop rows with missing values, pad emoji
    with spaces, remove stopwords, strip unwanted characters, and finally
    drop duplicate rows (duplicates are judged on all columns of the frame).

    Args:
        data: pandas DataFrame holding the text column and its label columns.
        attr: name of the text column to clean.

    Returns:
        The cleaned DataFrame produced by the final drop_duplicates() call.
    """
    missing_label = 'Empty Row Count:'
    print(missing_label, data[attr].isna().sum())
    data = drop_empty(data)
    print(missing_label, data[attr].isna().sum())

    # Each stage rewrites column `attr` of `data` directly; return values are unused.
    stages = (
        ('Encoding Emoji', encode_emoji),
        ('Removing stopwords', remove_stopwords),
        ('Cleaning-up Unwanted Symbols', data_cleanup),
    )
    for announcement, stage in stages:
        print(announcement)
        stage(data, attr)

    print('Removing potential duplicates')
    return data.drop_duplicates()

def drop_empty(data):
    """Return a copy of ``data`` without rows that contain missing values.

    A row is removed when *any* of its columns is missing, and the result is
    renumbered 0..n-1.

    Args:
        data: pandas DataFrame to filter.

    Returns:
        A new DataFrame with the same columns as ``data``; ``data`` itself is
        not modified.
    """
    print('Dropping Empty Rows')
    # drop=True discards the old index instead of inserting it as an extra
    # 'index' column. That column is unique per row, so it would keep a later
    # drop_duplicates() from ever matching and would end up in written files.
    return data.dropna().reset_index(drop=True)


def encode_emoji(data, attr):
    """Surround every emoji in column ``attr`` with a space on each side.

    The column is replaced on ``data`` itself; nothing is returned.

    Args:
        data: pandas DataFrame containing the text column.
        attr: name of the column whose strings are rewritten. Values are
            iterated character by character, so they must be strings
            (missing values should be dropped beforehand).
    """
    def pad_emoji(text):
        # Characters are checked one at a time with emoji.is_emoji.
        return ''.join(' ' + ch + ' ' if emoji.is_emoji(ch) else ch for ch in text)

    # Assigning the whole column avoids chained indexing (data[attr][idx] = ...),
    # which only works when the index is exactly 0..n-1 and is not applied at
    # all when pandas Copy-on-Write is active.
    data[attr] = data[attr].apply(pad_emoji)


def data_cleanup(data, attr):
    """Strip unwanted content from the strings in column ``attr``.

    Applied in order: newlines become spaces, tokens starting with
    http/https are removed, @-mentions are removed, every character that is
    not an ASCII letter or whitespace is removed, and the text is lowercased.
    The column is overwritten on ``data``; nothing is returned.

    Args:
        data: pandas DataFrame containing the text column.
        attr: name of the column to clean.
    """
    raw_text = data[attr]
    cleaned = (
        raw_text.str.replace('\n', ' ')
        .str.replace('(http|https)[\\S]+', '', regex=True)
        .str.replace('(@\\S+)', '', regex=True)
        .str.replace('[^a-zA-Z\\s]', '', regex=True)
        .str.lower()
    )
    data[attr] = cleaned


def remove_stopwords(df, attr, stop_words=None):
    """Remove stopwords from the whitespace-separated tokens of column ``attr``.

    Tokens are compared case-insensitively, so capitalised stopwords such as
    "The" are removed even though the text has not been lowercased yet.
    Remaining tokens keep their original casing and are re-joined with single
    spaces. The column is overwritten on ``df``; nothing is returned.

    Args:
        df: pandas DataFrame containing the text column.
        attr: name of the column to filter; values must be strings.
        stop_words: optional collection of lowercase stopwords. Defaults to
            the module-level ``words`` set.
    """
    if stop_words is None:
        stop_words = words

    def keep_content_words(text):
        # token.lower() lets "The"/"THE" match the lowercase stopword entries.
        return ' '.join(tok for tok in text.split() if tok.lower() not in stop_words)

    df[attr] = df[attr].apply(keep_content_words)

In [9]:
# Read the training set and both test sets from ../data, reporting each row count
train_data = pd.read_csv('../data/train.En.csv')
print('Row Count:', train_data.shape[0])

test_data_a = pd.read_csv('../data/task_A_En_test.csv')
print('Row Count:', test_data_a.shape[0])

test_data_b = pd.read_csv('../data/task_B_En_test.csv')
print('Row Count:', test_data_b.shape[0])

Row Count: 3468
Row Count: 1400
Row Count: 1400


In [8]:
# Task A, training data: keep the tweet text and its 'sarcastic' column, then clean the text
train_data_a = process_dataset(train_data[['tweet', 'sarcastic']], 'tweet')

print('=================================================')

# Task A, test data: the text column is named 'text' there, so rename it to
# 'tweet' to match the training data before cleaning
print('Processing Test Data for Task A')
processed_test_data_a = test_data_a[['text', 'sarcastic']].rename(columns={'text': 'tweet'})
processed_test_data_a = process_dataset(processed_test_data_a, 'tweet')

Empty Row Count: 1
Dropping Empty Rows
Empty Row Count: 0
Encoding Emoji
Removing stopwords
Cleaning-up Unwanted Symbols
Removing potential duplicates
Processing Test Data for Task A
Empty Row Count: 0
Dropping Empty Rows
Empty Row Count: 0
Encoding Emoji
Removing stopwords
Cleaning-up Unwanted Symbols
Removing potential duplicates


In [9]:
# Task B, training data: keep the rephrased text and the six category columns, then clean the text
train_data_b = process_dataset(
    train_data[
        ['rephrase', 'sarcasm', 'irony', 'satire', 'understatement', 'overstatement', 'rhetorical_question']
    ],
    'rephrase',
)

print("=================================================")

# Task B, test data: the text column is named 'text' there, so rename it to
# 'rephrase' to match the training data before cleaning
processed_test_data_b = test_data_b[
    ['text', 'sarcasm', 'irony', 'satire', 'understatement', 'overstatement', 'rhetorical_question']
].rename(columns={'text': 'rephrase'})
processed_test_data_b = process_dataset(processed_test_data_b, 'rephrase')

Empty Row Count: 2601
Dropping Empty Rows
Empty Row Count: 0
Encoding Emoji
Removing stopwords
Cleaning-up Unwanted Symbols
Removing potential duplicates
Empty Row Count: 0
Dropping Empty Rows
Empty Row Count: 0
Encoding Emoji
Removing stopwords
Cleaning-up Unwanted Symbols
Removing potential duplicates


In [10]:
# Write processed datasets to separate files in the `clean_data` directory.
# The directory that gets created must be the one the files are written to
# ('../clean_data', next to '../data'); creating 'clean_data' relative to the
# working directory would leave the to_csv calls without a target folder.
# exist_ok=True makes re-running the cell safe.
os.makedirs('../clean_data', exist_ok=True)

train_data_a.to_csv('../clean_data/train_task_a.csv', index=False)
processed_test_data_a.to_csv('../clean_data/test_task_a.csv', index=False)

train_data_b.to_csv('../clean_data/train_task_b.csv', index=False)
processed_test_data_b.to_csv('../clean_data/test_task_b.csv', index=False)

print('File creation successful!')

File creation successful!
