In [1]:
import os
import spacy
import string
import pandas as pd
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

# Tokenise text data

A notebook to tokenise, lemmatise and clean the text data using `spacy`.

Tokenised data will be saved in `Data/Tokenised_Text/`.

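Actual process (applied to each comment):
1. Tokenise
2. Lemmatise and lowercase each token
3. Remove stopwords and punctuation
4. Re-join the remaining tokens into a single space-separated string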

## Data to process

In [9]:
# Raw CSVs are read from ../Data; tokenised copies are written to
# ../Data/Tokenised_Text.
INPUT_PATH = os.path.join(os.pardir, 'Data')
OUT_PATH = os.path.join(INPUT_PATH, 'Tokenised_Text')
TRAIN_FILE, TEST_FILE = 'train.csv', 'test.csv'

# Full input paths for the train / test splits.
train_path, test_path = (
    os.path.join(INPUT_PATH, csv_name) for csv_name in (TRAIN_FILE, TEST_FILE)
)
# Matching output paths for the tokenised versions.
train_out_path, test_out_path = (
    os.path.join(OUT_PATH, csv_name)
    for csv_name in ('tokenised_train.csv', 'tokenised_test.csv')
)

## Define processing method

In [3]:
# Punctuation to ignore. Deliberately kept as a plain string: the tokenizer
# tests `word not in punctuations`, which on a str is a substring check and
# therefore also discards empty tokens ('' is a substring of every string).
punctuations = string.punctuation

# Stopwords to ignore. Stored as a set so that the per-token membership test
# in the tokenizer is a hash lookup instead of a linear scan over a list.
stopwords = set(STOP_WORDS)

# spaCy English pipeline, instantiated directly from the language class
# (no trained model is loaded via spacy.load).
parser = English()

In [4]:
# Tokenization function
def spacy_tokenizer(sentence):
    """Tokenise, lemmatise and filter a single piece of text.

    The text is split into tokens with the module-level ``parser``. Each token
    is replaced by its lowercased, whitespace-stripped lemma, except when the
    lemma is the ``'-PRON-'`` placeholder, in which case the token's lowercased
    surface form is used. Tokens found in ``stopwords`` or ``punctuations``
    are then dropped.

    Parameters
    ----------
    sentence : str
        Text to process. Missing values (``None``, ``NaN``) yield an empty
        string; any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    if not isinstance(sentence, str):
        # pandas reads empty CSV fields as NaN (a float), which is not text
        # and would otherwise make the parser call fail mid-run.
        if pd.isna(sentence):
            return ''
        sentence = str(sentence)

    kept = []
    for token in parser(sentence):
        if token.lemma_ == '-PRON-':
            # Pronoun placeholder carries no information: keep the word itself.
            text = token.lower_
        else:
            text = token.lemma_.lower().strip()
        # `punctuations` is a str, so the second test is a substring check;
        # it also discards '' (e.g. from whitespace-only tokens).
        if text not in stopwords and text not in punctuations:
            kept.append(text)
    return ' '.join(kept)

In [5]:
# Overall processing function
def process(df_path):
    """Load a CSV and tokenise its ``comment_text`` column.

    Parameters
    ----------
    df_path : str
        Path of the CSV file to read. It must contain an ``id`` column and a
        ``comment_text`` column.

    Returns
    -------
    pandas.DataFrame
        The file's contents indexed by ``id``, with every ``comment_text``
        value replaced by the output of ``spacy_tokenizer``. All other
        columns are left as read.
    """
    frame = pd.read_csv(df_path).set_index('id')
    frame['comment_text'] = frame['comment_text'].apply(spacy_tokenizer)
    return frame

## Process

**Note:** In the final submission we cannot use a preprocessed test set - the test data must be preprocessed inside the submission kernel itself.

In [6]:
%%time
# Tokenise the training set. This is the slow step: the saved output of this
# cell reports a wall time of roughly 23.5 minutes.
train_df = process(train_path)
# Sanity check: (rows, columns) of the processed frame, with `id` as the index.
print(train_df.shape)

(1804874, 44)
Wall time: 23min 34s


In [7]:
%%time
# Tokenise the test set with the same pipeline as the training set; the saved
# output of this cell reports a wall time of roughly 1.5 minutes.
test_df = process(test_path)
# Sanity check: (rows, columns) of the processed frame, with `id` as the index.
print(test_df.shape)

(97320, 1)
Wall time: 1min 35s


## Save processed data

In [10]:
# Make sure the output folder exists before writing: to_csv does not create
# missing directories, and failing here would waste the ~25 minutes of
# tokenisation done above.
os.makedirs(OUT_PATH, exist_ok=True)

# Write the tokenised frames; `id` is the index, so it is saved as the first column.
train_df.to_csv(train_out_path)
test_df.to_csv(test_out_path)