## Data Preprocessing Script

In [1]:
# Step 1: Remove stop words and punctuation (done)
# Step 2: Check if text has line breaks and remove those (done)
# Step 3: Separate words like I'm to I am (done)
# Step 4: Remove names, dates, etc. (named entities)
# Step 5: Tokenize the text (done)
# Step 6: Change the text to lower case (done)
# Step 7: Lemmatize the words (done)

# REMOVE PERSON, LOC, ORG, GPE

In [13]:
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import wordpunct_tokenize, WordNetLemmatizer, sent_tokenize, pos_tag
from nltk.corpus import stopwords as sw, wordnet as wn

import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
import spacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()


In [14]:
class comment_preprocessor:
    
    def __init__(self):
        self.stop_words = list(set(stopwords.words('english')))
        punctuation = string.punctuation
        self.stop_words += list(punctuation)
        self.stop_words.extend(['``','’', '`','br','"',"”", "''", "'s"])
    
    def lemmatize(self, token, tag):
        """
        Converts NLTK tag to a WordNet POS tag, then uses that   
        tag to perform an accurate WordNet lemmatization.
        """
        tag = tag.lower()

        if str.startswith(tag, 'v'):
            tag = wn.VERB
        elif str.startswith(tag, 'j'):
            tag = wn.ADJ
        elif str.startswith(tag, 'r'):
            tag = wn.ADV
        else:
            tag = wn.NOUN
        
        return WordNetLemmatizer().lemmatize(token, tag)
    
    def preprocess_text(self, document_list):
        """
        Preprocesses list of documents by reomving stop words, tokenizing, 
        lemmatizing, and converting short word forms into complete words.
        """
        cleaned_document = []
        vocab = []

        # For each document inside the corpus
        for sent in document_list:

            sent = sent.lower()
            sent = re.sub(r"what's", "what is ", sent)
            sent = re.sub(r"\'ve", " have ", sent)
            sent = re.sub(r"can't", "cannot ", sent)
            sent = re.sub(r"n't", "not ", sent)
            sent = re.sub(r"i'm", "i am ", sent)
            sent = re.sub(r"\'re", " are ", sent)
            sent = re.sub(r"\'d", " would ", sent)
            sent = re.sub(r"\'ll", " will ", sent)
            sent = sent.replace("\n", " ")
            sent = sent.replace(r'<br />',' ')

            lemmatized_tokens = []

            # Break the sentence into part of speech tagged tokens
            for token, tag in pos_tag(wordpunct_tokenize(sent)):

                # Apply preprocessing to the tokens
                token = token.strip()

                if token in self.stop_words:
                    continue

                # Lemmatize the token
                lemma = self.lemmatize(token, tag)
                lemmatized_tokens.append(lemma)
                vocab.append(lemma)

            cleaned_document.append(lemmatized_tokens)

        vocab = sorted(list(set(vocab)))

        return cleaned_document, vocab

In [15]:
corpus = ["It isn't a great day. I loved the movie and spending time with you.", 
              "The sky is not always blue underneath. Remember that."]

In [16]:
c_pp = comment_preprocessor()
clean_doc_2, vocab_2 = c_pp.preprocess_text(corpus)

In [17]:
clean_doc_2

[['isnot', 'great', 'day', 'love', 'movie', 'spending', 'time'],
 ['sky', 'always', 'blue', 'underneath', 'remember']]

In [18]:
## Usage example with training data

# clean_doc, vocab = c_pp.preprocess_text(list(X_train_q1['Comment']))

In [None]:
## For TFIDF, convert list of lists into list of sentences using the following code

# final_comments = []
# for docs in clean_doc_2:
#         final_comments.append(' '.join(docs))