## Data Preprocessing Script

In [1]:
# Step 1: Remove stop words and punctuation (done)
# Step 2: Check if text has line breaks and remove those (done)
# Step 3: Separate words like I'm to I am (done)
# Step 4: Remove names, dates, etc. (named entities)
# Step 5: Tokenize the text (done)
# Step 6: Change the text to lower case (done)
# Step 7: Lemmatize the words (done)

# REMOVE PERSON, LOC, ORG, GPE

In [2]:
import pandas as pd
import numpy as np

import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
import spacy
# from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [4]:
# Function adapted from the `preprocess` function shared by Varada
# in course 575, Advanced Machine Learning

def preprocess_comments(text, 
                        min_token_len = 2, 
                        irrelevant_pos = ['PRON', 'SPACE', 'PUNCT', 'ADV', 
                                          'ADP', 'CCONJ', 'AUX', 'PRP'],
                        avoid_entities = ['PERSON', 'ORG', 'LOC', 'GPE']):
    """
    Clean a collection of comments and return one preprocessed string per comment.

    Each comment is lower-cased, social-media brand names are replaced by the
    phrase "social media", the text is parsed with the module-level spaCy
    pipeline `nlp`, and the lemmas of the tokens that survive the filters
    below are joined with single spaces.

    A token is dropped when any of the following holds:
      - it looks like an e-mail address or a URL,
      - its coarse POS tag is listed in `irrelevant_pos`,
      - it belongs to a named entity whose label is listed in `avoid_entities`,
      - its lemma is one of a small set of extra stop words,
      - it is shorter than `min_token_len` characters.

    Parameters
    -------------
    text : (list) 
        the list (or any iterable, e.g. a pandas Series) of text to be
        preprocessed; entries that are not strings (e.g. None / NaN) are
        treated as empty comments
    min_token_len : (int) 
        minimum number of characters a token must have to be kept
    irrelevant_pos : (list) 
        a list of POS tags (compared against `token.pos_`) to be dropped
    avoid_entities : (list)
        a list of named-entity labels whose tokens are dropped

    Returns
    -------------
    (list) list of preprocessed text, same length and order as `text`

    Notes
    -------------
    The following POS tags are deliberately NOT in the default `irrelevant_pos`:
      - 'PROPN' because it erases proper names as 'George', but also words
        as orange.
      - 'DET' since it removes the word 'no', which changes the meaning of
        a sentence.
    For more information see: https://universaldependencies.org/u/pos/

    Example
    -------------
    >>> example = ["Hello, I'm George and I love swimming!",
                   "I am a really good cook; what about you?",
                   "Contact me at george23@gmail.com"]

    >>> preprocess_comments(example)
    (output:) ['hello love swimming', 'good cook', 'contact']
    (the exact output depends on the loaded spaCy model)
    """

    # Replaced one after another, in this order, on the lower-cased text.
    social_media_names = ("facebook", "twitter", "instagram",
                          "whatsapp", "linkedin", "snapchat")

    # Extra lemmas to drop. "-PRON-" is meant to erase "my", "your", etc.;
    # the alternative of adding 'DET' to irrelevant_pos would erase the
    # word 'no' too.
    others = {"'s", "the", "that", "this", "to", "-PRON-"}

    result = []

    for sent in text:
        # Missing values (None, NaN floats coming from read_csv, ...) have no
        # .lower(); map them to an empty comment so the output stays aligned
        # one-to-one with the input rows.
        if not isinstance(sent, str):
            sent = ""

        sent = sent.lower()
        for name in social_media_names:
            sent = sent.replace(name, "social media")

        doc = nlp(sent)

        # Entity filtering is done per token through `ent_type_` rather than by
        # comparing the token text with whole entity strings: the latter never
        # matched multi-word entities and could drop an unrelated token that
        # merely shared its text with an entity elsewhere in the sentence.
        kept_lemmas = [
            token.lemma_
            for token in doc
            if not (token.like_email
                    or token.like_url
                    or token.pos_ in irrelevant_pos
                    or token.ent_type_ in avoid_entities
                    or token.lemma_ in others
                    or len(token) < min_token_len)
        ]
        result.append(" ".join(kept_lemmas))

    return result

In [5]:
# Sample sentences covering social-media names, proper names,
# contractions, possessives and negations.
example_1 = [
    "Why I got this information from Facebook today?",
    "News from Whatsapp isn't reliable",
    "Do you eat apples?",
    "Apple is a profitable company",
    "Twitter is a great source of information!!!",
    "Sukriti, Carlina, Victor and Karan, are a great team!",
    "Steve's computer's are great!!",
    "Isn't it a great day, Bob? I loved the movie and spending time with you.",
    "The sky is not always blue underneath. Remember that.",
]

In [6]:
# Run the preprocessing on example_1; as the last expression of the cell,
# the returned list is displayed as the cell output.
preprocess_comments(example_1)

['get information social medium today',
 'news social medium not reliable',
 'eat apple',
 'profitable company',
 'social media great source information',
 'great team',
 'computer great',
 'not great day love movie spending time',
 'not blue remember']

In [7]:
# Same sentences as the example given in the preprocess_comments docstring.
example_2 = [
    "Hello, I'm George and I love swimming!",
    "I am a really good cook; what about you?",
    "Contact me at george23@gmail.com",
]

In [8]:
# Run the preprocessing on example_2; as the last expression of the cell,
# the returned list is displayed as the cell output.
preprocess_comments(example_2)

['hello love swimming', 'good cook', 'contact']

### Preprocess BC Stats datasets
*This code preprocesses the "Comments" fields, which contain sensitive data.*

In [9]:
# Note: this cell assumes that your datasets
#   are located in the "../data/" directory.

##################
### Question 1 ###
##################

# train dataset
X_train = pd.read_csv("../data/X_train.csv")
X_train_pp = preprocess_comments(X_train['Comment'])

# validation dataset
X_valid = pd.read_csv("../data/X_valid.csv")
X_valid_pp = preprocess_comments(X_valid['Comment'])


##################
### Question 2 ###
##################

### Supervised:

# train dataset
X_train_q2 = pd.read_csv("../data/X_train_q2.csv")
X_train_q2_pp = preprocess_comments(X_train_q2['Comment'])

# validation dataset
X_valid_q2 = pd.read_csv("../data/X_valid_q2.csv")
X_valid_q2_pp = preprocess_comments(X_valid_q2['Comment'])


### Unsupervised:
unsuperv_q2 = pd.read_csv("../data/unsuperv_q2.csv")
# Preprocess the unsupervised comments themselves. X_train_q2 was passed here
# before, which made the saved "unsupervised" file a copy of the Q2 train set.
# Assumes unsuperv_q2.csv also has a 'Comment' column -- TODO confirm.
unsuperv_q2_pp = preprocess_comments(unsuperv_q2['Comment'])

####################################
### Saving Preprocessed datasets ###
####################################

# Each preprocessed list is written as a single-column ('Comment') CSV
# next to its source file, with a "_pp" suffix.
for comments_pp, out_path in [
    (X_train_pp, '../data/X_train_pp.csv'),
    (X_valid_pp, '../data/X_valid_pp.csv'),
    (X_train_q2_pp, '../data/X_train_q2_pp.csv'),
    (X_valid_q2_pp, '../data/X_valid_q2_pp.csv'),
    (unsuperv_q2_pp, '../data/unsuperv_q2_pp.csv'),
]:
    pd.DataFrame(comments_pp, columns=['Comment']).to_csv(out_path, index=False)