## Data Preprocessing Script

In [1]:
# Step 1: Remove stop words and punctuation (done)
# Step 2: Check if text has line breaks and remove those (done)
# Step 3: Separate words like I'm to I am (done)
# Step 4: Remove names, dates, etc. (named entities)
# Step 5: Tokenize the text (done)
# Step 6: Change the text to lower case (done)
# Step 7: Lemmatize the words (done)

# Entity labels to remove in Step 4: PERSON, LOC, ORG, GPE

In [1]:
import pandas as pd
import numpy as np

import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import spacy
# from collections import Counter
import en_core_web_sm
# Load the `en_core_web_sm` spaCy model once; `preprocess_comments` below
# reads this module-level `nlp` object.
nlp = en_core_web_sm.load()

In [57]:
# Function adapted from the `preprocess` function of lab 3
# of this course (575 Advanced Machine Learning)

def preprocess_comments(text,
                        min_token_len=2,
                        irrelevant_pos=(
                            # 'PROPN' is left out on purpose: it erases proper
                            # names such as 'George', but also words such as
                            # 'orange' or 'apple'.
                            'PRON',
                            'SPACE',
                            'PUNCT',  # e.g.  . , ; ... " ? ! $
                            # 'DET' is left out on purpose: it removes 'no',
                            # which changes the meaning of a sentence.
                            'ADV',    # also removes words such as 'why'
                            'ADP',
                            'CCONJ',
                        ),
                        avoid_entities=('PERSON', 'ORG', 'LOC', 'GPE')):
    """
    Clean a collection of texts and return one space-joined string of
    lemmas per input text.

    Each text is lower-cased, the names of several social media platforms
    are replaced by "social media", and the result is parsed with the
    module-level spaCy pipeline `nlp`. A token is dropped when it looks
    like an e-mail address or URL, has a part-of-speech tag listed in
    `irrelevant_pos`, belongs to a named entity whose label is listed in
    `avoid_entities`, has the lemma "'s", or is shorter than
    `min_token_len` characters. The lemmas of the remaining tokens are
    joined with single spaces.

    NOTE(review): the text is lower-cased *before* it reaches spaCy, which
    may affect tagging / entity recognition -- confirm this ordering is
    intended.

    Parameters
    -------------
    text : (list of str, or str)
        the texts to be preprocessed; a single string is treated as a
        one-element list
    min_token_len : (int)
        minimum number of characters a token must have to be kept
    irrelevant_pos : (list or tuple of str)
        part-of-speech tags whose tokens are removed
    avoid_entities : (list or tuple of str)
        named-entity labels whose tokens are removed

    Returns
    -------------
    (list) list of preprocessed text, one string per input text

    Example
    -------------
    >>> example = ["Hello, I'm George and I love swimming!",
    ...            "I am a really good cook; what about you?",
    ...            "Contact me at george23@gmail.com"]

    >>> preprocess_comments(example)
    ['hello be love swimming', 'be good cook', 'contact']
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(text, str):
        text = [text]

    # Lemmas that are always discarded (possessive / contracted "'s").
    dropped_lemmas = {"'s"}

    # One pass replaces every platform name; the replacement text contains
    # none of the alternatives, so this matches the sequential substitutions.
    platform_pattern = re.compile(
        r"facebook|twitter|instagram|whatsapp|linkedin|snapchat"
    )
    normalized = [platform_pattern.sub("social media", sent.lower())
                  for sent in text]

    result = []

    # nlp.pipe processes the texts as a batch and yields one Doc per text.
    for doc in nlp.pipe(normalized):
        kept_lemmas = [
            token.lemma_                           # carries out lemmatization
            for token in doc
            if not (token.like_email
                    or token.like_url
                    or token.pos_ in irrelevant_pos
                    # Check the entity label of the token itself so that every
                    # token of a multi-word entity (e.g. "new york") is removed.
                    or token.ent_type_ in avoid_entities
                    or token.lemma_ in dropped_lemmas
                    or len(token) < min_token_len)
        ]
        result.append(" ".join(kept_lemmas))

    return result

In [30]:
# Sample sentences used to sanity-check the preprocessing: platform names,
# contractions, possessives, punctuation and person names.
corpus = [
    "Why I got this information from Facebook today",
    "News from Whatsapp isn't reliable",
    "Do you eat apples?",
    "Apple is a profitable company",
    "Twitter is a great source of information!!!",
    "Victor's computer's are great!!",
    "Isn't it a great day, Bob? I loved the movie and spending time with you.",
    "The sky is not always blue underneath. Remember that.",
]

In [31]:
# Run the preprocessing with its default settings on the sample corpus.
preprocess_comments(corpus)

['get this information social medium today',
 'news social medium be not reliable',
 'do eat apple',
 'be profitable company',
 'social media be great source information',
 'computer be great',
 'be not great day love the movie spending time',
 'the be not blue remember that']

In [55]:
# Same sentences as the example in the `preprocess_comments` docstring.
example = [
    "Hello, I'm George and I love swimming!",
    "I am a really good cook; what about you?",
    "Contact me at george23@gmail.com",
]

In [56]:
# Reproduce the docstring example with the default settings.
preprocess_comments(example)

['hello be love swimming', 'be good cook', 'contact']