In [135]:
import pandas as pd
import re
import os

from bs4 import BeautifulSoup    #html tags
import unidecode                 #accented chars
import contractions              #expand contractions
import spacy                     #Preprocessing   # !python -m spacy download en_core_web_sm

import nltk
from nltk.tokenize import word_tokenize #tokenization # nltk.download('punkt')
from nltk.corpus import stopwords       #Stopwords  # nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer #Lemmatizing # nltk.download('wordnet')
from nltk.stem import PorterStemmer     #Stemming

## Read Data from File

In [3]:
file_path = os.path.join(os.getcwd(), '..', 'Data','1_Clean.csv')
df = pd.read_csv(file_path)

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,"Unique designs This book has beautiful photos,...",positive
1,Great Book Loved their approach in this book a...,positive
2,Five Stars great,positive
3,"Great Book! Always love the way Eva thinks, an...",positive
4,Five Stars Nice patterns,positive


In [247]:
df['sentiment'].value_counts()

sentiment
positive    7400867
negative    1870734
Name: count, dtype: int64

More positive reviews compared to the negative ones

## Text preprocessing
1. Lowercasing - can also use uppercasing
2. HTML Tags - removing html tags if review contains any such tags while scrapping the reviews
3. Contractions - expanding contractions - don't to do not
4. Word to Numbers - Converting words into numbers - One to 1
5. Remove Digits - Removing any digits
6. Accented Characters - Removing any accented characters - Boutiqué to Boutique
7. Special Characters - Removing any non-alphanumeric characters - includes punctuation, special characters or emoticons
8. Whitespaces - Removes extra white spaces from whole review after all above updations

Can directly use the functions on the reviews to update them 

In [93]:
def lowercasing(text):
    ## changing all words to lowercase
    return text.lower()

In [92]:
def remove_html_tags(text):
    ## Remove any html tags from reviews using BeautifulSoup and lxml parser
    soup = BeautifulSoup(text, 'lxml')
    return soup.get_text(separator=" ")

In [91]:
def expand_contractions(text):      #https://medium.com/@lukei_3514/dealing-with-contractions-in-nlp-d6174300876b
    ## expand contractions and abbrevations like don't to do not
    return contractions.fix(text)

In [90]:
word_2_number_dict = {'zero': '0', 'one': '1', 'two': '2',   'three': '3', 'four': '4',
                      'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9'}

def word_2_number(text):
    ## Convert words into numbers like one to 1
    text = text.split()
    return ' '.join(word_2_number_dict.get(word, word) for word in text)

In [89]:
def remove_digits(text):
    return ''.join(word for word in text if not word.isdigit())

In [88]:
def remove_accented_characters(text):
    ## remove accented characters from review like Boutiqué or using unicode module
    return unidecode.unidecode(text)

In [87]:
def remove_special_characters(text):
    ## remove non-alphanumeric (\w - [a-zA-Z0-9_], \s - spaces) 
    return re.sub(r'[^\w\s]', '', text)

In [86]:
def remove_whitespaces(text):
    ## remove extra whitespaces 
    return ' '.join(text.strip().split())

### Function Summary: Tokenization, Stopword Removal, Stemming, and Lemmatization

Functions for both `nltk` and `spacy` use any one
1. tokenization
2. stopwords removal - ignored (no, nor, not) for preserving meaning of negative comments
3. stemming - using PortStemmer()
4. lemmatization - using WordNetLemmatizer() or spacy.lemma_

If using both stemming and lemmatization for spacy then make sure to uncomment the commented code in the last function (lemmatization if performing stemming and then lemmatization and stemming if performing lemmatization and then stemming) as spacy requires the doc datatype for processing

In [182]:
def create_tokens_nltk(text):
    ## Create Tokens using nltk. 
    # nltk.download('punkt')
    return nltk.word_tokenize(text)

In [197]:
# !python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")  #Spacy pretrained model for English

def create_tokens_spacy(text):
    # create doc type tokens using spacy
    return nlp(text)

In [184]:
stop_words = set(stopwords.words("english")) - {'no', 'nor', 'not'}

def remove_stopwords_nltk(tokens):
    ## removing stopwords using nltk for nltk tokens
    # nltk.download('stopwords')
    return [token for token in tokens if token not in stop_words]

In [185]:
stop_words = set(stopwords.words("english")) - {'no', 'nor', 'not'}

def remove_stopwords_spacy(tokens):
    ## removing stopwords using nltk for spacy doc tokens
    # nltk.download('stopwords')
    return [token for token in tokens if token.text not in stop_words]

In [186]:
stemmer = PorterStemmer()

def stemming_nltk(tokens):
    ## Stemming using nltk.PorterStemmer for nltk tokens
    return [stemmer.stem(token) for token in tokens]

In [187]:
stemmer = PorterStemmer()

def stemming_spacy(tokens):
    ## Stemming using nltk.PorterStemmer for spacy doc tokens
    # Uncomment below line only when using both lemmatization and then stemming
    # tokens = nlp(' '.join(token for token in tokens))  #creating doc type. if to be used after lemmatizing as .text requires doc datatype
    return [stemmer.stem(token.text) for token in tokens]

In [188]:
lemmatizer = WordNetLemmatizer()

def lemmatization_nltk(text):
    ## Lemmatization using nltk.WordNetLemmatizer
    # nltk.download('wordnet')    
    return [lemmatizer.lemmatize(token) for token in text]

In [225]:
def lemmatization_spacy(tokens):
    ## Lemmatization using spacy
    # Uncomment below line only when using both stemming and then lemmatization 
    tokens = nlp(' '.join(token for token in tokens))  #creating doc type. if to be used after stemming as .lemma_ requires doc datatype
    return [token.lemma_ for token in tokens]

## Applying all the Text preprocessing steps

In [202]:
df['review'] = df['review'].apply(lowercasing)

In [204]:
df['review'] = df['review'].apply(remove_html_tags)

  soup = BeautifulSoup(text, 'lxml')


In [205]:
df['review'] = df['review'].apply(expand_contractions)

In [207]:
df['review'] = df['review'].apply(word_2_number)

In [209]:
df['review'] = df['review'].apply(remove_digits)

In [211]:
df['review'] = df['review'].apply(remove_accented_characters)

In [212]:
df['review'] = df['review'].apply(remove_special_characters)

In [213]:
df['review'] = df['review'].apply(remove_whitespaces)

## Using Spacy for Tokenization, Stopwords removal, Stemming and Lemmatization

In [221]:
def process_batch(batch):
    batch['review'] = batch['review'].apply(create_tokens_spacy)
    batch['review'] = batch['review'].apply(remove_stopwords_spacy)
    batch['review'] = batch['review'].apply(stemming_spacy)
    # batch['review'] = batch['review'].apply(lemmatization_spacy)
    return batch

In [244]:
from tqdm import tqdm

batch_size = 1000
batch_from = 10000
batch_to = len(df)

tqdm.pandas(desc="Processing")
for i in tqdm(range(batch_from, batch_to, batch_size)):
    batch = df.iloc[i:i+batch_size].copy()
    df.iloc[i:i+batch_size] = process_batch(batch)

100%|███████████████████████████████████████████████████████████████████████████| 9262/9262 [31:35:10<00:00, 12.28s/it]


In [245]:
df.iloc[10000]

review       [great, qualiti, hard, argu, convers, qualiti,...
sentiment                                             positive
Name: 10000, dtype: object

In [252]:
df['review'] = df['review'].apply(lambda words: ' '.join(words))

In [268]:
df.head()

Unnamed: 0,review,sentiment
0,uniqu design book beauti photo good understand...,positive
1,great book love approach book paperback easi u...,positive
2,star great,positive
3,great book alway love way eva think fun design...,positive
4,star nice pattern,positive


## Saving the PreProcessed data into a file

In [256]:
file_path = os.path.join(os.getcwd(), '..', 'Data','2_PreProcessed.csv')
df.to_csv(file_path, index=False)