## Data Preparation
#### First we analyze the data:
1. Search for missing values
2. Check dataset balance
#### Then we use NLP techniques such as:
1. Word separation (phrase breaking)
2. Tokenization
3. Part-of-speech (POS) filtering
4. Lower casing and lemmatization

In [None]:
# Imports
import pandas as pd
import seaborn as sns

from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from matplotlib import pyplot
from utils.phrase_breaker import phrase_breaker
from collections import Counter

## Data Analysis

In [None]:
# Load the reviews CSV into a dataframe; the column names are supplied
# explicitly through `names` rather than taken from the file.
data = pd.read_csv(
    "data/reviews.csv",
    names=["Review Text", "Stars", "Polarity"],
)
data

In [None]:
# True when at least one cell anywhere in the dataframe is missing
data.isna().to_numpy().any()

In [None]:
# Dataset balance: number of rows for each value of the "Stars" column
sns.countplot(data=data, x="Stars")

In [None]:
# The 25 most frequent whitespace-separated tokens across all reviews
counter = Counter(
    token
    for review in data["Review Text"]
    for token in review.split()
)
most_occur = counter.most_common(25)
words, freq = zip(*most_occur)

freq_words = pd.DataFrame({"Frequency": freq, "Word": words})

# Draw the word/frequency pairs on a wide figure so the word labels have room
fig, ax = pyplot.subplots(figsize=(15.0, 5.0))
sns.lineplot(data=freq_words, x="Word", y="Frequency", ax=ax)

In [None]:
# Baseline vocabulary: the distinct whitespace-separated words of the reviews
# as loaded. Its size is stored as the "original" row of the comparison table
# (mainly for performance evaluation purposes).

original_set = {
    word
    for review_words in data["Review Text"].str.split()
    for word in review_words
}

original_data = {
    "Number of Words": [len(original_set)],
    "Type": ["original"],
}

pf_df = pd.DataFrame(original_data)
pf_df

## Data Preparation

In [None]:
# Applying phrase breaker to reviews
# `phrase_breaker` is a project helper (utils.phrase_breaker); it is called once
# per review and its result overwrites the "Review Text" column. Because the
# column is overwritten, re-running this cell feeds already-processed text back
# into the helper.
# NOTE(review): presumably the helper separates words that are stuck together —
# confirm against its implementation.
data["Review Text"] = data["Review Text"].apply(phrase_breaker)
data["Review Text"].head()

In [None]:
# Evaluate the change in vocabulary size after separating every word in every review
word_sep_set = {
    word
    for review_words in data["Review Text"].str.split()
    for word in review_words
}

word_sep_data = {
    'Number of Words': len(word_sep_set),
    'Type': "word_separation",
}

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any existing row for this stage is dropped first so that re-running the cell
# does not duplicate it in the comparison table.
pf_df = pd.concat(
    [pf_df[pf_df["Type"] != word_sep_data["Type"]], pd.DataFrame([word_sep_data])],
    ignore_index=True,
)

sns.barplot(x="Type", y="Number of Words", data=pf_df)

In [None]:
# Replace each review string with the token list produced by NLTK's word_tokenize,
# ready for the pre-processing steps that follow
data["Review Text"] = data["Review Text"].map(word_tokenize)
data["Review Text"].head()

### POS Tagging

In [None]:
# Run NLTK's pos_tag over the token list of every review; the tagged result is
# kept in its own series so the review column itself is left untouched here
pos_tagged_vocab = data["Review Text"].map(pos_tag)
pos_tagged_vocab.head()

In [None]:
# Tags kept by default: adjectives (JJ*), common nouns (NN, NNS), adverbs (RB*)
# and verbs (VB*). Proper-noun tags (NNP, NNPS) are not in the set.
DEFAULT_POS_TAGS = frozenset({
    "JJ", "JJR", "JJS",
    "NN", "NNS",
    "RB", "RBR", "RBS",
    "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
})


def select_categories(pos_tagged_words, pos_tags_filter=DEFAULT_POS_TAGS):
    """Return the words whose part-of-speech tag is in `pos_tags_filter`.

    Parameters
    ----------
    pos_tagged_words : iterable of (word, tag) pairs
        One tagged review.
    pos_tags_filter : container of str, optional
        Tags to keep. Defaults to the adjective, common-noun, adverb and verb
        tags listed in DEFAULT_POS_TAGS.

    Returns
    -------
    list
        The kept words in their original order, without their tags.
    """
    # The loop variable is called `tag` so it does not shadow the imported
    # nltk `pos_tag` function inside this scope.
    return [word for word, tag in pos_tagged_words if tag in pos_tags_filter]

# Overwrite each review with the words returned by select_categories for its
# tagged tokens
data["Review Text"] = pos_tagged_vocab.map(select_categories)
data["Review Text"].head()

In [None]:
# Evaluate the decrease in complexity levels after filtering data through POS Tagging
pos_tagging_set = {
    word
    for review_words in data["Review Text"]
    for word in review_words
}

pos_tagging_data = {
    'Number of Words': len(pos_tagging_set),
    'Type': "pos_tagging",
}

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any existing row for this stage is dropped first so that re-running the cell
# does not duplicate it in the comparison table.
pf_df = pd.concat(
    [pf_df[pf_df["Type"] != pos_tagging_data["Type"]], pd.DataFrame([pos_tagging_data])],
    ignore_index=True,
)

sns.barplot(x="Type", y="Number of Words", data=pf_df)

### Post-Processing with lower casing and lemmatization to reduce complexity

In [None]:
# All review text is transformed into lower case
def lower_casing(words):
    """Return a new list holding the lower-cased form of every string in `words`."""
    return [word.lower() for word in words]

# Lower-case the word list of every review
data["Review Text"] = data["Review Text"].map(lower_casing)
data["Review Text"].head()

In [None]:
# Evaluate the decrease in complexity levels after lower casing every review
lower_case = {
    word
    for review_words in data["Review Text"]
    for word in review_words
}

lower_case_data = {
    'Number of Words': len(lower_case),
    'Type': "lower_case",
}

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any existing row for this stage is dropped first so that re-running the cell
# does not duplicate it in the comparison table.
pf_df = pd.concat(
    [pf_df[pf_df["Type"] != lower_case_data["Type"]], pd.DataFrame([lower_case_data])],
    ignore_index=True,
)

sns.barplot(x="Type", y="Number of Words", data=pf_df)

In [None]:
# Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

def lemmatization(sentence):
    """Lemmatize every word of `sentence` with pos="v" and return a new list.

    NOTE(review): the verb POS is used for every word, although the POS filter
    earlier in the notebook also keeps adjectives, nouns and adverbs — confirm
    that treating all of them as verbs is intended.
    """
    return [wordnet_lemmatizer.lemmatize(word, pos="v") for word in sentence]

# Lemmatize the word list of every review
data["Review Text"] = data["Review Text"].map(lemmatization)
data["Review Text"].head()

In [None]:
# Evaluate the decrease in complexity levels after lemmatization
# The vocabulary set gets its own name: calling it `lemmatization` would rebind
# the function of the same name, so re-running the cell that applies that
# function would then fail.
lemmatized_vocab = {
    word
    for review_words in data["Review Text"]
    for word in review_words
}

lemmatization_data = {
    'Number of Words': len(lemmatized_vocab),
    'Type': "lemmatization",
}

# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat.
# Any existing row for this stage is dropped first so that re-running the cell
# does not duplicate it in the comparison table.
pf_df = pd.concat(
    [pf_df[pf_df["Type"] != lemmatization_data["Type"]], pd.DataFrame([lemmatization_data])],
    ignore_index=True,
)

sns.barplot(x="Type", y="Number of Words", data=pf_df)

In [None]:
# Re-assemble each review's word list into a single space-separated string
data["Review Text"] = data["Review Text"].map(" ".join)

In [None]:
# Share data between notebooks: write the processed dataframe to CSV without
# the index column and without a header row
data.to_csv(
    'data/filtered_reviews.csv',
    header=False,
    index=False,
)