## Data Preparation
#### First we analyze the data:
1. Search for missing values
2. Check dataset balance
#### Then we use NLP techniques such as:
1. Stemming
2. Tokenization
3. Stop-word removal

In [None]:
# Imports
import pandas as pd
import seaborn as sns

from phrase_breaker import phrase_breaker

## Data Analysis

In [None]:
# Load the reviews CSV into a dataframe, naming the three columns explicitly.
# NOTE(review): with `names=` given and no `header=`, pandas reads the first
# line of the file as a data row — confirm reviews.csv has no header line.
data = pd.read_csv(
    "../reviews.csv",
    names=["Review Text", "Stars", "Polarity"],
)
data

In [None]:
# True if at least one cell anywhere in the dataframe is missing
data.isna().to_numpy().any()

In [None]:
# Class balance: one bar per "Polarity" value, bar height = number of reviews
sns.countplot(data=data, x="Polarity")

In [None]:
# Baseline vocabulary: every distinct whitespace-separated token found in the
# raw review text. Kept as the reference point for later comparisons.

original = set()
for tokens in data["Review Text"].str.split():
    # Each entry is the token list of one review; merge it into the vocabulary
    original.update(tokens)

# One-row summary: vocabulary size of the untouched text
original_data = {
    'Number of Words': [len(original)],
    'Type': ["original"],
}

pf_df = pd.DataFrame(original_data)
pf_df

In [None]:
# Word Separation
# Compare the number of distinct words before and after using the phrase breaker.
# The measurement runs on a copy of `data`, so `data` itself is not modified here.
benchmark_df = data.copy()

# Collect every distinct whitespace-separated token of the phrase-broken reviews
word_sep = set()
for tokens in benchmark_df["Review Text"].apply(phrase_breaker).str.split():
    word_sep.update(tokens)

word_sep_data = {
    'Number of Words': len(word_sep),
    'Type': "word_separation",
}

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# Any "word_separation" row left over from an earlier run of this cell is
# dropped first, so re-running the cell does not accumulate duplicate rows.
pf_df = pd.concat(
    [
        pf_df[pf_df["Type"] != word_sep_data["Type"]],
        pd.DataFrame([word_sep_data]),
    ],
    ignore_index=True,
)

# Side-by-side vocabulary sizes, one bar per "Type"
sns.barplot(x="Type", y="Number of Words", data=pf_df)

## Data Preparation

In [None]:
# Applying phrase breaker to reviews
# Overwrites the "Review Text" column of `data` with the phrase_breaker output.
# NOTE(review): re-running this cell feeds already-processed text back into
# phrase_breaker — confirm that is harmless, or restart and run all instead.
data["Review Text"] = data["Review Text"].apply(phrase_breaker)
# Preview the first rows of the transformed column
data["Review Text"].head()