# Sentiment Analysis
## Text Preprocessing

In [None]:
import pandas as pd
import string, os

In [None]:
import spacy.cli

# Pipeline components that are switched off when the Portuguese model is loaded.
disable = [
    'tagger',
    'parser',
    'ner',
    'entity_ruler',
    'entity_linker',
    'textcat',
]
nlp = spacy.load('pt_core_news_lg', disable=disable)

In [None]:
import nltk
# Download the NLTK 'stopwords' resource.
nltk.download('stopwords')

In [None]:
# Remember the current working directory; the trailing expression displays it in the notebook.
path = os.getcwd()
path

### Load data

In [None]:
# Open the dataframe, reading only the columns used in this notebook.
columns = ['review_text', 'overall_rating', 'recommend_to_a_friend']
# Pass each path component separately so os.path.join inserts the separator
# of the running platform; a literal backslash before 'B' is an invalid
# string escape and only resolves as a path on Windows.
data_file = os.path.join(path, 'data', 'B2W-Reviews01.zip')
df = pd.read_csv(data_file, sep=',', compression='zip', usecols=columns)
print(df.shape)
df.head()

### Normalization

In [None]:
# Keep only rows rated 4 or above, or 2 or below; everything in between is dropped.
rating = df['overall_rating']
polarised_mask = (rating >= 4) | (rating <= 2)
corpus = df[polarised_mask].reset_index(drop=True).copy()
print(corpus.shape)
corpus.head()

In [None]:
punctuations = list(string.punctuation)
# Translation table that maps every punctuation character to a single space.
_punctuation_to_space = str.maketrans({symbol: " " for symbol in punctuations})


def remove_punctuation(text):
    """Return *text* with each ASCII punctuation character replaced by a space."""
    return text.translate(_punctuation_to_space)

In [None]:
# Lower-case accented characters and the plain letter each one is replaced with.
accentuation = {
    "á": "a",
    "ã": "a",
    "à": "a",
    "â": "a",
    "é": "e",
    "ê": "e",
    "í": "i",
    "ó": "o",
    "õ": "o",
    "ô": "o",
    "ú": "u",
    "ç": "c",
}
_accent_table = str.maketrans(accentuation)


def remove_accentuation(text):
    """Return *text* with the characters listed in ``accentuation`` replaced.

    Only the lower-case characters present in the mapping are touched; any
    other character (upper-case accented letters included) is kept as is.
    """
    return text.translate(_accent_table)

In [None]:
# Substrings that get replaced by a single space, in this exact order:
#   1. every digit,
#   2. single letters surrounded by spaces ('w' and 'y' are not in the list),
#   3. the currency markers "r$" and "$".
special_charaters = (
    list("1234567890")
    + [f" {letter} " for letter in "abcdefghijklmnopqrstuvxz"]
    + ["r$", "$"]
)


def remove_special_characters(text):
    """Return *text* with each entry of ``special_charaters`` replaced by a space.

    The replacements run one after another in list order, and each one only
    matches non-overlapping occurrences, so a run such as " a a " still
    leaves one isolated letter behind.
    """
    cleaned = text
    for pattern in special_charaters:
        cleaned = cleaned.replace(pattern, " ")
    return cleaned

In [None]:
def normalize_text(df, text_field, lower=True, rem_punct=True, rem_accent=True, rem_spec_caract=True, rem_name=False):
    """Add a cleaned copy of a text column to *df*, modifying *df* in place.

    The result is written to a column named ``text_field + "_clean"``; the
    source column is not changed and the function returns nothing.

    Args:
        df: DataFrame holding the text column.
        text_field: Name of the column to clean.
        lower: Lower-case the text.
        rem_punct: Apply ``remove_punctuation`` to every value.
        rem_accent: Apply ``remove_accentuation`` to every value.
        rem_spec_caract: Apply ``remove_special_characters`` to every value.
        rem_name: Accepted but not used by this function.
    """
    cleaned = df[text_field].astype(str)
    if lower:
        cleaned = cleaned.str.lower()
    if rem_punct:
        cleaned = cleaned.apply(remove_punctuation)
    if rem_accent:
        cleaned = cleaned.apply(remove_accentuation)
    if rem_spec_caract:
        cleaned = cleaned.apply(remove_special_characters)
    # Collapse every run of whitespace into one space (the steps above leave gaps).
    df[text_field + "_clean"] = cleaned.replace(r'\s+', ' ', regex=True)

In [None]:
# Create 'review_text_clean', then discard the raw text column from the same frame.
normalize_text(corpus, 'review_text')
corpus.drop('review_text', axis=1, inplace=True)

### Stop Words

In [None]:
# Compare the sizes of the NLTK and spaCy Portuguese stop-word lists and of their union.
stops_nltk = nltk.corpus.stopwords.words('portuguese')
stops_spacy = nlp.Defaults.stop_words
stop_words_join = {*stops_nltk, *stops_spacy}
print(f"stop_nltk: {len(stops_nltk)}\nstop_spacy: {len(stops_spacy)}\nstop_join: {len(stop_words_join)}")

In [None]:
# De-duplicated NLTK stop words with accents stripped by remove_accentuation.
stop_words = list({remove_accentuation(word) for word in stops_nltk})
# Frozen copy used for O(1) membership tests instead of scanning the list for
# every token. It is a snapshot: rebuild it if stop_words is changed later.
_stop_words_lookup = frozenset(stop_words)


def remove_stop_words(text):
    """Return *text* without the words found in ``stop_words``.

    The text is split on whitespace and the remaining words are joined back
    with single spaces.
    """
    kept = [word for word in text.split() if word not in _stop_words_lookup]
    return ' '.join(kept)

In [None]:
# Stop-word-free variant of the cleaned review text.
corpus["review_text_clean_stop"] = corpus["review_text_clean"].map(remove_stop_words)

### Lemmatization

In [None]:
def lemmatization(text):
    """Run *text* through ``nlp`` and return the token lemmas joined by spaces."""
    return ' '.join(token.lemma_ for token in nlp(text))

In [None]:
# Lemmatized variant of the cleaned review text.
corpus["review_text_clean_lema"] = corpus["review_text_clean"].map(lemmatization)

### Stemming

In [None]:
# Download the NLTK 'rslp' resource.
nltk.download('rslp')

In [None]:
stemmer = nltk.stem.RSLPStemmer()


def stematization(text):
    """Return *text* with every whitespace-separated word replaced by its stem."""
    return ' '.join(map(stemmer.stem, text.split()))

In [None]:
# Stemmed variant of the cleaned review text.
corpus["review_text_clean_stem"] = corpus["review_text_clean"].map(stematization)

In [None]:
corpus

### Hapax Legomena

In [None]:
# Collect every token of the cleaned reviews and derive the set of distinct words.
full_text = ' '.join(corpus['review_text_clean'].tolist())
words = full_text.split()
vocabulary = set(words)
# The closing quote of this f-string was missing, which made the cell a SyntaxError.
print(f'Vocabulary size:{len(vocabulary)}')

In [None]:
# Frequency distribution
from nltk.probability import FreqDist

# Count every token, then tabulate all distinct words from most to least frequent.
fdist = FreqDist(words)
ranked_counts = fdist.most_common(len(vocabulary))
word_freq = pd.DataFrame(ranked_counts, columns=['Word', 'Count'])
word_freq.tail()

In [None]:
# Words whose corpus-wide count is at most one.
hapax_list = list(word_freq[word_freq['Count']<=1]['Word'])
# Frozen copy for O(1) membership tests; scanning the list for every token of
# every review is quadratic in practice.
_hapax_lookup = frozenset(hapax_list)


def hapax(text):
    """Return *text* without the words listed in ``hapax_list``.

    The text is split on whitespace and the remaining words are joined back
    with single spaces.
    """
    kept = [word for word in text.split() if word not in _hapax_lookup]
    # The filtered text was previously built but never returned, so every
    # call produced None.
    return ' '.join(kept)


print(f'Quantity hapax words {len(hapax_list)}')

In [None]:
#corpus["review_text_clean_hapax"] = corpus["review_text_clean"].apply(hapax)

## Save Data

In [None]:
# Persist the processed corpus as a semicolon-separated file, without the index column.
corpus.to_csv(path_or_buf='corpus.csv', sep=';', index=False)