<a href="https://colab.research.google.com/github/rdemarqui/sentiment_analysis/blob/main/01%20Text%20Preprocessing%20v0.1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis
## Text Preprocessing

In [1]:
import pandas as pd
import string, os

In [3]:
# Current working directory; the data-loading cell joins dataset file names onto it.
path = os.getcwd()
path

'/content'

### Load data

In [4]:
# Open dataframe
columns = ['review_text', 'overall_rating', 'recommend_to_a_friend']

try:
  df = pd.read_csv(os.path.join(path, 'data\B2W-Reviews01.zip'),
                   sep=',', compression='zip', usecols=columns)
except:
  !wget https://github.com/rdemarqui/sentiment_analysis/raw/main/data/B2W-Reviews01.zip -q
  df = pd.read_csv(os.path.join(path, 'B2W-Reviews01.zip'),
                   sep=',', compression='zip', usecols=columns)

print(df.shape)
df.head()

(132373, 3)


Unnamed: 0,overall_rating,recommend_to_a_friend,review_text
0,4,Yes,Estou contente com a compra entrega rápida o ú...
1,4,Yes,"Por apenas R$1994.20,eu consegui comprar esse ..."
2,4,Yes,SUPERA EM AGILIDADE E PRATICIDADE OUTRAS PANEL...
3,4,Yes,MEU FILHO AMOU! PARECE DE VERDADE COM TANTOS D...
4,5,Yes,"A entrega foi no prazo, as americanas estão de..."


### Normalization

In [5]:
# Keep only rows rated >= 4 or <= 2 (ratings strictly between 2 and 4 are dropped),
# renumber the index from zero and work on an independent copy of the data.
corpus = (
    df[(df["overall_rating"] >= 4) | (df["overall_rating"] <= 2)]
    .reset_index(drop=True)
    .copy()
)
print(corpus.shape)
corpus.head()

(116058, 3)


Unnamed: 0,overall_rating,recommend_to_a_friend,review_text
0,4,Yes,Estou contente com a compra entrega rápida o ú...
1,4,Yes,"Por apenas R$1994.20,eu consegui comprar esse ..."
2,4,Yes,SUPERA EM AGILIDADE E PRATICIDADE OUTRAS PANEL...
3,4,Yes,MEU FILHO AMOU! PARECE DE VERDADE COM TANTOS D...
4,5,Yes,"A entrega foi no prazo, as americanas estão de..."


In [6]:
punctuations = list(string.punctuation)
def remove_punctuation(text):
    """Return ``text`` with each character listed in ``punctuations`` replaced by a space.

    Characters are replaced one-for-one, so the length of the string is
    preserved and runs of punctuation become runs of spaces.
    """
    # Map every punctuation character to a blank and apply the mapping in one pass.
    to_blank = str.maketrans(dict.fromkeys(punctuations, " "))
    return text.translate(to_blank)

In [7]:
accentuation = {
    "á": "a", "ã": "a", "à": "a","â": "a",
    "é": "e","ê": "e",
    "í": "i",
    "ó": "o","õ": "o", "ô":"o",
    "ú": "u",
    "ç": "c"
    }
# Character translation table built once from ``accentuation``: the lower-case
# entries plus their upper-case counterparts ("Á" -> "A", "Ç" -> "C", ...), so
# text that was not lower-cased beforehand is handled as well.
# Rebuild this table if ``accentuation`` is edited afterwards.
_accent_table = str.maketrans({
    **accentuation,
    **{accented.upper(): plain.upper() for accented, plain in accentuation.items()},
})
def remove_accentuation(text):
    """Return ``text`` with accented letters replaced by their unaccented form.

    Covers the letters listed in ``accentuation`` in both lower and upper case;
    every other character is returned unchanged.
    """
    return text.translate(_accent_table)

In [8]:
special_charaters = [
    # digits
    "1","2","3","4","5","6","7","8","9","0",
    # single letters surrounded by spaces
    # NOTE(review): " w " and " y " are not in this list - confirm whether that is intentional.
    " a "," b "," c "," d "," e "," f ",
    " g "," h "," i "," j "," k "," l ",
    " m "," n "," o "," p "," q "," r ",
    " s "," t "," u "," v "," x "," z ",
    # currency markers
    "r$", "$"
    ]
def remove_special_characters(text):
    """Return ``text`` with every entry of ``special_charaters`` replaced by one space.

    Entries are processed one after another in list order, each over the result
    of the previous replacement.
    """
    cleaned = text
    for fragment in special_charaters:
        # Cutting on the fragment and re-joining with a blank substitutes each
        # non-overlapping occurrence, scanning left to right.
        cleaned = " ".join(cleaned.split(fragment))
    return cleaned

In [9]:
def normalize_text(df, text_field, lower=True, rem_punct=True, rem_accent=True, rem_spec_caract=True, rem_name=False):
    """Add a normalized copy of a text column to ``df``.

    The result is stored in a new column named ``text_field + "_clean"``; ``df``
    is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; receives the new ``*_clean`` column.
    text_field : str
        Name of the column to normalize.
    lower : bool
        Lower-case the text.
    rem_punct : bool
        Apply ``remove_punctuation`` to every value.
    rem_accent : bool
        Apply ``remove_accentuation`` to every value.
    rem_spec_caract : bool
        Apply ``remove_special_characters`` to every value.
    rem_name : bool
        Accepted for compatibility; not used by this function.
    """
    source = df[text_field]
    # astype(str) alone would turn missing values into the literal word "nan",
    # which would then be treated as review text; keep them as empty strings.
    cleaned = source.astype(str).where(source.notna(), "")
    if lower:
        cleaned = cleaned.str.lower()
    if rem_punct:
        cleaned = cleaned.apply(remove_punctuation)
    if rem_accent:
        cleaned = cleaned.apply(remove_accentuation)
    if rem_spec_caract:
        cleaned = cleaned.apply(remove_special_characters)
    # The removal steps substitute blanks, so collapse whitespace runs and trim
    # the blanks left at both ends of the text.
    cleaned = cleaned.str.replace(r'\s+', ' ', regex=True).str.strip()
    df[text_field + "_clean"] = cleaned

In [10]:
%%timeit
# normalize_text writes the 'review_text_clean' column into corpus in place;
# it is re-run on every timing repetition and overwrites that column each time.
normalize_text(corpus, 'review_text')
#corpus.drop(columns=['review_text'], inplace=True)

3.34 s ± 530 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Stop Words

In [11]:
import nltk
# Fetch the NLTK 'stopwords' package; nltk.corpus.stopwords is read further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
# Portuguese stop words from NLTK, with accents stripped the same way as the
# review text, then de-duplicated (stripping accents can make two entries equal).
stops_nltk = nltk.corpus.stopwords.words('portuguese')
stop_words = list({remove_accentuation(word) for word in stops_nltk})

def remove_stop_words(text):
    """Return ``text`` without the words listed in ``stop_words``.

    The text is split on whitespace, words found in ``stop_words`` are dropped,
    and the remaining words are joined back with single spaces.
    """
    # A set gives constant-time membership tests, whereas the list is scanned
    # for every word. It is built per call so later edits to ``stop_words``
    # are still honoured.
    stop_lookup = set(stop_words)
    return ' '.join(word for word in text.split() if word not in stop_lookup)

In [14]:
%%timeit
# Reads the normalized text and stores the stop-word-free version in a new column.
corpus["review_text_clean_stop"] = corpus["review_text_clean"].apply(remove_stop_words)

6.29 s ± 766 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Stemming

In [15]:
# Fetch the NLTK 'rslp' package (presumably the data behind the RSLPStemmer created in the next cell).
nltk.download('rslp')

[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Unzipping stemmers/rslp.zip.


True

In [16]:
stemmer = nltk.stem.RSLPStemmer()
# Memo of word -> stem shared by all calls: the corpus repeats the same words
# many times, so each distinct word only needs to go through the stemmer once.
_stem_cache = {}
def stematization(text):
    """Return ``text`` with every whitespace-separated word replaced by its stem.

    Words are stemmed with the module-level ``stemmer`` and joined back with
    single spaces.
    """
    stemmed_words = []
    for word in text.split():
        stem = _stem_cache.get(word)
        if stem is None:
            stem = _stem_cache[word] = stemmer.stem(word)
        stemmed_words.append(stem)
    return ' '.join(stemmed_words)

In [17]:
%%timeit
corpus["review_text_clean_stem"] = corpus["review_text_clean"].apply(stematization)

KeyboardInterrupt: ignored

### Lemmatization

Reference: spaCy processing pipelines — https://spacy.io/usage/processing-pipelines#processing

In [15]:
import spacy.cli

# Available Portuguese spaCy models, smallest to largest; index 1 selects the medium one.
language_models = ['pt_core_news_sm', 'pt_core_news_md', 'pt_core_news_lg']
spacy_model = language_models[1]

# Pipeline components switched off at load time; the components that remain
# active are printed at the end of the cell.
disable = ['tagger','parser','ner','entity_ruler','entity_linker','textcat']

try:
  nlp = spacy.load(spacy_model, disable=disable)
except OSError:
  # spacy.load raises OSError when the model package is not installed:
  # download it once and retry. Other failures are left to propagate.
  spacy.cli.download(spacy_model)
  nlp = spacy.load(spacy_model, disable=disable)

print(nlp.pipe_names)

['tok2vec', 'morphologizer', 'lemmatizer', 'attribute_ruler']


In [16]:
def lemmatization(text):
    """Return ``text`` with every token replaced by its lemma.

    The text is run through the module-level ``nlp`` pipeline and the
    ``lemma_`` of each resulting token is joined with single spaces.
    """
    return ' '.join(token.lemma_ for token in nlp(text))

In [18]:
%%timeit
corpus["review_text_clean_lema"] = corpus.iloc["review_text_clean"].apply(lemmatization)

TypeError: ignored

### Hapax Legomenon

In [None]:
# Concatenate all normalized reviews, split the result on whitespace to get
# every word occurrence, and take the distinct words as the vocabulary.
full_text = ' '.join(corpus['review_text_clean'])
words = full_text.split()
vocabulary = set(words)
print(f'Vocabulary size:{len(vocabulary)}')

In [None]:
# Frequency distribution
from nltk.probability import FreqDist

fdist = FreqDist(words)
# most_common() without a limit yields every (word, count) pair, most frequent
# first, so the tail of the table shows the rarest words.
word_freq = pd.DataFrame(fdist.most_common(), columns=['Word', 'Count'])
word_freq.tail()

In [None]:
# Words whose total count in the corpus is at most one.
hapax_list = list(word_freq[word_freq['Count']<=1]['Word'])
# Set copy of hapax_list for constant-time membership tests inside hapax();
# testing against the list would scan it once per word of every review.
_hapax_set = set(hapax_list)

def hapax(text):
    """Return ``text`` without the words contained in ``hapax_list``.

    The text is split on whitespace, hapax words are dropped, and the remaining
    words are joined back with single spaces.
    """
    kept_words = [word for word in text.split() if word not in _hapax_set]
    # The cleaned text must be returned, otherwise .apply(hapax) yields only None.
    return ' '.join(kept_words)

print(f'Quantity hapax words {len(hapax_list)}')

In [None]:
#%%timeit
#corpus["review_text_clean_hapax"] = corpus["review_text_clean"].apply(hapax)

## Save Data

In [None]:
# Write corpus to 'preprocessed_corpus.csv' (relative path), using ';' as the
# field separator and leaving out the index column.
corpus.to_csv('preprocessed_corpus.csv', sep=';', index=False)