<a href="https://colab.research.google.com/github/rdemarqui/sentiment_analysis/blob/main/01%20Text%20Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis
## Text Preprocessing

In [1]:
import pandas as pd
import string, os

In [3]:
import spacy.cli

disable = ['tagger','parser','ner','entity_ruler','entity_linker','textcat']

try:
  nlp = spacy.load('pt_core_news_lg', disable=disable)
except:
  !python -m spacy download pt_core_news_lg
  nlp = spacy.load('pt_core_news_lg', disable=disable)

Collecting pt-core-news-lg==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_lg-3.6.0/pt_core_news_lg-3.6.0-py3-none-any.whl (568.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m568.2/568.2 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pt-core-news-lg
Successfully installed pt-core-news-lg-3.6.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_lg')


In [4]:
import nltk
# Fetch the NLTK 'stopwords' resource; nltk.corpus.stopwords is read in the
# "Stop Words" section further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Current working directory of the kernel; used below as the base folder
# when locating the dataset archive.
path = os.getcwd()
path

'/content'

### Load data

In [11]:
# Open dataframe
columns = ['review_text', 'overall_rating', 'recommend_to_a_friend']

try:
  df = pd.read_csv(os.path.join(path, 'data\B2W-Reviews01.zip'),
                   sep=',', compression='zip', usecols=columns)
except:
  !wget https://github.com/rdemarqui/sentiment_analysis/raw/main/data/B2W-Reviews01.zip
  df = pd.read_csv(os.path.join(path, 'B2W-Reviews01.zip'),
                   sep=',', compression='zip', usecols=columns)

print(df.shape)
df.head()

--2023-08-20 19:28:49--  https://github.com/rdemarqui/sentiment_analysis/raw/main/data/B2W-Reviews01.zip
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/rdemarqui/sentiment_analysis/main/data/B2W-Reviews01.zip [following]
--2023-08-20 19:28:50--  https://raw.githubusercontent.com/rdemarqui/sentiment_analysis/main/data/B2W-Reviews01.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 20024576 (19M) [application/zip]
Saving to: ‘B2W-Reviews01.zip.1’


2023-08-20 19:28:50 (214 MB/s) - ‘B2W-Reviews01.zip.1’ saved [20024576/20024576]

(132373, 3)


Unnamed: 0,overall_rating,recommend_to_a_friend,review_text
0,4,Yes,Estou contente com a compra entrega rápida o ú...
1,4,Yes,"Por apenas R$1994.20,eu consegui comprar esse ..."
2,4,Yes,SUPERA EM AGILIDADE E PRATICIDADE OUTRAS PANEL...
3,4,Yes,MEU FILHO AMOU! PARECE DE VERDADE COM TANTOS D...
4,5,Yes,"A entrega foi no prazo, as americanas estão de..."


### Normalization

In [12]:
# Keep the reviews rated 4 or above, or 2 or below; rows with any other
# rating are left out. The index is renumbered and the result is an
# independent copy of the selection.
corpus = (
    df[(df['overall_rating'] >= 4) | (df['overall_rating'] <= 2)]
    .reset_index(drop=True)
    .copy()
)
print(corpus.shape)
corpus.head()

(116058, 3)


Unnamed: 0,overall_rating,recommend_to_a_friend,review_text
0,4,Yes,Estou contente com a compra entrega rápida o ú...
1,4,Yes,"Por apenas R$1994.20,eu consegui comprar esse ..."
2,4,Yes,SUPERA EM AGILIDADE E PRATICIDADE OUTRAS PANEL...
3,4,Yes,MEU FILHO AMOU! PARECE DE VERDADE COM TANTOS D...
4,5,Yes,"A entrega foi no prazo, as americanas estão de..."


In [13]:
punctuations = list(string.punctuation)
def remove_punctuation(text):
    """Return `text` with every character listed in `punctuations` replaced by one space."""
    # One translation pass does the same as replacing each mark in turn,
    # because every mark is a single character mapped to a space.
    return text.translate(str.maketrans(dict.fromkeys(punctuations, " ")))

In [14]:
# Lower-case accented characters and the plain letter each one is replaced with.
accentuation = {
    "á": "a", "ã": "a", "à": "a", "â": "a",
    "é": "e", "ê": "e",
    "í": "i",
    "ó": "o", "õ": "o", "ô": "o",
    "ú": "u",
    "ç": "c",
}
def remove_accentuation(text):
    """Return `text` with each character found in `accentuation` swapped for its plain letter.

    Characters that are not keys of the mapping (upper-case accented letters
    included) are returned unchanged.
    """
    return text.translate(str.maketrans(accentuation))

In [15]:
special_charaters = [
    "1","2","3","4","5","6","7","8","9","0",
    " a "," b "," c "," d "," e "," f "," g "," h "," i "," j "," k "," l ",
    " m "," n "," o "," p "," q "," r "," s "," t "," u "," v "," x "," z ",
    "r$", "$"
    ]

# Lookups derived from the list above. Entries padded with spaces stand for
# single-letter tokens; every other entry is a literal substring. Literal
# substrings are ordered longest first so "r$" is handled before "$".
_standalone_tokens = frozenset(
    item.strip() for item in special_charaters if item != item.strip())
_literal_substrings = sorted(
    (item for item in special_charaters if item == item.strip()),
    key=len, reverse=True)

def remove_special_characters(text):
    """Blank out digits, currency markers and single-letter tokens listed in `special_charaters`.

    Literal entries (digits, "r$", "$") are replaced by a space wherever they
    occur. Single-letter entries are removed when they form a whole
    space-delimited token. Matching whole tokens, rather than the literal
    " x " pattern, also catches letters at the very start or end of the
    text and runs such as "a a a", where neighbouring matches share a space.

    The amount of whitespace left behind is not significant; normalize_text
    collapses repeated whitespace afterwards.
    """
    for literal in _literal_substrings:
        text = text.replace(literal, " ")
    tokens = text.split(" ")
    return " ".join("" if token in _standalone_tokens else token for token in tokens)

In [16]:
def normalize_text(df, text_field, lower=True, rem_punct=True, rem_accent=True, rem_spec_caract=True, rem_name=False):
    """Write a normalized copy of `df[text_field]` into `df[text_field + "_clean"]`.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; receives the new "<text_field>_clean" column.
    text_field : str
        Name of the column to normalize.
    lower : bool
        Lower-case the text.
    rem_punct : bool
        Apply remove_punctuation.
    rem_accent : bool
        Apply remove_accentuation.
    rem_spec_caract : bool
        Apply remove_special_characters.
    rem_name : bool
        Accepted for compatibility; not used by this function.
    """
    source = df[text_field]
    # Missing values become empty strings. A plain astype(str) would turn them
    # into the literal word "nan", which then pollutes the vocabulary.
    cleaned = source.astype(object).where(source.notna(), "").astype(str)
    if lower:
        cleaned = cleaned.str.lower()
    if rem_punct:
        cleaned = cleaned.apply(remove_punctuation)
    if rem_accent:
        cleaned = cleaned.apply(remove_accentuation)
    if rem_spec_caract:
        cleaned = cleaned.apply(remove_special_characters)
    # Collapse every run of whitespace into a single space.
    df[text_field + "_clean"] = cleaned.str.replace(r'\s+', ' ', regex=True)

In [17]:
# Adds the 'review_text_clean' column to corpus (normalize_text writes into the frame it is given).
normalize_text(corpus, 'review_text')
# Keep only the cleaned text. Reassigning the result is the recommended pandas
# style and avoids mutating the frame through inplace=True.
corpus = corpus.drop(columns=['review_text'])

### Stop Words

In [18]:
# Portuguese stop words from NLTK and the default stop words of the loaded spaCy pipeline.
stops_nltk = nltk.corpus.stopwords.words('portuguese')
stops_spacy = nlp.Defaults.stop_words
# Union of both sources, built to compare sizes.
# NOTE(review): the union is not referenced again in the visible cells; the
# filter defined below starts from the NLTK list only.
stop_words_join = set(stops_nltk) | stops_spacy
print(f"stop_nltk: {len(stops_nltk)}\nstop_spacy: {len(stops_spacy)}\nstop_join: {len(stop_words_join)}")

stop_nltk: 207
stop_spacy: 416
stop_join: 500


In [19]:
# NLTK stop words with accents stripped by remove_accentuation and duplicates dropped.
stop_words = list({remove_accentuation(word) for word in stops_nltk})

def remove_stop_words(text):
    """Return `text` without the whitespace-separated tokens found in `stop_words`.

    The surviving tokens are joined back with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(kept_tokens)

In [20]:
# New column: the cleaned review text with stop words filtered out.
corpus["review_text_clean_stop"] = corpus["review_text_clean"].apply(remove_stop_words)

### Lemmatization

In [21]:
def lemmatization(text):
    """Run `text` through the loaded `nlp` pipeline and return the token lemmas joined by single spaces."""
    return ' '.join(token.lemma_ for token in nlp(text))

In [None]:
# New column: lemmatized version of the cleaned review text.
# nlp.pipe streams the texts through the spaCy pipeline in batches, which is
# considerably faster on a large corpus than calling nlp() once per row.
corpus["review_text_clean_lema"] = [
    ' '.join(token.lemma_ for token in doc)
    for doc in nlp.pipe(corpus["review_text_clean"])
]

### Stemming

In [None]:
# Download the NLTK 'rslp' resource before the RSLPStemmer is created in the next cell.
nltk.download('rslp')

In [None]:
# Shared NLTK RSLP stemmer instance used by stematization.
stemmer = nltk.stem.RSLPStemmer()
def stematization(text):
    """Stem every whitespace-separated token of `text` and join the stems with single spaces."""
    return ' '.join(map(stemmer.stem, text.split()))

In [None]:
# New column: stemmed version of the cleaned review text.
corpus["review_text_clean_stem"] = corpus["review_text_clean"].apply(stematization)

In [None]:
# Display the corpus with the text columns derived in the cells above.
corpus

### Hapax Legomenon

In [None]:
# Concatenate every cleaned review, split on whitespace and count the distinct tokens.
full_text = ' '.join(corpus['review_text_clean'].tolist())
words = full_text.split()
vocabulary = set(words)
# The closing quote of the f-string was missing, which made the cell a SyntaxError.
print(f'Vocabulary size:{len(vocabulary)}')

In [None]:
# Frequency distribution
from nltk.probability import FreqDist

fdist = FreqDist(words)
word_freq = pd.DataFrame(fdist.most_common(len(vocabulary)), columns=['Word', 'Count'])
word_freq.tail()

In [None]:
# Words whose count in word_freq is at most 1.
hapax_list = list(word_freq[word_freq['Count']<=1]['Word'])
# Set copy of the list: membership tests against a long list scan it for
# every token, which is prohibitively slow over the whole corpus.
_hapax_lookup = set(hapax_list)

def hapax(text):
    """Return `text` without the whitespace-separated tokens listed in `hapax_list`.

    The surviving tokens are joined back with single spaces.
    """
    words_list = [word for word in text.split() if word not in _hapax_lookup]
    # The filtered text was previously built but never returned, so every call yielded None.
    return ' '.join(words_list)

print(f'Quantity hapax words {len(hapax_list)}')

In [None]:
#corpus["review_text_clean_hapax"] = corpus["review_text_clean"].apply(hapax)

## Save Data

In [None]:
# Write the corpus to 'preprocessed_corpus.csv' (relative path), using ';' as
# the separator and leaving out the index.
corpus.to_csv('preprocessed_corpus.csv', sep=';', index=False)