# Fine-Tuning de um Modelo Transformer para Classificação de Sentimento

## Bibliotecas usadas

In [36]:
# Optional environment setup, left commented out so running the notebook installs nothing.
# Uncomment the lines below on a fresh environment to install the dependencies.
# Only numpy is version-pinned here; the other packages resolve to whatever pip selects.
#!pip install -q numpy==1.26.2
#!pip install -q keras
#!pip install -q spacy
#!pip install -q tensorflow
#!pip install -q keras-preprocessing

In [None]:
# Imports
import math
import nltk
import spacy
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import transformers
from tokenizers import BertWordPieceTokenizer
from tqdm import tqdm
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.metrics import Precision, Recall, AUC
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.callbacks import EarlyStopping, LearningRateScheduler, CallbackList, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
import warnings
# Suppress every warning for the rest of the session: no category or module
# filter is given, so deprecation and runtime warnings are hidden as well.
warnings.filterwarnings('ignore')

## Dados

Dados extraídos da plataforma Hugging Face:

https://huggingface.co/datasets/carblacac/twitter-sentiment-analysis

In [2]:
# Training split: ';'-separated text file with no header row, so pandas
# assigns the integer column labels 0 and 1.
caminho_treino = '/home/priscila/Downloads/03-DeepL/dados/dados_treino_p2.txt'
dados_treino = pd.read_csv(caminho_treino, sep=';', header=None)
dados_treino.head()

Unnamed: 0,0,1
0,i am feeling completely overwhelmed i have two...,fear
1,i have the feeling she was amused and delighted,joy
2,i was able to help chai lifeline with your sup...,joy
3,i already feel like i fucked up though because...,anger
4,i still love my so and wish the best for him i...,sadness


In [3]:
# Test split: ';'-separated text file with no header row, so pandas
# assigns the integer column labels 0 and 1.
caminho_teste = '/home/priscila/Downloads/03-DeepL/dados/dados_teste_p2.txt'
dados_teste = pd.read_csv(caminho_teste, sep=';', header=None)
dados_teste.head()

Unnamed: 0,0,1
0,i feel like my only role now would be to tear ...,sadness
1,i feel just bcoz a fight we get mad to each ot...,anger
2,i feel like reds and purples are just so rich ...,joy
3,im not sure the feeling of loss will ever go a...,sadness
4,i feel like ive gotten to know many of you thr...,joy


In [5]:
# Replace the positional column labels (0, 1) with descriptive names,
# using one shared mapping so both splits end up with the same schema.
nomes_colunas = {0: 'texto', 1: 'sentimento'}
dados_treino = dados_treino.rename(columns=nomes_colunas)
dados_teste = dados_teste.rename(columns=nomes_colunas)

In [24]:
# Show (rows, columns) for the training split first, then the test split.
for conjunto in (dados_treino, dados_teste):
    print(conjunto.shape)

(16000, 2)
(2000, 2)


In [27]:
# Class distribution of the training labels, most frequent class first.
rotulos_treino = dados_treino['sentimento']
rotulos_treino.value_counts()

sentimento
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [8]:
# Class distribution of the test labels, most frequent class first.
rotulos_teste = dados_teste['sentimento']
rotulos_teste.value_counts()

sentimento
joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: count, dtype: int64

Obs.: é importante que os dados de treino e de teste apresentem as mesmas classes de saída (rótulos de sentimento).

## Pré-processamento de dados de texto com spacy

In [11]:
# Optional: restart the kernel programmatically.
# Left commented out; uncomment both lines below to use it.
#import IPython
#IPython.Application.instance().kernel.do_shutdown(True)


In [9]:
# Install the spaCy model en_core_web_md 3.8.0 directly from its release wheel.
# NOTE(review): model wheels are versioned alongside spaCy — confirm the installed
# spaCy release is compatible with a 3.8.0 model.
# Alternative install commands, kept commented out for reference:
#!python -m pip install spacy -q
#!python -m spacy download en_core_web_md
#%pip install -U spacy
%pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hNote: you may need to restart the kernel to use updated packages.


In [30]:
# Load the installed spaCy pipeline once; later cells reuse this `nlp` object.
MODELO_SPACY = 'en_core_web_md'
nlp = spacy.load(MODELO_SPACY)

In [62]:
# Sample sentence used to illustrate what the spaCy pipeline produces.
text = "Machine learning is the ability of a machine to improve its performance on a specific task"
# Calling the pipeline on a string runs it and returns the processed document.
doc = nlp(text)
# Display the type of the returned object.
type(doc)

spacy.tokens.doc.Doc

In [74]:
# One line per token: surface text, part-of-speech tag, lemma, stop-word flag.
for token in doc:
    campos = (token.text, token.pos_, token.lemma_, token.is_stop)
    print(*campos, sep=" - ")


Machine - NOUN - machine - False
learning - NOUN - learning - False
is - AUX - be - True
the - DET - the - True
ability - NOUN - ability - False
of - ADP - of - True
a - DET - a - True
machine - NOUN - machine - False
to - PART - to - True
improve - VERB - improve - False
its - PRON - its - True
performance - NOUN - performance - False
on - ADP - on - True
a - DET - a - True
specific - ADJ - specific - False
task - NOUN - task - False


In [67]:
# Text pre-processing function
def pre_process(text):
    """Normalize a raw text with the spaCy pipeline bound to the global ``nlp``.

    Stop-word tokens are dropped; every remaining token is replaced by its
    lemma, lowercased and stripped of surrounding whitespace.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The normalized lemmas joined by single spaces. Empty when no token
        survives the filtering.
    """
    doc = nlp(text)
    lemmas = (token.lemma_.lower().strip() for token in doc if not token.is_stop)
    # Whitespace-only tokens are not stop words and collapse to '' after
    # strip(); joining them would leave runs of spaces in the result, so
    # empty lemmas are skipped.
    return ' '.join(lemma for lemma in lemmas if lemma)

In [68]:
# Sanity check: run the function on the sample sentence and display the result.
pre_process(text)

'machine learning ability machine improve performance specific task'

In [70]:
# Pre-process every training text, one row at a time.
# The raw 'texto' column is overwritten in place, so this cell is not
# idempotent: running it again would pre-process already-processed text.
dados_treino['texto'] = dados_treino['texto'].apply(pre_process)

In [73]:
# Pre-process every test text with the same function used for the training split.
# The raw 'texto' column is overwritten in place, so this cell is not
# idempotent: running it again would pre-process already-processed text.
dados_teste['texto'] = dados_teste['texto'].apply(pre_process)

In [72]:
dados_treino.head(1)

Unnamed: 0,texto,sentimento
0,feel completely overwhelmed strategy help feel...,fear


A função `pre_process` recebe um texto bruto e realiza um pré-processamento clássico de NLP usando o spaCy, transformando o texto em uma versão padronizada e mais adequada para análises posteriores. Primeiro, o texto é processado pelo pipeline `nlp`, que tokeniza e analisa linguisticamente o conteúdo. Em seguida, a função percorre cada token, remove palavras irrelevantes (stopwords), converte cada termo para sua forma base (lematização), padroniza para letras minúsculas e elimina espaços extras. Por fim, os tokens processados são reunidos novamente em uma única string, separada por espaços, resultando em um texto limpo, normalizado e menos ruidoso, apropriado para tarefas como vetorização, classificação ou análise exploratória de textos.