## Fine-tuning a transformer model for sentiment classification

In [17]:
# Load (or reload) the watermark extension, then print the author stamp
%reload_ext watermark
%watermark -a "postcristiano.pt"

Author: postcristiano.pt



In [3]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel rather than whichever pip is on PATH.
%pip install -q spacy

In [4]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel rather than whichever pip is on PATH.
%pip install -q transformers

In [16]:
# Project dependencies
# Standard-library modules come first because the TensorFlow log level
# below has to be configured before TensorFlow itself is imported.
import os
import math
import logging
import warnings

# Disable TensorFlow registration warnings.
# This must be set before `import tensorflow` so that messages emitted by the
# native backend while the library loads are filtered as well.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # FATAL

import nltk
import spacy
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import transformers
from tokenizers import BertWordPieceTokenizer
from tqdm import tqdm
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from tensorflow import keras
from keras.utils import to_categorical
from tensorflow.keras import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer # OLD | from keras.preprocessing.text import Tokenizer
from keras.metrics import Precision, Recall, AUC
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras.callbacks import EarlyStopping, LearningRateScheduler, CallbackList, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam # OLD | from tensorflow.keras.optimizers.experimental import Adam


# Raise the level of TensorFlow's Python logger so only errors are reported
tf.get_logger().setLevel('ERROR')

# Additional configuration to avoid general logging warnings
logging.getLogger('tensorflow').disabled = True

# Ignore all Python warnings for the rest of the session
warnings.filterwarnings('ignore')

## Load Text Data

In [29]:
# Read the training samples: a header-less, semicolon-separated text file
train_data_pt = pd.read_csv(
    'samples/train_data.txt',
    sep=';',
    header=None,
)

In [30]:
# Read the test samples: a header-less, semicolon-separated text file
test_data_pt = pd.read_csv(
    'samples/test_data.txt',
    sep=';',
    header=None,
)

In [31]:
# Give the positional columns 0 and 1 descriptive names in both frames
train_data_pt = train_data_pt.rename(
    columns={0: 'raw_text', 1: 'sentiment'},
)
test_data_pt = test_data_pt.rename(
    columns={0: 'raw_text', 1: 'sentiment'},
)

In [32]:
# Shape of the training data as (rows, columns)
train_data_pt.shape

(16000, 2)

In [28]:
# Shape of the test data as (rows, columns)
test_data_pt.shape

(2000, 2)

In [33]:
# Preview the first rows of the training data
train_data_pt.head()

Unnamed: 0,raw_text,sentiment
0,i am feeling completely overwhelmed i have two...,fear
1,i have the feeling she was amused and delighted,joy
2,i was able to help chai lifeline with your sup...,joy
3,i already feel like i fucked up though because...,anger
4,i still love my so and wish the best for him i...,sadness


In [35]:
# Count how many training rows carry each sentiment label
train_data_pt['sentiment'].value_counts()

sentiment
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [36]:
# Count how many test rows carry each sentiment label
test_data_pt['sentiment'].value_counts()

sentiment
joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: count, dtype: int64

## Pre-processing of Text Data with spaCy

In [37]:
# Download the spaCy 'en_core_web_md' package that is loaded in the next cell
!python -m spacy download en_core_web_md -q

✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_md')


In [38]:
# Load the spaCy 'en_core_web_md' pipeline; it is used by the
# text pre-processing function defined below
pt_nlp = spacy.load('en_core_web_md')

In [39]:
# Set function 'pt_preprocessing_text' that receives a text as a parameter
def pt_preprocessing_text(text):
    """Lemmatize ``text`` with the ``pt_nlp`` pipeline and drop stopwords.

    Parameters
    ----------
    text : str
        Raw text handed to ``pt_nlp``.

    Returns
    -------
    str
        The lower-cased, stripped lemmas of every token not flagged as a
        stopword, joined by single spaces. Tokens whose lemma is empty after
        stripping (e.g. whitespace-only tokens) are left out, so the result
        never contains doubled, leading or trailing spaces.
    """
    # Process the text using the loaded pipeline
    doc = pt_nlp(text)

    # Lemmas of the non-stopword tokens, lower-cased and stripped
    lemmas = (token.lemma_.lower().strip() for token in doc if not token.is_stop)

    # Skip lemmas that are blank after stripping; joining them would
    # otherwise leave runs of spaces in the output
    return ' '.join(lemma for lemma in lemmas if lemma)

In [42]:
# Pre-process every raw training text and store the result in a new column
train_raw_text = train_data_pt['raw_text']
train_data_pt['processed_text'] = train_raw_text.map(pt_preprocessing_text)

In [43]:
# Pre-process every raw test text and store the result in a new column
test_raw_text = test_data_pt['raw_text']
test_data_pt['processed_text'] = test_raw_text.map(pt_preprocessing_text)

In [44]:
# Preview the first rows of the test data, including the new 'processed_text' column
test_data_pt.head()

Unnamed: 0,raw_text,sentiment,processed_text
0,i feel like my only role now would be to tear ...,sadness,feel like role tear sail pessimism discontent
1,i feel just bcoz a fight we get mad to each ot...,anger,feel bcoz fight mad n u wanna publicity n let ...
2,i feel like reds and purples are just so rich ...,joy,feel like red purple rich kind perfect
3,im not sure the feeling of loss will ever go a...,sadness,m sure feeling loss away dull sweet feeling no...
4,i feel like ive gotten to know many of you thr...,joy,feel like ve get know comment email m apprecia...


In [22]:
# Print the author stamp
%watermark -a "postcristiano.pt"

#%watermark -v -m

#%watermark --iversions

Author: postcristiano.pt

