In [1]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

from belt_nlp.bert_with_pooling import BertClassifierWithPooling

# Example - Model BERT with pooling

In this notebook we show how to use the basic methods `fit` and `predict_classes` of the BERT model with pooling.

In [2]:
# Configure the PyTorch CUDA caching allocator for this kernel process.
# `!export ...` runs in a throwaway subshell, so the variable never reaches the
# kernel; writing to os.environ changes the environment of the process itself.
# Only the last of the two original settings is kept, because a second
# assignment to the same variable would overwrite the first anyway. Several
# options can be combined in one value, separated by commas.
# NOTE(review): the setting has to be in place before PyTorch first touches the
# GPU; if it seems to have no effect, restart the kernel and set it before
# `import torch` - TODO confirm for the installed PyTorch version.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

## Load data - user timelines in Portuguese labelled with polarity (against / for)

In [3]:
# Directory with the raw CSV inputs, relative to the working directory.
# The trailing slash matters: file names are appended by string concatenation.
raw_data_path = "../data/raw/"

In [4]:
# Identifier that selects which target's CSV file is loaded.
target = 'ig'

# Read the training timelines for the selected target. The file is
# ';'-separated and decoded with utf-8-sig, which tolerates a leading BOM.
# (A companion file, f'r3_{target}_test_users.csv', was previously read from the
# same directory with the same separator and encoding.)
df = pd.read_csv(
    raw_data_path + f'train_r3_{target}_top_mentioned_timelines.csv',
    sep=';',
    encoding='utf-8-sig',
)

# Encode the polarity labels as integers: "against" -> 0, "for" -> 1.
polarity_to_label = {"against": 0, "for": 1}

# Series.map turns every value missing from the mapping into NaN, which would
# then flow silently into the training labels. Fail loudly instead.
unexpected_polarities = set(df["Polarity"].unique()) - set(polarity_to_label)
if unexpected_polarities:
    raise ValueError(
        f"Unexpected values in column 'Polarity': {unexpected_polarities!r}; "
        f"expected only {list(polarity_to_label)!r}"
    )

df["Polarity"] = df["Polarity"].map(polarity_to_label)

In [5]:
# Preview the first five rows to sanity-check the loaded frame.
df.head(n=5)

Unnamed: 0,User_ID,Polarity,Texts
0,r2_ig_1,0,PQP ESSE DORAMA É MUITO FOADA(Sassy GoGo(Cheer...
1,r2_ig_4,1,Golaço!!!!!!!!! # Manda geral do time principa...
2,r2_ig_7,0,"@gabycunha86 Amanhã vou aí, deixa pra terça # ..."
3,r2_ig_8,1,3.4- O Centro de Coordenação da Operação está ...
4,r2_ig_10,1,"Me arrependi de excluir meu outro tt, agora ti..."


## Split into train and test sets

In [6]:
# Pull the texts and their labels out of the frame as plain Python lists.
texts = df.Texts.to_list()
labels = df.Polarity.to_list()

# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

## Fit the model

In [7]:
# Name of the pretrained checkpoint used for the classifier.
bert_model_name = "pablocosta/bertabaporu-base-uncased"

In [8]:
# Hyper-parameters forwarded as keyword arguments to BertClassifierWithPooling.
# NOTE(review): chunk_size == stride == minimal_chunk_length == 510 presumably
# yields non-overlapping, full-length chunks only - confirm against the library.
# NOTE(review): the fit cell of this notebook ended in a CUDA out-of-memory
# error on a ~5.8 GiB GPU with these settings; if that persists, consider
# limiting how much of each text is processed (TODO check which option the
# library offers for this).
MODEL_PARAMS = {
    "batch_size": 1,
    "learning_rate": 5e-5,
    "epochs": 1,
    "chunk_size": 510,
    "stride": 510,
    "minimal_chunk_length": 510,
    "pooling_strategy": "mean",
    # Reuse the checkpoint name defined in `bert_model_name` instead of
    # repeating the string, so the model is configured in a single place.
    "pretrained_model_name_or_path": bert_model_name,
}

# Use the GPU when one is visible to PyTorch; otherwise fall back to the CPU
# so the notebook still runs (slowly) on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = BertClassifierWithPooling(**MODEL_PARAMS, device=device)

In [9]:
# Ask PyTorch to release cached GPU memory it is not currently using before
# training starts.
torch.cuda.empty_cache()

# Train the classifier on the training split.
# A warning about tokenizing too long text is expected here.
# NOTE(review): MODEL_PARAMS sets "epochs" to 1 while 3 is passed here -
# presumably this argument takes precedence; confirm which value is intended.
model.fit(
    X_train,
    y_train,
    epochs=3,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 116.00 MiB. GPU 0 has a total capacity of 5.79 GiB of which 133.19 MiB is free. Including non-PyTorch memory, this process has 5.17 GiB memory in use. Of the allocated memory 4.89 GiB is allocated by PyTorch, and 188.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

## Get predictions

In [None]:
# Predict a class for every text of the held-out test split.
preds = model.predict_classes(X_test)

## Calculate model accuracy on the test data

In [None]:
# Cast the integer test labels to booleans so they can be compared
# element-wise with the predicted classes.
y_true = np.array(y_test).astype(bool)
is_correct = preds == y_true

# Accuracy = number of matching predictions / number of test samples.
accurate = sum(is_correct)
accuracy = accurate / len(y_test)

print(f"Test accuracy: {accuracy}")