In [1]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split

from belt_nlp.bert_with_pooling import BertClassifierWithPooling

# Example - Model BERT with pooling

In this notebook we show how to use the basic methods `fit` and `predict_classes` of the BERT model with pooling.

In [2]:
# Configure the PyTorch CUDA caching allocator for this kernel process.
# `!export ...` would only set the variable in a short-lived subshell, so the
# kernel (and therefore PyTorch) would never see it. Writing to os.environ
# changes the environment of the running kernel itself.
# NOTE(review): this must run before the first CUDA allocation to take effect — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

## Load data - sample of Portuguese texts labelled with stance polarity

In [3]:
# Directory (relative to this notebook) that holds the raw CSV files.
raw_data_path = "../data/raw/"

In [4]:
# Topic key that selects which raw file to load.
target = "ig"

# Read the semicolon-separated CSV for the chosen target.
csv_path = f"{raw_data_path}r3_{target}_test_users.csv"
df = pd.read_csv(csv_path, sep=";", encoding="utf-8-sig")

# Encode the textual polarity as integer class ids (against -> 0, for -> 1).
polarity_codes = {"against": 0, "for": 1}
df["Polarity"] = df["Polarity"].map(polarity_codes)

In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,User_ID,Timeline,Stance,Polarity,Tweet_Seq
0,r2_ig_2,@ ta fzd oq # uma amiga minha tava na rua quan...,nao me diz que isso é de igreja católica pf,0,1874
1,r2_ig_3,@ Ola Como ta # Vamo Seguir @ estamos querendo...,Se a igreja faz isso ela devia ser isenta mesm...,0,3988
2,r2_ig_5,papai me deu um irmão lindo desse # cansada de...,"que pena então, por que se cada espírita for r...",0,4532
3,r2_ig_6,Né primeiro de abril não ta # Pena que um pais...,bglh é entrar p igreja,1,2661
4,r2_ig_9,já acordei nun desânimo que pqp # vontade de n...,já vou levar pra igreja pra Deus benzer pq o q...,1,1441


## Split into train and test sets

In [6]:
# The "Stance" column holds the input texts; "Polarity" holds the encoded labels.
texts = df.loc[:, "Stance"].tolist()
labels = df.loc[:, "Polarity"].tolist()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

## Fit the model

In [7]:
# Name of the pretrained checkpoint used for the classifier.
bert_model_name = "pablocosta/bertabaporu-base-uncased"

In [8]:
# Hyperparameters forwarded to BertClassifierWithPooling.
MODEL_PARAMS = {
    "batch_size": 1,
    "learning_rate": 5e-5,
    # NOTE(review): the fit cell passes epochs=3 explicitly, which presumably
    # takes precedence over this value — confirm which one is intended.
    "epochs": 1,
    "chunk_size": 510,
    "stride": 510,
    "minimal_chunk_length": 510,
    "pooling_strategy": "mean",
    # Reuse the checkpoint name defined in the previous cell instead of
    # repeating the literal, so the model is configured in one place.
    "pretrained_model_name_or_path": bert_model_name,
}

# Fall back to the CPU when no CUDA device is available so the notebook still
# runs (slowly) on machines without a GPU instead of failing here.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = BertClassifierWithPooling(**MODEL_PARAMS, device=device)

In [9]:
model.fit(X_train, y_train, epochs=3)  # A warning about tokenizing too-long text is expected here

In [13]:
# Display the training labels for inspection.
y_train

[0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,


## Get predictions

In [10]:
# Predict a class for every held-out test text.
preds = model.predict_classes(X_test)

In [12]:
# Display the predicted classes for inspection.
preds

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False]

## Calculate model accuracy on the test data

In [11]:
# Cast the integer test labels to booleans so they compare element-wise
# against the predicted classes.
y_test_bool = np.array(y_test).astype(bool)
hits = preds == y_test_bool

# Number of matching predictions, then the fraction of the test set they cover.
accurate = sum(hits)
accuracy = accurate / len(y_test)

print(f"Test accuracy: {accuracy}")

Test accuracy: 0.5583333333333333
