In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 4.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 48.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.5 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [2]:
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import json
import pandas as pd
from textwrap import wrap

In [3]:
def json_to_pandas(json_corpus):
  with open(json_corpus) as f:
    corpus = json.load(f)

  subcategories = corpus.keys()

  unique_categories = []
  unique_subcategories = list(subcategories)
  categories_column = []
  subcategories_column = []
  paragraphs_column = []

  for subcategory in subcategories:
    cat_dict = corpus[subcategory]
    category = cat_dict["category"]
    print("Category: {}\tSubcategory: {}".format(category, subcategory))
    paragraphs = cat_dict["true"]
    if category not in unique_categories:
      unique_categories.append(category)
    categories_column.extend([category]*len(paragraphs))
    subcategories_column.extend([subcategory]*len(paragraphs))
    for paragraph in paragraphs:
      paragraphs_column.append(paragraph["paragraph"])

  return pd.DataFrame({'paragraph': paragraphs_column, 'category': categories_column, 'subcategory': subcategories_column}), unique_categories, unique_subcategories

In [4]:
# Initial parameters
ind = 1 # set to 0 to classify by categories, set to 1 to classify by subcategories
CLASSIFY_BY = ['category', 'subcategory'][ind]
RANDOM_SEED = 42
MAX_LEN = 250
test_prop = 0.2 # proportion of test data
BATCH_SIZE = 16 # training batches size

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [5]:
# Load dataset
DATASET_PATH = '/content/drive/My Drive/Naveler/03.TECH/data_discovery/datasets/tf_idf_data_corpus_data_corpus.json'

from google.colab import drive
drive.mount('/content/drive')

df, categories, subcategories = json_to_pandas(DATASET_PATH)

if CLASSIFY_BY == 'category':
  NCLASSES = len(categories)
elif CLASSIFY_BY == 'subcategory':
  NCLASSES = len(subcategories)

print("Number of classes: {}".format(NCLASSES))

Mounted at /content/drive
Category: Impacto energético	Subcategory: Calidad aire
Category: Impacto energético	Subcategory: Cambio climático
Category: Impacto energético	Subcategory: Gestión del agua
Category: Innovación	Subcategory: Eficiencia energética
Category: Innovación	Subcategory: GNL maritimo
Category: Innovación	Subcategory: Movilidad sostenible
Category: Innovación	Subcategory: Smart Grids
Category: Sector eléctrico	Subcategory: Consumo eléctrico
Category: Sector eléctrico	Subcategory: Redes eléctricas
Category: Sector eléctrico	Subcategory: Tarifas  / mediciones eléctricas
Category: Sector eléctrico	Subcategory: Vulnerabilidad energética
Number of classes: 11


In [6]:
# Check data
print(df.head())
print(df.shape)
ind = 10000
print("Text:")
print("\n".join(wrap(df['paragraph'][ind])))
print("Category: {}".format(df['category'][ind]))
print("Subcategory: {}".format(df['subcategory'][ind]))

                                           paragraph            category  \
0  RELATIVA A REVISAR LA ACTUAL CLASIFICACIÓN DE ...  Impacto energético   
1  b) Las actividades en el marco del régimen de ...  Impacto energético   
2  c) La compensación de los costes indirectos de...  Impacto energético   
3  d) Las actividades relacionadas con vertederos...  Impacto energético   
4  e) Las actividades en las que la eliminación a...  Impacto energético   

    subcategory  
0  Calidad aire  
1  Calidad aire  
2  Calidad aire  
3  Calidad aire  
4  Calidad aire  
(27644, 3)
Text:
Los redespachos por restricciones del PDBF se comprobarán horariamente
mientras no existan productos cuarto-horarios en los mercados diario e
intradiario y, por tanto, de BOLETÍN OFICIAL DEL ESTADO Martes 29 de
marzo de 2022 Sec. III. Pág. 41336 acuerdo con lo dispuesto en el PO
3.2. Los valores de energía y precio en todos los cuartos de hora de
la misma hora tengan el mismo valor.
Category: Sector eléctrico
Subca

In [7]:
# Code categories
print("Coded classes:")
if CLASSIFY_BY == 'category':
  label_names = categories
  for i in range(len(categories)):
    print("{}:\t{}".format(i, categories[i]))
  df['label'] = df.category.astype('category').cat.codes
elif CLASSIFY_BY == 'subcategory':
  label_names = subcategories
  for i in range(len(subcategories)):
    print("{}:\t{}".format(i, subcategories[i]))
  df['label'] = df.subcategory.astype('category').cat.codes
df.head()

Coded classes:
0:	Calidad aire
1:	Cambio climático
2:	Gestión del agua
3:	Eficiencia energética
4:	GNL maritimo
5:	Movilidad sostenible
6:	Smart Grids
7:	Consumo eléctrico
8:	Redes eléctricas
9:	Tarifas  / mediciones eléctricas
10:	Vulnerabilidad energética


Unnamed: 0,paragraph,category,subcategory,label
0,RELATIVA A REVISAR LA ACTUAL CLASIFICACIÓN DE ...,Impacto energético,Calidad aire,0
1,b) Las actividades en el marco del régimen de ...,Impacto energético,Calidad aire,0
2,c) La compensación de los costes indirectos de...,Impacto energético,Calidad aire,0
3,d) Las actividades relacionadas con vertederos...,Impacto energético,Calidad aire,0
4,e) Las actividades en las que la eliminación a...,Impacto energético,Calidad aire,0


In [8]:
df[CLASSIFY_BY].value_counts()

Redes eléctricas                    7913
Consumo eléctrico                   5784
Tarifas  / mediciones eléctricas    5491
Calidad aire                        1987
Eficiencia energética               1506
Gestión del agua                    1499
Vulnerabilidad energética           1372
Cambio climático                    1028
Smart Grids                          987
GNL maritimo                          76
Movilidad sostenible                   1
Name: subcategory, dtype: int64

In [9]:
# Define tokenizer
PRE_TRAINED_MODEL_NAME = 'dccuchile/bert-base-spanish-wwm-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/236k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/364 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/648 [00:00<?, ?B/s]

In [10]:
# Tokenization example
sample_txt = 'Las bebidas azucaradas son aquellas que contienen azúcar añadido.'
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print('Frase: ', sample_txt)
print('Tokens: ', tokens)
print('Tokens numéricos: ', token_ids)

Frase:  Las bebidas azucaradas son aquellas que contienen azúcar añadido.
Tokens:  ['Las', 'bebidas', 'azucar', '##adas', 'son', 'aquellas', 'que', 'contienen', 'azúcar', 'añadido', '.']
Tokens numéricos:  [1613, 12779, 28787, 1319, 1404, 8587, 1038, 11730, 7585, 12877, 1009]


In [11]:
# Codification for BERT (data pre-processing)
encoding = tokenizer.encode_plus(
    sample_txt,
    max_length = MAX_LEN,
    truncation = True,
    add_special_tokens = True,
    return_token_type_ids = False,
    #pad_to_max_length = True,
    padding = 'max_length',
    return_attention_mask = True,
    return_tensors = 'pt'
)

In [12]:
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [13]:
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0]))
print(encoding['input_ids'][0])
print(encoding['attention_mask'][0])

['[CLS]', 'Las', 'bebidas', 'azucar', '##adas', 'son', 'aquellas', 'que', 'contienen', 'azúcar', 'añadido', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD

In [14]:
# Define class for dataset
class BOEDataset(Dataset):

  def __init__(self, paragraphs, labels, tokenizer, max_len):
    self.paragraphs = paragraphs
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
      return len(self.paragraphs)
    
  def __getitem__(self, item):
    paragraph = str(self.paragraphs[item])
    label = self.labels[item]
    encoding = tokenizer.encode_plus(
        paragraph,
        max_length = self.max_len,
        truncation = True,
        add_special_tokens = True,
        return_token_type_ids = False,
        #pad_to_max_length = True,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt'
        )
    

    return {
          'paragraph': paragraph,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'label': torch.tensor(label, dtype=torch.long)
      }

In [15]:
# Define data loader
def data_loader(df, tokenizer, max_len, batch_size):
  dataset = BOEDataset(
      paragraphs = df.paragraph.to_numpy(),
      labels = df.label.to_numpy(),
      tokenizer = tokenizer,
      max_len = MAX_LEN
  )

  return DataLoader(dataset, batch_size = BATCH_SIZE, num_workers = 2)

In [16]:
# Split train and test data
df_train, df_test = train_test_split(df, test_size = test_prop, random_state=RANDOM_SEED)

train_data_loader = data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [17]:
# Define class for building model
class BERTTextClassifier(nn.Module):

  def __init__(self, n_classes):
    super(BERTTextClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False)
    self.drop = nn.Dropout(p=0.3)
    self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    _, cls_output = self.bert(
        input_ids = input_ids,
        attention_mask = attention_mask
    )
    drop_output = self.drop(cls_output)
    output = self.linear(drop_output)
    return output

In [18]:
# Build model (download)
model = BERTTextClassifier(NCLASSES)
model = model.to(device)

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.we

In [19]:
# Check model
#print(model)
# BERT -> 31002 words codified into 768 values
# dropout layer
# linear layer (768xNCLASSES)

In [20]:
# Training parameters
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS # (8000/16)*5
scheduler = get_linear_schedule_with_warmup( # gradually decrease learning rate (step length)
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device) # loss function to be minimized



In [21]:
# Training iteration
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  model = model.train()
  losses = []
  correct_predictions = 0
  for batch in data_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    outputs = model(input_ids = input_ids, attention_mask = attention_mask)
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, labels)
    correct_predictions += torch.sum(preds == labels)
    losses.append(loss.item())
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  return correct_predictions.double()/n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()
  losses = []
  correct_predictions = 0
  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)
      outputs = model(input_ids = input_ids, attention_mask = attention_mask)
      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, labels)
      correct_predictions += torch.sum(preds == labels)
      losses.append(loss.item())
  return correct_predictions.double()/n_examples, np.mean(losses)

In [22]:
# Training (high runtime!)
for epoch in range(EPOCHS):
  print('Epoch {} de {}'.format(epoch+1, EPOCHS))
  print('------------------')
  train_acc, train_loss = train_model(
      model, train_data_loader, loss_fn, optimizer, device, scheduler, len(df_train)
  )
  test_acc, test_loss = eval_model(
      model, test_data_loader, loss_fn, device, len(df_test)
  )
  print('Entrenamiento: Loss: {}, accuracy: {}'.format(train_loss, train_acc))
  print('Validación: Loss: {}, accuracy: {}'.format(test_loss, test_acc))
  print('')

Epoch 1 de 5
------------------
Entrenamiento: Loss: 1.0544478634260568, accuracy: 0.5142211168889894
Validación: Loss: 0.8664743205198663, accuracy: 0.5700850063302586

Epoch 2 de 5
------------------
Entrenamiento: Loss: 0.759081559817341, accuracy: 0.6102645263395885
Validación: Loss: 0.819388715690271, accuracy: 0.5849158979924037

Epoch 3 de 5
------------------
Entrenamiento: Loss: 0.6591180837831029, accuracy: 0.6449468686411938
Validación: Loss: 0.8501773144554541, accuracy: 0.5854584915897992

Epoch 4 de 5
------------------
Entrenamiento: Loss: 0.6084585998016083, accuracy: 0.6627628306579245
Validación: Loss: 0.8764203649897107, accuracy: 0.5876288659793815

Epoch 5 de 5
------------------
Entrenamiento: Loss: 0.5787760801855758, accuracy: 0.6732986660637577
Validación: Loss: 0.8870691856896946, accuracy: 0.5657442575510943



In [23]:
# Model path
if CLASSIFY_BY == 'category':
  s = 'cat'
elif CLASSIFY_BY == 'subcategory':
  s = 'subcat'
MODEL_NAME = 'BERT_text_classifier_'+s
MODEL_PATH = '/content/drive/My Drive/Naveler/03.TECH/data_discovery/models/'+MODEL_NAME

In [24]:
# Save model
torch.save(model.state_dict(), MODEL_PATH)

In [25]:
# Upload model
loaded_model = BERTTextClassifier(NCLASSES)
loaded_model = loaded_model.to(device)
loaded_model.load_state_dict(torch.load(MODEL_PATH))
#model.eval()

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.we

<All keys matched successfully>

In [26]:
# Classify new data
def classifyParagraph(paragraph_text):
  encoding_paragraph = tokenizer.encode_plus(
      paragraph_text,
      max_length = MAX_LEN,
      truncation = True,
      add_special_tokens = True,
      return_token_type_ids = False,
      #pad_to_max_length = True,
      padding='max_length',
      return_attention_mask = True,
      return_tensors = 'pt'
      )
  
  input_ids = encoding_paragraph['input_ids'].to(device)
  attention_mask = encoding_paragraph['attention_mask'].to(device)
  output = loaded_model(input_ids, attention_mask)
  print("\n".join(wrap(paragraph_text)))
  with torch.no_grad():
    G = nn.functional.softmax(output, dim=1)[0]
  if CLASSIFY_BY == 'category':
    k = 3
  elif CLASSIFY_BY == 'subcategory':
    k = 5
  #print(G.topk(k))
  for (p, y) in zip(*(G.topk(k))):
    print(f"{label_names[y.item()]} ({100 * p.item():.2f}%)")

In [27]:
# Classification examples
paragraph1 = "Las bebidas azucaradas son perjudiciales para la salud, además tienen un impacto negativo para el medio ambiente."

classifyParagraph(paragraph1)

Las bebidas azucaradas son perjudiciales para la salud, además tienen
un impacto negativo para el medio ambiente.
Eficiencia energética (75.37%)
Consumo eléctrico (14.84%)
Gestión del agua (5.45%)
Tarifas  / mediciones eléctricas (1.50%)
Cambio climático (0.72%)
