In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 31.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 48.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

In [None]:
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import json
import pandas as pd
from textwrap import wrap

In [None]:
def json_to_pandas1(json_corpus):
  """Load the subcategory-keyed JSON corpus into a flat DataFrame.

  The file must contain a mapping of the form
  ``{subcategory: {"category": str, "true": [{"paragraph": str, ...}, ...]}}``.
  Every element of a ``"true"`` list becomes one output row.

  Args:
    json_corpus: Path of the JSON file to read.

  Returns:
    A ``pd.DataFrame`` with the columns ``paragraph``, ``category`` and
    ``subcategory`` (in that order), one row per paragraph.

  Raises:
    KeyError: If an entry lacks ``"category"`` or ``"true"``, or a paragraph
      record lacks ``"paragraph"``.
  """
  # Be explicit about the encoding: the corpus contains accented characters
  # and the platform default encoding is not UTF-8 everywhere.
  with open(json_corpus, encoding='utf-8') as f:
    corpus = json.load(f)

  rows = []
  for subcategory, entry in corpus.items():
    category = entry["category"]
    rows.extend((record["paragraph"], category, subcategory) for record in entry["true"])

  # Passing `columns` keeps the three columns even when the corpus is empty.
  return pd.DataFrame(rows, columns=['paragraph', 'category', 'subcategory'])

In [None]:
def json_to_pandas2(json_corpus):
  """Load the list-style JSON corpus into a flat DataFrame.

  The file must contain a list of objects with the keys ``"Categoría"``,
  ``"Subcategoría"`` and ``"content"``. ``content`` is split on newlines and
  every resulting piece (empty ones included) becomes one output row.

  Args:
    json_corpus: Path of the JSON file to read.

  Returns:
    A ``pd.DataFrame`` with the columns ``paragraph``, ``category`` and
    ``subcategory`` (in that order), one row per line of ``content``.

  Raises:
    KeyError: If an object lacks one of the three expected keys.
  """
  # Be explicit about the encoding: keys and text contain accented characters
  # and the platform default encoding is not UTF-8 everywhere.
  with open(json_corpus, encoding='utf-8') as f:
    corpus = json.load(f)

  rows = []
  for document in corpus:
    category = document["Categoría"].strip()
    subcategory = document["Subcategoría"].strip()
    rows.extend((paragraph, category, subcategory) for paragraph in document["content"].split('\n'))

  # Passing `columns` keeps the three columns even when the corpus is empty.
  return pd.DataFrame(rows, columns=['paragraph', 'category', 'subcategory'])

In [None]:
# Mount Google Drive first so that the dataset files below are reachable
from google.colab import drive
drive.mount('/content/drive')

# Locations of the two JSON corpora on the mounted drive
DATASET1_PATH = '/content/drive/My Drive/Naveler/03.TECH/data_discovery/datasets/tf_idf_data_corpus_data_corpus.json'
DATASET2_PATH = '/content/drive/My Drive/Naveler/03.TECH/data_discovery/datasets/tf_idf_data_output_wiki_content.json'

# Parse each corpus with its matching loader, then stack both into one frame
# with a fresh 0..n-1 index
df1 = json_to_pandas1(DATASET1_PATH)
df2 = json_to_pandas2(DATASET2_PATH)
df = pd.concat((df1, df2), ignore_index=True)

Mounted at /content/drive


In [None]:
# Global experiment settings
ind = 0  # 0 -> classify by category, 1 -> classify by subcategory
CLASSIFY_BY = ('category', 'subcategory')[ind]
RANDOM_SEED = 42
MAX_LEN = 250     # length every tokenized paragraph is truncated/padded to
test_prop = 0.2   # fraction of the rows held out as test data
BATCH_SIZE = 16   # number of samples per batch

# Seed both random number generators used by the notebook
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Run on the first GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
  device = torch.device("cuda:0")
else:
  device = torch.device("cpu")
print(device)

cuda:0


In [None]:
# Sanity check: dataset size plus one example row, wrapped for readability
print("Number of samples: {}".format(len(df)))
ind = 10000  # row label of the example to display
print("\nSample example:")
print("\n".join(wrap(df.at[ind, 'paragraph'])))
print("Category: {}".format(df.at[ind, 'category']))
print("Subcategory: {}".format(df.at[ind, 'subcategory']))

Number of samples: 29053

Sample example:
Los redespachos por restricciones del PDBF se comprobarán horariamente
mientras no existan productos cuarto-horarios en los mercados diario e
intradiario y, por tanto, de BOLETÍN OFICIAL DEL ESTADO Martes 29 de
marzo de 2022 Sec. III. Pág. 41336 acuerdo con lo dispuesto en el PO
3.2. Los valores de energía y precio en todos los cuartos de hora de
la misma hora tengan el mismo valor.
Category: Sector eléctrico
Subcategory: Consumo eléctrico


In [None]:
# Enumerate the distinct values of the target column and give each one an
# integer id, in order of first appearance
possible_labels = df[CLASSIFY_BY].unique()
NCLASSES = len(possible_labels)
print("Number of classes: {}\n".format(NCLASSES))

label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}
label_dict

Number of classes: 4



{'Alimentación y bebidas': 3,
 'Impacto energético': 0,
 'Innovación': 1,
 'Sector eléctrico': 2}

In [None]:
# Add the integer label column for the selected target.
# `map` performs a plain dictionary lookup and yields an integer column
# directly; `replace` only does so through implicit downcasting of the
# object column, which newer pandas releases deprecate. Every value of the
# target column is a key of `label_dict` (it was built from that column's
# unique values), so no row is left unmapped.
df['label'] = df[CLASSIFY_BY].map(label_dict)
df.head()

Unnamed: 0,paragraph,category,subcategory,label
0,RELATIVA A REVISAR LA ACTUAL CLASIFICACIÓN DE ...,Impacto energético,Calidad aire,0
1,b) Las actividades en el marco del régimen de ...,Impacto energético,Calidad aire,0
2,c) La compensación de los costes indirectos de...,Impacto energético,Calidad aire,0
3,d) Las actividades relacionadas con vertederos...,Impacto energético,Calidad aire,0
4,e) Las actividades en las que la eliminación a...,Impacto energético,Calidad aire,0


In [None]:
# Number of samples per class in the full dataset (target column chosen by CLASSIFY_BY)
df[CLASSIFY_BY].value_counts()

Sector eléctrico          20934
Impacto energético         4880
Innovación                 3118
Alimentación y bebidas      121
Name: category, dtype: int64

In [None]:
# Define tokenizer: load the vocabulary that belongs to the pretrained
# checkpoint named below (the same name is used later to load the model)
PRE_TRAINED_MODEL_NAME = 'dccuchile/bert-base-spanish-wwm-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/236k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/364 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/648 [00:00<?, ?B/s]

In [None]:
# Tokenization example: split one sentence into tokens and look up their ids
sample_txt = 'Las bebidas azucaradas son aquellas que contienen azúcar añadido.'
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
for caption, shown in (
    ('Frase: ', sample_txt),
    ('Tokens: ', tokens),
    ('Tokens numéricos: ', token_ids),
):
  print(caption, shown)

Frase:  Las bebidas azucaradas son aquellas que contienen azúcar añadido.
Tokens:  ['Las', 'bebidas', 'azucar', '##adas', 'son', 'aquellas', 'que', 'contienen', 'azúcar', 'añadido', '.']
Tokens numéricos:  [1613, 12779, 28787, 1319, 1404, 8587, 1038, 11730, 7585, 12877, 1009]


In [None]:
# Encode the sample sentence the way the model expects it: special tokens
# added, truncated/padded to MAX_LEN, returned as PyTorch tensors
encoding = tokenizer.encode_plus(
    sample_txt,
    add_special_tokens = True,
    max_length = MAX_LEN,
    truncation = True,
    padding = 'max_length',
    return_attention_mask = True,
    return_token_type_ids = False,
    return_tensors = 'pt'
)

In [None]:
# List the fields the tokenizer returned for the sample sentence
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
# Show the encoded sample: its tokens (including the special and padding
# tokens), the matching ids and the attention mask
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0]))
print(encoding['input_ids'][0])
print(encoding['attention_mask'][0])

['[CLS]', 'Las', 'bebidas', 'azucar', '##adas', 'son', 'aquellas', 'que', 'contienen', 'azúcar', 'añadido', '.', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD

In [None]:
# Define class for dataset
class BOEDataset(Dataset):
  """Dataset that tokenizes one paragraph per item.

  Args:
    paragraphs: Indexable sequence of paragraph texts.
    labels: Indexable sequence of integer class ids, aligned with
      ``paragraphs``.
    tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
    max_len: Length every encoded paragraph is truncated/padded to.
  """

  def __init__(self, paragraphs, labels, tokenizer, max_len):
    self.paragraphs = paragraphs
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of paragraphs."""
    return len(self.paragraphs)

  def __getitem__(self, item):
    """Encode the paragraph at position ``item``.

    Returns:
      A dict with the raw ``paragraph`` text, the 1-D ``input_ids`` and
      ``attention_mask`` tensors, and the ``label`` as a ``torch.long`` tensor.
    """
    paragraph = str(self.paragraphs[item])
    label = self.labels[item]
    # Use the tokenizer handed to this instance rather than whatever global
    # `tokenizer` happens to exist in the notebook namespace.
    encoding = self.tokenizer.encode_plus(
        paragraph,
        max_length = self.max_len,
        truncation = True,
        add_special_tokens = True,
        return_token_type_ids = False,
        padding = 'max_length',
        return_attention_mask = True,
        return_tensors = 'pt'
    )
    return {
        'paragraph': paragraph,
        # encode_plus returns one row per input text; flatten drops that axis
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'label': torch.tensor(label, dtype=torch.long)
    }

In [None]:
# Define data loader
def data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
  """Wrap a labelled DataFrame in a ``DataLoader`` of tokenized batches.

  Args:
    df: DataFrame with a ``paragraph`` and a ``label`` column.
    tokenizer: Tokenizer forwarded to ``BOEDataset``.
    max_len: Length every encoded paragraph is truncated/padded to.
    batch_size: Number of samples per batch.
    shuffle: Whether to reshuffle the samples on every pass. Defaults to
      ``False``, i.e. the samples are served in DataFrame order.
    num_workers: Number of loader worker processes. Defaults to 2.

  Returns:
    A ``DataLoader`` over a ``BOEDataset`` built from ``df``.
  """
  dataset = BOEDataset(
      paragraphs = df.paragraph.to_numpy(),
      labels = df.label.to_numpy(),
      tokenizer = tokenizer,
      # Honour the arguments instead of the notebook-level MAX_LEN/BATCH_SIZE
      max_len = max_len
  )
  return DataLoader(dataset, batch_size = batch_size, shuffle = shuffle, num_workers = num_workers)

In [None]:
# Hold out `test_prop` of the rows as test data, stratified on the label so
# both parts keep the class proportions
df_train, df_test = train_test_split(
    df,
    test_size = test_prop,
    stratify = df.label.values,
    random_state = RANDOM_SEED,
)

# One DataLoader per split
train_data_loader = data_loader(df = df_train, tokenizer = tokenizer, max_len = MAX_LEN, batch_size = BATCH_SIZE)
test_data_loader = data_loader(df = df_test, tokenizer = tokenizer, max_len = MAX_LEN, batch_size = BATCH_SIZE)

In [None]:
# Number of samples per class in the training split
df_train[CLASSIFY_BY].value_counts()

Sector eléctrico          16747
Impacto energético         3904
Innovación                 2494
Alimentación y bebidas       97
Name: category, dtype: int64

In [None]:
# Number of samples per class in the test split
df_test[CLASSIFY_BY].value_counts()

Sector eléctrico          4187
Impacto energético         976
Innovación                 624
Alimentación y bebidas      24
Name: category, dtype: int64

In [None]:
# Define class for building model
class BERTTextClassifier(nn.Module):
  """Pretrained BERT encoder followed by dropout and a linear output layer.

  Args:
    n_classes: Number of output scores produced per input sequence.
  """

  def __init__(self, n_classes):
    super().__init__()
    # return_dict=False makes the encoder return a plain tuple
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False)
    self.drop = nn.Dropout(p=0.3)
    self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the unnormalized class scores for a batch of encoded texts."""
    bert_outputs = self.bert(input_ids = input_ids, attention_mask = attention_mask)
    # Only the second element of the encoder output feeds the classifier head
    sentence_vector = bert_outputs[1]
    return self.linear(self.drop(sentence_vector))

In [None]:
# Build model (instantiating it downloads the pretrained weights) and move
# it to the selected device
model = BERTTextClassifier(NCLASSES)
model = model.to(device)

Downloading:   0%|          | 0.00/419M [00:00<?, ?B/s]

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bi

In [None]:
# Check model
#print(model)
# BERT -> 31002 words codified into 768 values
# dropout layer
# linear layer (768xNCLASSES)

In [None]:
# Training parameters
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS # number of batches per epoch times number of epochs
scheduler = get_linear_schedule_with_warmup( # gradually decrease learning rate (step length)
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device) # loss function to be minimized



In [None]:
# Training iteration
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Run one training pass over ``data_loader``.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: Iterable of batches; each batch is a mapping with the keys
      ``input_ids``, ``attention_mask`` and ``label``.
    loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
    optimizer: Optimizer over the parameters of ``model``.
    device: Device the batch tensors are moved to.
    scheduler: Learning-rate scheduler, stepped once per batch.
    n_examples: Number of samples used as the accuracy denominator.

  Returns:
    ``(accuracy, mean_loss)``: accuracy as a double tensor (correct
    predictions / ``n_examples``) and the mean of the per-batch losses
    (``nan`` when ``data_loader`` yields no batch).
  """
  model = model.train()
  losses = []
  correct_predictions = 0
  # Start from clean gradients: a previous run interrupted between
  # backward() and zero_grad() would otherwise leak into the first step.
  optimizer.zero_grad()
  for batch in data_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    outputs = model(input_ids = input_ids, attention_mask = attention_mask)
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, labels)
    correct_predictions += torch.sum(preds == labels)
    losses.append(loss.item())
    loss.backward()
    # Cap the gradient norm before the update
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
  # as_tensor also covers the empty-loader case, where the counter is still
  # the plain int 0 and has no .double() method.
  accuracy = torch.as_tensor(correct_predictions, dtype=torch.double) / n_examples
  mean_loss = np.mean(losses) if losses else float('nan')
  return accuracy, mean_loss

def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate ``model`` on ``data_loader`` without updating it.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: Iterable of batches; each batch is a mapping with the keys
      ``input_ids``, ``attention_mask`` and ``label``.
    loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
    device: Device the batch tensors are moved to.
    n_examples: Number of samples used as the accuracy denominator.

  Returns:
    ``(accuracy, mean_loss)``: accuracy as a double tensor (correct
    predictions / ``n_examples``) and the mean of the per-batch losses
    (``nan`` when ``data_loader`` yields no batch).
  """
  model = model.eval()
  losses = []
  correct_predictions = 0
  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)
      outputs = model(input_ids = input_ids, attention_mask = attention_mask)
      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, labels)
      correct_predictions += torch.sum(preds == labels)
      losses.append(loss.item())
  # as_tensor also covers the empty-loader case, where the counter is still
  # the plain int 0 and has no .double() method.
  accuracy = torch.as_tensor(correct_predictions, dtype=torch.double) / n_examples
  mean_loss = np.mean(losses) if losses else float('nan')
  return accuracy, mean_loss

In [None]:
# Training (high runtime!): every epoch runs one pass over the training data
# followed by one evaluation pass over the test data
for epoch in range(EPOCHS):
  print('Epoch {} de {}'.format(epoch+1, EPOCHS))
  print('------------------')
  train_acc, train_loss = train_model(
      model = model,
      data_loader = train_data_loader,
      loss_fn = loss_fn,
      optimizer = optimizer,
      device = device,
      scheduler = scheduler,
      n_examples = len(df_train),
  )
  test_acc, test_loss = eval_model(
      model = model,
      data_loader = test_data_loader,
      loss_fn = loss_fn,
      device = device,
      n_examples = len(df_test),
  )
  print('Entrenamiento: Loss: {}, accuracy: {}'.format(train_loss, train_acc))
  print('Validación: Loss: {}, accuracy: {}'.format(test_loss, test_acc))
  print('')

Epoch 1 de 5
------------------
Entrenamiento: Loss: 0.310767508890026, accuracy: 0.8817227433095258
Validación: Loss: 0.255849498812145, accuracy: 0.912751677852349

Epoch 2 de 5
------------------
Entrenamiento: Loss: 0.1472999248846741, accuracy: 0.947594871353584
Validación: Loss: 0.2529621165052919, accuracy: 0.9287558079504389

Epoch 3 de 5
------------------
Entrenamiento: Loss: 0.09480671025784583, accuracy: 0.9663970398416659
Validación: Loss: 0.29424937246113875, accuracy: 0.9323696437790399

Epoch 4 de 5
------------------
Entrenamiento: Loss: 0.07432034050626984, accuracy: 0.9719043111608295
Validación: Loss: 0.2867864951489997, accuracy: 0.9375322663913269

Epoch 5 de 5
------------------
Entrenamiento: Loss: 0.05807039098704284, accuracy: 0.976163841321745
Validación: Loss: 0.2864310697145561, accuracy: 0.9397694028566512



In [None]:
# Model path: the file name carries a suffix identifying the target column
s = {'category': 'cat', 'subcategory': 'subcat'}[CLASSIFY_BY]
MODEL_NAME = 'BERT_text_classifier_' + s
MODEL_PATH = '/content/drive/My Drive/Naveler/03.TECH/data_discovery/models/' + MODEL_NAME

In [None]:
# Save model: only the state dict (the weights) is written to MODEL_PATH
torch.save(model.state_dict(), MODEL_PATH)

In [None]:
# Load model: rebuild the architecture, then restore the saved weights
loaded_model = BERTTextClassifier(NCLASSES)
loaded_model = loaded_model.to(device)
# The restored model is only used for inference; eval mode switches the
# dropout layer off so that predictions are deterministic.
loaded_model.eval()
# map_location lets a checkpoint saved from the GPU be restored on whatever
# device this runtime selected (including CPU-only runtimes).
loaded_model.load_state_dict(torch.load(MODEL_PATH, map_location=device))

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bi

<All keys matched successfully>

In [None]:
def accuracy_per_class(preds, labels):
    """Print, for each class present in ``labels``, correct hits over samples.

    Class names are looked up through the notebook-level ``label_dict``.

    Args:
        preds: Array of predicted class ids.
        labels: Array of true class ids, aligned with ``preds``.
    """
    id_to_name = dict((class_id, name) for name, class_id in label_dict.items())

    flat_preds = preds.flatten()
    flat_labels = labels.flatten()

    for class_id in np.unique(flat_labels):
        in_class = flat_labels == class_id
        n_total = int(np.count_nonzero(in_class))
        n_hits = int(np.count_nonzero(flat_preds[in_class] == class_id))
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_hits}/{n_total}\n')

In [None]:
def evaluate(model, data_loader):
  """Collect predicted and true class ids for every sample in ``data_loader``.

  Batches are moved to the notebook-level ``device``.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: Iterable of batches with the keys ``input_ids``,
      ``attention_mask`` and ``label``.

  Returns:
    ``(predictions, labels)`` as two NumPy arrays, concatenated over batches.
  """
  model = model.eval()
  batch_predictions, batch_labels = [], []
  with torch.no_grad():
    for batch in data_loader:
      ids_on_device = batch['input_ids'].to(device)
      mask_on_device = batch['attention_mask'].to(device)
      batch_labels.append(batch['label'].cpu().numpy())
      logits = model(input_ids = ids_on_device, attention_mask = mask_on_device)
      # Predicted class = position of the highest score in each row
      winners = torch.max(logits, dim=1).indices
      batch_predictions.append(winners.detach().cpu().numpy())

  return np.concatenate(batch_predictions, axis=0), np.concatenate(batch_labels, axis=0)

In [None]:
# Calculate accuracy per class on test data, using the model restored from disk
predictions, labels = evaluate(loaded_model, test_data_loader)
accuracy_per_class(predictions, labels)

Class: Impacto energético
Accuracy: 850/976

Class: Innovación
Accuracy: 548/624

Class: Sector eléctrico
Accuracy: 4048/4187

Class: Alimentación y bebidas
Accuracy: 15/24



In [None]:
# Classify new data
def classifyParagraph(paragraph_text):
  """Print a paragraph followed by every class ranked by predicted probability.

  Relies on the notebook-level ``tokenizer``, ``loaded_model``, ``device``,
  ``MAX_LEN`` and ``possible_labels``.

  Args:
    paragraph_text: Text to classify.
  """
  encoding_paragraph = tokenizer.encode_plus(
      paragraph_text,
      max_length = MAX_LEN,
      truncation = True,
      add_special_tokens = True,
      return_token_type_ids = False,
      padding='max_length',
      return_attention_mask = True,
      return_tensors = 'pt'
      )

  input_ids = encoding_paragraph['input_ids'].to(device)
  attention_mask = encoding_paragraph['attention_mask'].to(device)

  # Inference only: make sure dropout is off, and keep the forward pass
  # itself (not just the softmax) out of autograd.
  loaded_model.eval()
  with torch.no_grad():
    output = loaded_model(input_ids, attention_mask)
    probabilities = nn.functional.softmax(output, dim=1)[0]

  print("\n".join(wrap(paragraph_text)))
  k = len(possible_labels)
  # topk over all k classes returns (probabilities, class ids) sorted from
  # most to least likely
  for (p, y) in zip(*(probabilities.topk(k))):
    print(f"{possible_labels[y.item()]} ({100 * p.item():.2f}%)")

In [None]:
# Classification example: print the class probabilities for a hand-written sentence
paragraph = "Las bebidas azucaradas son perjudiciales para la salud, además tienen un impacto negativo para el medio ambiente."

classifyParagraph(paragraph)

Las bebidas azucaradas son perjudiciales para la salud, además tienen
un impacto negativo para el medio ambiente.
Alimentación y bebidas (99.80%)
Innovación (0.16%)
Impacto energético (0.03%)
Sector eléctrico (0.01%)
