# A full training

Install the Transformers and Datasets libraries to run this notebook.

In [1]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
df = pd.read_csv('2000.csv')

In [3]:
df.head()

Unnamed: 0,id,text,category
0,20604786,This paper uses a new data set to estimate the...,Yes
1,8334260,This study was aimed at testing the hypothesis...,Yes
2,62812273,This paper considers a flexible panel data sam...,Yes
3,21740546,This project describes application of an evide...,Yes
4,154379635,Security regulators and the business press hav...,Yes


In [4]:
df.category.value_counts()

Yes    1194
No      816
Name: category, dtype: int64

In [5]:
possible_labels = df.category.unique()

In [6]:
label_dict = {}
for index, possible_labels in enumerate(possible_labels):
  label_dict[possible_labels] = index

In [7]:
label_dict

{'Yes': 0, 'No': 1}

In [8]:
df['label'] = df.category.replace(label_dict)
df.head()

Unnamed: 0,id,text,category,label
0,20604786,This paper uses a new data set to estimate the...,Yes,0
1,8334260,This study was aimed at testing the hypothesis...,Yes,0
2,62812273,This paper considers a flexible panel data sam...,Yes,0
3,21740546,This project describes application of an evide...,Yes,0
4,154379635,Security regulators and the business press hav...,Yes,0


In [9]:
df['text'] = df['text'].astype(str)

In [10]:
df['number_of_words'] = df.text.apply(lambda x: len(x.split()))

In [11]:
df = df[df['number_of_words']<350]

In [12]:
len(df)

1948

In [13]:
df.head()

Unnamed: 0,id,text,category,label,number_of_words
0,20604786,This paper uses a new data set to estimate the...,Yes,0,98
1,8334260,This study was aimed at testing the hypothesis...,Yes,0,206
2,62812273,This paper considers a flexible panel data sam...,Yes,0,159
3,21740546,This project describes application of an evide...,Yes,0,122
4,154379635,Security regulators and the business press hav...,Yes,0,159


In [14]:
df.drop('number_of_words',axis = 1,inplace = True)
df.head()

Unnamed: 0,id,text,category,label
0,20604786,This paper uses a new data set to estimate the...,Yes,0
1,8334260,This study was aimed at testing the hypothesis...,Yes,0
2,62812273,This paper considers a flexible panel data sam...,Yes,0
3,21740546,This project describes application of an evide...,Yes,0
4,154379635,Security regulators and the business press hav...,Yes,0


In [15]:
df.label.value_counts()

0    1156
1     792
Name: label, dtype: int64

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size = 0.15,
    random_state = 17,
    stratify = df.label.values
)

In [18]:
df['data_type'] = ['not_set']*df.shape[0]

In [19]:
df.head()

Unnamed: 0,id,text,category,label,data_type
0,20604786,This paper uses a new data set to estimate the...,Yes,0,not_set
1,8334260,This study was aimed at testing the hypothesis...,Yes,0,not_set
2,62812273,This paper considers a flexible panel data sam...,Yes,0,not_set
3,21740546,This project describes application of an evide...,Yes,0,not_set
4,154379635,Security regulators and the business press hav...,Yes,0,not_set


In [20]:
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [21]:
df.groupby(['label','data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,text,category
label,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,train,982,982,982
0,val,174,174,174
1,train,673,673,673
1,val,119,119,119


In [22]:
df.data_type.value_counts()

train    1655
val       293
Name: data_type, dtype: int64

In [23]:
df.head()

Unnamed: 0,id,text,category,label,data_type
0,20604786,This paper uses a new data set to estimate the...,Yes,0,train
1,8334260,This study was aimed at testing the hypothesis...,Yes,0,train
2,62812273,This paper considers a flexible panel data sam...,Yes,0,train
3,21740546,This project describes application of an evide...,Yes,0,train
4,154379635,Security regulators and the business press hav...,Yes,0,train


In [24]:
!pip install transformers



In [25]:
# loading tokenizer and encoding our data
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [26]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-cased',
    do_lower_case = True
)

In [27]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].text.values,
    add_special_tokens = True,
    return_attention_mask = True,
    padding='longest',
    truncation=True,
    return_tensors = 'pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].text.values,
    add_special_tokens = True,
    return_attention_mask = True,
    padding='longest',
    truncation=True,
    return_tensors = 'pt'
)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

In [28]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)

In [29]:
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [30]:
len(dataset_train)

1655

In [31]:
len(dataset_val)

293

In [32]:
from transformers import BertForSequenceClassification

In [33]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = len(label_dict),
    output_attentions = False,
    output_hidden_states = False
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [34]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [35]:
batch_size = 8
dataloader_train = DataLoader(
    dataset_train,
    #shuffle=True,
    sampler = RandomSampler(dataset_train), 
    batch_size = batch_size
)

dataloader_val = DataLoader(
    dataset_val,
    #shuffle=True,
    sampler = RandomSampler(dataset_val), 
    batch_size = 32
)

In [36]:
!pip install torch



In [37]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [38]:
import transformers

In [39]:
from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler

In [40]:
optimizer = AdamW(model.parameters(), lr=1e-5)



In [41]:
epochs = 16

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    
    num_warmup_steps=0,
    num_training_steps = len(dataloader_train)*epochs
)

In [42]:
import numpy as np
from sklearn.metrics import f1_score

In [43]:
def f1_score_func(pred, labels):
  preds_flat = np.argmax(pred, axis =1).flatten()
  labels_flat = labels.flatten()
  return f1_score(labels_flat, preds_flat, average = 'weighted')

In [44]:
def accuracy_per_class(preds, labels):
  label_dict_inverse = {v: k for k,v in label_dict.items()}

  preds_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()

  for label in np.unique(labels_flat):
    y_preds = preds_flat[labels_flat==label]
    y_true = labels_flat[labels_flat==label]
    print(f'Class:{label_dict_inverse[label]}')
    print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [45]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


In [46]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)


cpu


In [47]:
def evaluate(dataloader_val):
  model.eval()
  loss_val_total = 0
  predictions, true_vals = [], []
  for batch in dataloader_val:
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]
              }
    with torch.no_grad():
      outputs = model(**inputs)
    loss = outputs[0]
    logits = outputs[1]
    loss_val_total += loss.item()

    logits = logits.detach().cpu().numpy()
    label_ids = inputs['labels'].cpu().numpy()
    predictions.append(logits)
    true_vals.append(label_ids)
  loss_val_avg = loss_val_total/len(dataloader_val)

  predictions = np.concatenate(predictions, axis = 0)
  true_vals = np.concatenate(true_vals, axis=0)

  return loss_val_total, predictions,true_vals

In [None]:
for epoch in tqdm(range(1, epochs+1)):
  model.train()
  loss_train_total = 0
  progress_bar = tqdm(dataloader_train, desc='Epoch{:1d}'.format(epoch),
                      leave = False,
                      disable = False)
  for batch in progress_bar:
    model.zero_grad()
 
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids': batch[0],
              'attention_mask': batch[1],
              'labels': batch[2]
    }
    
    outputs = model(**inputs)



    loss = outputs[0]
    loss_train_total += loss.item()
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    optimizer.step()
    scheduler.step()

    progress_bar.set_postfix({'training_loss':'{:.3f}'.format(loss.item()/len(batch))})

  torch.save(model.state_dict(),f'Models/BERT_ft_epoch{epoch}.model')
  tqdm.write(f'\nEpoch {epoch}')
  loss_train_avg = loss_train_total/len(dataloader_train)
  tqdm.write(f'Training loss: {loss_train_avg}')
   #logits = logits.detach().cpu().numpy()
  val_loss, predictions, true_vals = evaluate(dataloader_val)
  val_f1 = f1_score_func(predictions, true_vals)
  tqdm.write(f'Validation loss: {val_loss}')
  tqdm.write(f'F1 Score (weighted): {val_f1}') 
    

  0%|          | 0/16 [00:00<?, ?it/s]

Epoch1:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.6732385188773058
Validation loss: 6.476194441318512
F1 Score (weighted): 0.5263367463026166


Epoch2:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.6558820043784984
Validation loss: 5.942395776510239
F1 Score (weighted): 0.7012710525612904


Epoch3:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.5825581205783835
Validation loss: 5.2347725331783295
F1 Score (weighted): 0.7132426781980953


Epoch4:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.49004949197389075
Validation loss: 4.136270806193352
F1 Score (weighted): 0.8255713884063314


Epoch5:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.37688708255868314
Validation loss: 4.990728586912155
F1 Score (weighted): 0.8475918459936278


Epoch6:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.2913164408148631
Validation loss: 6.620509654283524
F1 Score (weighted): 0.8639005063722809


Epoch7:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.1828944532128708
Validation loss: 4.607147637289017
F1 Score (weighted): 0.8894839062470536


Epoch8:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 8
Training loss: 0.11005594411185506
Validation loss: 4.4085115059278905
F1 Score (weighted): 0.9143645511297438


Epoch9:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 9
Training loss: 0.08952851191664718
Validation loss: 4.351907312870026
F1 Score (weighted): 0.9244524993705932


Epoch10:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 10
Training loss: 0.05499984429623923
Validation loss: 4.508957386715338
F1 Score (weighted): 0.9278226693750662


Epoch11:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 11
Training loss: 0.052723278024668044
Validation loss: 4.684338425111491
F1 Score (weighted): 0.9278226693750662


Epoch12:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 13
Training loss: 0.022183823754678027
Validation loss: 4.596676076063886
F1 Score (weighted): 0.9276891346307556


Epoch14:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 14
Training loss: 0.03486523790659545
Validation loss: 6.200344815850258
F1 Score (weighted): 0.9276891346307556


Epoch15:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 15
Training loss: 0.03346702036001283
Validation loss: 4.986365563061554
F1 Score (weighted): 0.9276891346307556


Epoch16:   0%|          | 0/207 [00:00<?, ?it/s]


Epoch 16
Training loss: 0.010635504123821162
Validation loss: 4.72424460836919
F1 Score (weighted): 0.9276891346307556


In [55]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels = len(label_dict),
                                                      output_attentions = False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [56]:
model.to(device)
pass

In [62]:
model.load_state_dict(
    torch.load('Models/BERT_ft_epoch16.model',
               map_location = torch.device('cpu')))

<All keys matched successfully>

In [63]:
_, predictions, true_vals = evaluate(dataloader_val)

In [64]:
accuracy_per_class(predictions, true_vals)

Class:Yes
Accuracy:169/174

Class:No
Accuracy:103/119

