In [1]:
import os

import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset
from tqdm.notebook import tqdm
from transformers import BertForSequenceClassification
from transformers import BertTokenizer



In [2]:
# Read the labelled texts. The CSV contains an 'Unnamed: 0' column
# (presumably a previously saved index) that is not needed, so it is dropped.
data = pd.read_csv('data_df.csv').drop(columns=['Unnamed: 0'])
data.head()

Unnamed: 0,class_name,text
0,Aile bascisini itirmeye gore muavinet,almamısınızsa ala bilməzsiz ailə başcısını iti...
1,Aile bascisini itirmeye gore muavinet,atası rəhmətə getmiş 11 sinif şagirdinin alaca...
2,Aile bascisini itirmeye gore muavinet,eri ölmüs qadin sosial yardimda ala bilmez 58 ...
3,Aile bascisini itirmeye gore muavinet,erim atasi rehmete getmish usaqin atasi rehmet...
4,Aile bascisini itirmeye gore muavinet,eri olmush qadin ne muavinet ala biler


In [3]:
# Give every distinct class name an integer id, numbered in the order the
# names are returned by `unique()`.
possible_labels = data.class_name.unique()
label_dict = {class_name: class_id for class_id, class_name in enumerate(possible_labels)}
label_dict

{'Aile bascisini itirmeye gore muavinet': 0,
 'Ananin usaqlara gore guzestli sertlerle yasa gore emek pensiyasi': 1,
 'Bosanma': 2}

In [4]:
# Encode class names as integer ids. `Series.map` is used rather than
# `Series.replace`: replacing strings with ints depends on pandas silently
# downcasting the object column to int, a behaviour pandas has deprecated.
# `map` produces an integer column directly, which is what the later
# `torch.tensor(...label.values)` calls require.
data['label'] = data.class_name.map(label_dict)

In [5]:
from sklearn.model_selection import train_test_split

# Stratified 85/15 split over the row indices, so each class appears in both
# partitions; the fixed random_state keeps the split repeatable.
all_labels = data.label.values
X_train, X_val, y_train, y_val = train_test_split(
    data.index.values,
    all_labels,
    test_size=0.15,
    random_state=42,
    stratify=all_labels,
)

# Record on every row which partition it was assigned to.
data['data_type'] = 'not_set'
data.loc[X_train, 'data_type'] = 'train'
data.loc[X_val, 'data_type'] = 'val'

# Row counts per (class, label, partition) as a sanity check of the split.
data.groupby(['class_name', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
class_name,label,data_type,Unnamed: 3_level_1
Aile bascisini itirmeye gore muavinet,0,train,14
Aile bascisini itirmeye gore muavinet,0,val,3
Ananin usaqlara gore guzestli sertlerle yasa gore emek pensiyasi,1,train,15
Ananin usaqlara gore guzestli sertlerle yasa gore emek pensiyasi,1,val,3
Bosanma,2,train,18
Bosanma,2,val,3


In [16]:
model_name = "bert-base-multilingual-uncased"
max_seq_length = 256  # every encoded example is padded/truncated to this many tokens

tokenizer = BertTokenizer.from_pretrained(model_name,
                                          do_lower_case=True)


def encode_texts(texts):
    """Tokenize an iterable of strings into fixed-length PyTorch tensors.

    Args:
        texts: iterable of raw text strings (a list or a NumPy array).

    Returns:
        The tokenizer's batch encoding; its 'input_ids' and 'attention_mask'
        entries are tensors with `max_seq_length` tokens per example.
    """
    return tokenizer.batch_encode_plus(
        list(texts),
        add_special_tokens=True,
        return_attention_mask=True,
        # `padding`/`truncation` replace the deprecated `pad_to_max_length`
        # flag; truncation is now requested explicitly instead of relying on
        # the tokenizer's fallback (which emitted a warning).
        padding='max_length',
        truncation=True,
        max_length=max_seq_length,
        return_tensors='pt',
    )


# Texts and labels are taken from the same filtered rows, so they stay aligned.
train_rows = data[data.data_type == 'train']
val_rows = data[data.data_type == 'val']

encoded_data_train = encode_texts(train_rows.text.values)
encoded_data_val = encode_texts(val_rows.text.values)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows.label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows.label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:

# Pretrained multilingual BERT with a sequence-classification head that has
# one output per entry in label_dict.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
    ignore_mismatched_sizes=True,
)

Downloading pytorch_model.bin:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

In [18]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 3

# Training examples are visited in random order.
train_sampler = RandomSampler(dataset_train)
dataloader_train = DataLoader(dataset_train, sampler=train_sampler, batch_size=batch_size)

# Validation examples are visited in dataset order.
validation_sampler = SequentialSampler(dataset_val)
dataloader_validation = DataLoader(dataset_val, sampler=validation_sampler, batch_size=batch_size)

In [27]:

from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# `transformers.AdamW` is deprecated (and missing from recent transformers
# releases), so the PyTorch implementation is used instead. weight_decay=0.0
# keeps the default of the transformers optimizer; torch's own default
# would be 0.01.
optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-8,
                  weight_decay=0.0)

epochs = 10

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) * epochs steps, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)



In [28]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Weighted F1 score of arg-max predictions against reference labels.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: NumPy array of reference class ids.

    Returns:
        The value returned by ``f1_score`` with ``average='weighted'``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    reference_classes = labels.flatten()
    return f1_score(reference_classes, predicted_classes, average='weighted')

def accuracy_per_class(preds, labels):
    """Print, for each class present in ``labels``, correct/total counts.

    Args:
        preds: 2-D array of per-class scores, one row per sample.
        labels: NumPy array of reference class ids (keys of the inverted
            module-level ``label_dict``).

    Returns:
        None; the report is written with ``print``.
    """
    id_to_name = dict((class_id, name) for name, class_id in label_dict.items())

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        # Restrict to the samples whose reference label is this class.
        in_class = actual == class_id
        n_total = len(actual[in_class])
        n_correct = int(np.count_nonzero(predicted[in_class] == class_id))
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

In [29]:
# All batches (and the reloaded model) are placed on the CPU.
device = torch.device("cpu")

In [30]:
import random

seed_val = 17

# Seed every random number generator used in this notebook with the same value:
# Python's `random`, NumPy's global generator, and PyTorch (CPU and all CUDA devices).
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

def evaluate(dataloader_val):
    """Score the module-level ``model`` on every batch of a dataloader.

    The model is switched to eval mode and each forward pass runs under
    ``torch.no_grad()``.

    Args:
        dataloader_val: iterable of batches whose first three items are
            input ids, attention mask and labels; must support ``len()``.

    Returns:
        Tuple ``(loss_val_avg, predictions, true_vals)``: the summed batch
        losses divided by the number of batches, the logits of all batches
        concatenated along axis 0, and the matching labels, both as NumPy
        arrays.
    """
    model.eval()

    running_loss = 0
    logits_per_batch = []
    labels_per_batch = []

    for raw_batch in dataloader_val:
        moved = tuple(tensor.to(device) for tensor in raw_batch)

        with torch.no_grad():
            outputs = model(input_ids=moved[0],
                            attention_mask=moved[1],
                            labels=moved[2])

        # With labels supplied, the first output is the loss and the second
        # the logits.
        running_loss += outputs[0].item()
        logits_per_batch.append(outputs[1].detach().cpu().numpy())
        labels_per_batch.append(moved[2].cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)
    predictions = np.concatenate(logits_per_batch, axis=0)
    true_vals = np.concatenate(labels_per_batch, axis=0)

    return loss_val_avg, predictions, true_vals
    
# Fine-tuning loop: one pass over dataloader_train per epoch, a checkpoint
# after every epoch, then loss and weighted F1 on the validation set.

# The batches are moved to `device` below, so the model must live there too.
model.to(device)
# torch.save does not create missing directories.
os.makedirs('data_volume', exist_ok=True)

for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # With labels supplied, the first output is the scalar batch loss.
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as-is. It used to be divided by len(batch), but
        # `batch` is the (input_ids, attention_mask, labels) tuple, so that
        # divisor was always 3 regardless of the batch size.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})


    torch.save(model.state_dict(), f'data_volume/finetuned_multilingual_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/16 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.7740887515246868
Validation loss: 0.781107227007548
F1 Score (Weighted): 0.7833333333333334


Epoch 2:   0%|          | 0/16 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.608492823317647
Validation loss: 0.5109472771485647
F1 Score (Weighted): 1.0


Epoch 3:   0%|          | 0/16 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.46936918515712023
Validation loss: 0.3900478035211563
F1 Score (Weighted): 1.0


Epoch 4:   0%|          | 0/16 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.2898265542462468
Validation loss: 0.27087781329949695
F1 Score (Weighted): 1.0


Epoch 5:   0%|          | 0/16 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.193332992028445
Validation loss: 0.1568263197938601
F1 Score (Weighted): 1.0


Epoch 6:   0%|          | 0/16 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.12402531877160072
Validation loss: 0.17500878125429153
F1 Score (Weighted): 0.8857142857142857


Epoch 7:   0%|          | 0/16 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.08504247665405273
Validation loss: 0.0987250084678332
F1 Score (Weighted): 1.0


Epoch 8:   0%|          | 0/16 [00:00<?, ?it/s]


Epoch 8
Training loss: 0.05908830533735454
Validation loss: 0.09326894705494244
F1 Score (Weighted): 1.0


Epoch 9:   0%|          | 0/16 [00:00<?, ?it/s]


Epoch 9
Training loss: 0.04930536018218845
Validation loss: 0.08546831458806992
F1 Score (Weighted): 1.0


Epoch 10:   0%|          | 0/16 [00:00<?, ?it/s]


Epoch 10
Training loss: 0.04316428815945983
Validation loss: 0.08473335641125838
F1 Score (Weighted): 1.0


In [33]:
# Rebuild the classifier architecture, then overwrite its weights with the
# checkpoint saved after epoch 10.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
    ignore_mismatched_sizes=True,
)
model.to(device)

checkpoint_path = 'data_volume/finetuned_multilingual_BERT_epoch_10.model'
finetuned_weights = torch.load(checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(finetuned_weights)

# Per-class hit counts on the validation set; the loss is not needed here.
_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

Class: Aile bascisini itirmeye gore muavinet
Accuracy: 3/3

Class: Ananin usaqlara gore guzestli sertlerle yasa gore emek pensiyasi
Accuracy: 3/3

Class: Bosanma
Accuracy: 3/3

