In [None]:
# Install the third-party packages this notebook imports into the runtime.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases.
!pip install transformers
# NOTE(review): presumably the source of the `transformers.adapters` module that ABTEBert
# imports when adapter=True — confirm it coexists with the plain `transformers` install above.
!pip install adapter-transformers
# `contractions` is used by normalize() to expand contractions in the tokens.
!pip install contractions
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m102.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1
Looking in i

In [None]:
# Download NLTK's 'punkt' tokenizer data into the runtime.
# NOTE(review): presumably required by the TextBlob tokenization used in normalize() — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from transformers import BertModel
from transformers import get_scheduler

import torch
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader

import time
import numpy as np
import os
from tqdm import tqdm
import pandas as pd

In [None]:
# Mount Google Drive at /drive; the data and checkpoint paths used in this notebook
# are all rooted at /drive/MyDrive.
from google.colab import drive
drive.mount('/drive')

Mounted at /drive


In [None]:
# spelling check
from textblob import TextBlob
import contractions

def normalize(data):
    """Expand contractions and spell-correct every token of the 'Tokens' column.

    Each cell of ``data['Tokens']`` is the text form of a flat list of words,
    e.g. ``"['the', 'food', '.']"``. Every word is processed on its own and
    exactly one output item is produced per input item, so the result keeps
    the same length as the original list (an expanded contraction such as
    ``'do not'`` stays a single list item). Correcting the joined sentence and
    reading it back through ``TextBlob.words`` would drop punctuation and
    change the item count, which breaks the positional pairing with the other
    per-token label columns.

    Args:
        data: DataFrame with a 'Tokens' column of stringified word lists.

    Returns:
        A copy of ``data`` whose 'Tokens' column holds the normalized lists in
        the same stringified format. The input frame is not modified.
    """
    def correct(token_list):
        # Same parsing convention as the rest of the notebook: drop the
        # quotes, strip the brackets, split on ', '.
        words = token_list.replace("'", "").strip("][").split(', ')
        cleaned = []
        for word in words:
            expanded = contractions.fix(word)
            corrected = str(TextBlob(expanded).correct())
            # Apostrophes are removed by every consumer of this column anyway;
            # dropping them here keeps str(list) from switching to double quotes.
            cleaned.append(corrected.replace("'", ""))
        return str(cleaned)

    d = data.copy()
    d['Tokens'] = d['Tokens'].apply(correct)
    return d

# Read the raw train/test splits, normalize them (test split first, then train)
# and write the normalized copies next to the raw data.
data, data_test = (
    pd.read_csv('/drive/MyDrive/Colab Notebooks/absa/data/restaurants_train.csv'),
    pd.read_csv('/drive/MyDrive/Colab Notebooks/absa/data/restaurants_test.csv'),
)

data_test, data = normalize(data_test), normalize(data)

data_test.to_csv(
    '/drive/MyDrive/Colab Notebooks/absa/data/normalized/restaurants_test.csv',
    index=False,
)
data.to_csv(
    '/drive/MyDrive/Colab Notebooks/absa/data/normalized/restaurants_train.csv',
    index=False,
)

KeyboardInterrupt: ignored

# Class ABTEDataset

In [None]:
class ABTEDataset(Dataset):
    """Dataset that turns rows of stringified word/tag/polarity lists into tensors.

    The first three columns of ``df`` are read by position as the word list,
    the tag list and the polarity list, each stored as text such as
    ``"['good', 'food']"`` or ``"[0, 1]"``. Every word is split into word
    pieces by ``tokenizer``; its tag and polarity are repeated once per piece.
    """

    def __init__(self, df, tokenizer):
        """Keep a reference to the frame and to the word-piece tokenizer."""
        self.df = df
        self.tokenizer = tokenizer

    @staticmethod
    def _split_list(text, drop_quotes=False):
        """Split the text form of a flat list into its items; '[]' gives []."""
        if drop_quotes:
            text = text.replace("'", "")
        text = text.strip("][")
        return text.split(', ') if text else []

    def __getitem__(self, idx):
        """Return ``(word_pieces, ids, tags, polarities)`` for row ``idx``.

        ``word_pieces`` is a list of strings; the other three are 1-D
        ``torch.long`` tensors with one entry per word piece.

        A warning is emitted when the three lists of the row differ in length,
        because labels are paired with words purely by position. Processing
        then continues as before: surplus labels are ignored, and a word with
        no label raises ``IndexError``.
        """
        import warnings

        tokens, tags, pols = self.df.iloc[idx, :3].values

        tokens = self._split_list(tokens, drop_quotes=True)
        tags = self._split_list(tags)
        pols = self._split_list(pols)

        if not (len(tokens) == len(tags) == len(pols)):
            warnings.warn(
                'row {}: {} tokens, {} tags, {} polarities; labels may be misaligned'
                .format(idx, len(tokens), len(tags), len(pols))
            )

        bert_tokens, bert_tags, bert_pols = [], [], []
        for i, word in enumerate(tokens):
            pieces = self.tokenizer.tokenize(word)
            bert_tokens += pieces
            bert_tags += [int(tags[i])] * len(pieces)
            bert_pols += [int(pols[i])] * len(pieces)

        bert_ids = self.tokenizer.convert_tokens_to_ids(bert_tokens)

        # Explicit dtype: torch.tensor([]) would otherwise be float32, which
        # does not match the integer tensors of non-empty rows.
        ids_tensor = torch.tensor(bert_ids, dtype=torch.long)
        tags_tensor = torch.tensor(bert_tags, dtype=torch.long)
        pols_tensor = torch.tensor(bert_pols, dtype=torch.long)
        return bert_tokens, ids_tensor, tags_tensor, pols_tensor

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)


# Class ABTEBert

In [None]:
class ABTEBert(torch.nn.Module):
    """BERT encoder followed by a linear layer producing 3 scores per position.

    With ``adapter=True`` the encoder is loaded as ``BertAdapterModel`` (imported
    lazily from ``transformers.adapters``); otherwise as ``BertModel``.
    NOTE(review): nothing in this class adds or activates an adapter, so the
    flag only selects the encoder class here — confirm that is intended.
    """

    def __init__(self, pretrain_model, adapter=True):
        super().__init__()
        self.adapter = adapter
        if not adapter:
            self.bert = BertModel.from_pretrained(pretrain_model)
        else:
            # Imported here so the non-adapter path does not need the module.
            from transformers.adapters import BertAdapterModel
            self.bert = BertAdapterModel.from_pretrained(pretrain_model)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 3)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, ids_tensors, tags_tensors, masks_tensors):
        """Return the cross-entropy loss when tags are given, else the raw scores.

        ``masks_tensors`` is only forwarded to the encoder as its attention
        mask; no mask is applied when the loss is computed, so every position
        of the flattened batch takes part in it.
        """
        encoder_out = self.bert(
            input_ids=ids_tensors,
            attention_mask=masks_tensors,
            return_dict=False,
        )
        # First element of the encoder's tuple output feeds the classifier.
        scores = self.linear(encoder_out[0])

        if tags_tensors is None:
            return scores

        flat_scores = scores.view(-1, 3)
        flat_tags = tags_tensors.view(-1)
        return self.loss_fn(flat_scores, flat_tags)


In [None]:
class ABTEModel:
    """Training / inference wrapper around ``ABTEBert`` for aspect-term tagging.

    Holds the network, the word-piece tokenizer and a ``trained`` flag that
    guards inference when no checkpoint path is supplied.
    """

    def __init__(self, tokenizer, adapter=True):
        self.model = ABTEBert('bert-base-uncased', adapter=adapter)
        self.tokenizer = tokenizer
        self.trained = False
        self.adapter = adapter

    def padding(self, samples):
        """Collate ``ABTEDataset`` items into zero-padded batch tensors.

        Args:
            samples: list of ``(word_pieces, ids, tags, polarities)`` tuples.

        Returns:
            ``(ids, tags, polarities, masks)``, each of shape
            (batch, longest sequence). ``masks`` is 1 wherever the id is
            non-zero and 0 elsewhere, i.e. id 0 is treated as padding.
            NOTE(review): tags are padded with 0 as well, so padded positions
            carry label 0 and take part in the training loss — confirm intended.
        """
        from torch.nn.utils.rnn import pad_sequence

        ids_tensors = pad_sequence([s[1] for s in samples], batch_first=True)
        tags_tensors = pad_sequence([s[2] for s in samples], batch_first=True)
        pols_tensors = pad_sequence([s[3] for s in samples], batch_first=True)
        masks_tensors = (ids_tensors != 0).long()

        return ids_tensors, tags_tensors, pols_tensors, masks_tensors

    def load_model(self, model, path):
        """Load a state dict from ``path`` into ``model`` (non-strict)."""
        # map_location='cpu' lets a checkpoint saved from a GPU run be read on
        # a CPU-only machine; load_state_dict then copies the values into the
        # model's parameters on whatever device they already live on.
        state = torch.load(path, map_location='cpu')
        model.load_state_dict(state, strict=False)

    def save_model(self, model, name):
        """Write ``model``'s state dict to the file ``name``."""
        torch.save(model.state_dict(), name)

    def _prepare_for_inference(self, load_model, device):
        """Optionally load a checkpoint, then put the model on ``device`` in eval mode.

        Raises:
            Exception: if ``load_model`` is given but does not exist, or if it
                is None and the model has not been trained in this session.
        """
        if load_model is not None:
            if os.path.exists(load_model):
                self.load_model(self.model, load_model)
            else:
                raise Exception('Model not found')
        elif not self.trained:
            raise Exception('model not trained')

        # Inputs are sent to `device`, so the weights must be there too;
        # eval() switches dropout off so predictions are deterministic.
        self.model = self.model.to(device)
        self.model.eval()

    def _predict_sentence(self, sentence, device):
        """Tag one sentence with the already-prepared model (no loading, no checks)."""
        word_pieces = list(self.tokenizer.tokenize(sentence))
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        input_tensor = torch.tensor([ids], dtype=torch.long).to(device)

        with torch.no_grad():
            outputs = self.model(input_tensor, None, None)
            _, predictions = torch.max(outputs, dim=2)

        return word_pieces, predictions[0].tolist(), outputs

    def train(self, data, epochs, device, batch_size=32, lr=1e-5, load_model=None, lr_schedule=True):
        """Fine-tune the model and write one checkpoint per epoch.

        Args:
            data: DataFrame accepted by ``ABTEDataset``.
            epochs: number of full passes over ``data``.
            device: torch device (or device string) to train on.
            batch_size: samples per optimisation step.
            lr: AdamW learning rate.
            load_model: optional checkpoint to start from; a missing file is
                reported and training starts from the current weights.
            lr_schedule: if True, decay the learning rate linearly to zero
                over all training steps.

        Side effects:
            Creates the output directory (its name depends on ``self.adapter``
            and ``lr_schedule``), rewrites the loss history file after every
            epoch and saves ``model_lr{lr}_epochs{epoch}_batch{batch_size}.pkl``
            after every epoch, where ``epoch`` is the zero-based epoch index.
        """
        if load_model is not None:
            if os.path.exists(load_model):
                self.load_model(self.model, load_model)
                self.trained = True
            else:
                print("load_model not found")

        # dataset and loader
        ds = ABTEDataset(data, self.tokenizer)
        loader = DataLoader(ds, batch_size=batch_size, shuffle=True, collate_fn=self.padding)

        self.model = self.model.to(device)
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=lr)
        num_training_steps = epochs * len(loader)
        lr_scheduler = None
        if lr_schedule:
            lr_scheduler = get_scheduler(name="linear", optimizer=optimizer,
                                         num_warmup_steps=0, num_training_steps=num_training_steps)

        # Same four directory names as before, built from the two flags.
        dir_name = "/drive/MyDrive/Colab Notebooks/absa/model_ABTE"
        if self.adapter:
            dir_name += "_adapter"
        if lr_schedule:
            dir_name += "_scheduler"
        os.makedirs(dir_name, exist_ok=True)

        losses_path = '{}/losses_lr{}_epochs{}_batch{}.txt'.format(dir_name, lr, epochs, batch_size)
        self.losses = []
        all_data = len(loader)

        for epoch in range(epochs):
            # from_pretrained() hands the encoder back in eval mode and the
            # inference helpers call eval(); re-enable training behaviour here.
            self.model.train()
            current_times = []
            t0 = time.time()

            # Iterate the loader itself: one shuffled sweep over every batch
            # per epoch, which is also the step count the scheduler was given.
            for finish_data, batch in enumerate(loader, start=1):
                ids_tensors, tags_tensors, _, masks_tensors = batch
                ids_tensors = ids_tensors.to(device)
                tags_tensors = tags_tensors.to(device)
                masks_tensors = masks_tensors.to(device)

                loss = self.model(ids_tensors=ids_tensors, tags_tensors=tags_tensors,
                                  masks_tensors=masks_tensors)
                self.losses.append(loss.item())
                loss.backward()
                optimizer.step()
                if lr_scheduler is not None:
                    lr_scheduler.step()
                optimizer.zero_grad()

                current_time = round(time.time() - t0, 3)
                current_times.append(current_time)
                print("epoch: {}\tbatch: {}/{}\tloss: {}\tbatch time: {}\ttotal time: {}"
                      .format(epoch, finish_data, all_data, loss.item(), current_time, sum(current_times)))
                # Restart the clock so the next measurement includes batch loading.
                t0 = time.time()

            # Written once per epoch, next to the checkpoint, instead of
            # rewriting the whole growing file after every batch.
            np.savetxt(losses_path, self.losses)
            self.save_model(self.model, '{}/model_lr{}_epochs{}_batch{}.pkl'.format(dir_name, lr, epoch, batch_size))
            self.trained = True

    def history(self):
        """Return the per-step training losses; raises if the model was never trained."""
        if self.trained:
            return self.losses
        else:
            raise Exception('Model not trained')

    def predict(self, sentence, load_model=None, device='cpu'):
        """Tag a single raw sentence.

        Args:
            sentence: text to tokenize and tag.
            load_model: optional checkpoint path to load first.
            device: device to run on.

        Returns:
            ``(word_pieces, predictions, outputs)``: the word pieces, the
            arg-max class index per word piece, and the raw score tensor.

        Raises:
            Exception: see ``_prepare_for_inference``.
        """
        self._prepare_for_inference(load_model, device)
        return self._predict_sentence(sentence, device)

    def predict_batch(self, data, load_model=None, device='cpu'):
        """Tag every row of ``data`` and return predictions with the reference tags.

        The checkpoint (if any) is loaded once up front rather than once per
        sentence.

        Returns:
            ``(predictions, tags_real)``: one list of predicted class indices
            per row, and the parsed 'Tags' of each row cut to the length of
            its prediction.
            NOTE(review): predictions are per word piece while 'Tags' is
            parsed per list item; the two are only length-truncated here, not
            aligned — confirm this is what downstream code expects.
        """
        tags_real = [t.strip('][').split(', ') for t in data['Tags']]
        tags_real = [[int(i) for i in t] for t in tags_real]

        self._prepare_for_inference(load_model, device)

        predictions = []
        for i in tqdm(range(len(data))):
            # Positional access so a non-default index does not break the loop.
            sentence = data['Tokens'].iloc[i]
            sentence = sentence.replace("'", "").strip("][").split(', ')
            sentence = ' '.join(sentence)
            _, p, _ = self._predict_sentence(sentence, device)
            predictions.append(p)
            tags_real[i] = tags_real[i][:len(p)]

        return predictions, tags_real

    def _accuracy(self, x, y):
        """Fraction of positions where ``x`` and ``y`` are equal."""
        return np.mean(np.array(x) == np.array(y))

    def test(self, dataset, load_model=None, device='cpu'):
        """Evaluate on ``dataset`` and return ``(accuracy, classification_report)``.

        Only positions whose attention mask is 1 are scored; padded positions
        are excluded so they cannot inflate the metrics.

        Raises:
            Exception: see ``_prepare_for_inference``.
        """
        from sklearn.metrics import classification_report

        self._prepare_for_inference(load_model, device)

        # dataset and loader (order is irrelevant for the metrics, so no shuffling)
        ds = ABTEDataset(dataset, self.tokenizer)
        loader = DataLoader(ds, batch_size=50, shuffle=False, collate_fn=self.padding)

        pred = []
        trueth = []
        with torch.no_grad():
            for ids_tensors, tags_tensors, _, masks_tensors in tqdm(loader):
                ids_tensors = ids_tensors.to(device)
                tags_tensors = tags_tensors.to(device)
                masks_tensors = masks_tensors.to(device)

                outputs = self.model(ids_tensors=ids_tensors, tags_tensors=None, masks_tensors=masks_tensors)
                _, p = torch.max(outputs, dim=2)

                real_positions = masks_tensors.bool()
                pred += p[real_positions].tolist()
                trueth += tags_tensors[real_positions].tolist()

        acc = self._accuracy(pred, trueth)
        # labels= pins the three classes so the report still builds when one
        # of them happens to be absent from this dataset.
        class_report = classification_report(trueth, pred, labels=[0, 1, 2],
                                             target_names=['none', 'start of AT', 'mark of AT'])
        return acc, class_report

    def accuracy(self, data, load_model=None, device='cpu'):
        """Return only the accuracy computed by ``test``."""
        a, p = self.test(data, load_model=load_model, device=device)
        return a

# Train the ABTEModel

In [None]:
from transformers import BertTokenizer

# Hyperparameters for this run.
batch = 8
# Keep the expression as written: str(lr) is embedded in the checkpoint file
# names, and 3*1e-5 evaluates to 3.0000000000000004e-05, which is the text the
# existing checkpoint paths contain.
lr = 3*1e-5
epochs = 5

# Seed the RNGs so batch shuffling and dropout start from the same state on
# every run. GPU kernels may still introduce small run-to-run differences.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

#load
data = pd.read_csv('/drive/MyDrive/Colab Notebooks/absa/data/normalized/restaurants_train.csv')
data_test = pd.read_csv('/drive/MyDrive/Colab Notebooks/absa/data/normalized/restaurants_test.csv')

# Must match the directory ABTEModel.train() writes to for adapter=True, lr_schedule=True.
dir_name  = "/drive/MyDrive/Colab Notebooks/absa/model_ABTE_adapter_scheduler"

# define parameters for model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

adapter = True
lr_schedule = True
# define model
modelABTE = ABTEModel(tokenizer, adapter=adapter)
modelABTE.train(data, batch_size=batch, lr=lr, epochs=epochs, device=DEVICE, lr_schedule=lr_schedule)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertAdapterModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


epoch: 0	batch: 1/450	loss: 1.115323543548584	batch time: 2.757	total time: 2.757
epoch: 0	batch: 2/450	loss: 0.7111770510673523	batch time: 0.068	total time: 2.825
epoch: 0	batch: 3/450	loss: 0.4058113694190979	batch time: 0.047	total time: 2.8720000000000003
epoch: 0	batch: 4/450	loss: 0.40110647678375244	batch time: 0.051	total time: 2.9230000000000005
epoch: 0	batch: 5/450	loss: 0.30614346265792847	batch time: 0.051	total time: 2.9740000000000006
epoch: 0	batch: 6/450	loss: 0.5392554402351379	batch time: 0.05	total time: 3.0240000000000005
epoch: 0	batch: 7/450	loss: 0.5894246697425842	batch time: 0.049	total time: 3.0730000000000004
epoch: 0	batch: 8/450	loss: 0.4976477324962616	batch time: 0.049	total time: 3.1220000000000003
epoch: 0	batch: 9/450	loss: 0.27380773425102234	batch time: 0.051	total time: 3.1730000000000005
epoch: 0	batch: 10/450	loss: 0.30877479910850525	batch time: 0.051	total time: 3.2240000000000006
epoch: 0	batch: 11/450	loss: 0.4344360828399658	batch time: 0.0

# Evaluate the trained model on the test and train sets

In [None]:
# load model and predict
# Checkpoints are named after the zero-based epoch index, so the last epoch of
# this run is `epochs - 1`. Building the name from the run's own hyperparameters
# keeps it in sync with what train() wrote instead of hard-coding the float text.
model_path = '{}/model_lr{}_epochs{}_batch{}.pkl'.format(dir_name, lr, epochs - 1, batch)

test_accuracy, test_report = modelABTE.test(data_test, load_model=model_path, device=DEVICE)
test_pred, test_targets = modelABTE.predict_batch(data_test, load_model=model_path, device=DEVICE)

train_accuracy, train_report = modelABTE.test(data, load_model=model_path, device=DEVICE)
train_pred, train_targets = modelABTE.predict_batch(data, load_model=model_path, device=DEVICE)

100%|██████████| 23/23 [00:06<00:00,  3.32it/s]
100%|██████████| 1119/1119 [13:40<00:00,  1.36it/s]
100%|██████████| 73/73 [00:20<00:00,  3.56it/s]
100%|██████████| 3602/3602 [43:52<00:00,  1.37it/s]


In [None]:
# Persist the evaluation artefacts of this run under <dir_name>/results.
import os

results_dir = dir_name + '/results'
# open() / to_csv() below fail if the folder is missing, so create it first.
os.makedirs(results_dir, exist_ok=True)
# Shared suffix identifying the run in every file name.
run_tag = 'lr{}_epochs{}_batch{}'.format(lr, epochs, batch)

#report
with open('{}/test_report_{}.csv'.format(results_dir, run_tag), 'w') as f:
    f.write(test_report + '\n')

with open('{}/train_report_{}.csv'.format(results_dir, run_tag), 'w') as f:
    f.write(train_report + '\n')

#predictions
data_test['Predicted'] = test_pred
data_test['Actual'] = test_targets
data_test.to_csv('{}/test_pred_{}.csv'.format(results_dir, run_tag), index=False)

data['Predicted'] = train_pred
data['Actual'] = train_targets
data.to_csv('{}/train_pred_{}.csv'.format(results_dir, run_tag), index=False)

#accuracy
test_accuracy = np.array(test_accuracy)
train_accuracy = np.array(train_accuracy)

with open('{}/test_accuracy_{}.csv'.format(results_dir, run_tag), 'w') as f:
    f.write(str(test_accuracy))
with open('{}/train_accuracy_{}.csv'.format(results_dir, run_tag), 'w') as f:
    f.write(str(train_accuracy))

In [None]:
from textblob.blob import Sentence
from transformers import BertTokenizer

# Rebuild the tokenizer and a fresh model, then tag one example sentence on the
# CPU using the checkpoint written after the last training epoch.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

adapter, lr_schedule = True, True

# define model
modelABTE = ABTEModel(tokenizer, adapter=adapter)

dir_name  = "/drive/MyDrive/Colab Notebooks/absa/model_ABTE_adapter_scheduler"
model_path = f"{dir_name}/model_lr3.0000000000000004e-05_epochs4_batch8.pkl"

sentence = "Atmosphere is chill and cool but the staff is also really friendly."
word_pieces, predictions, outputs = modelABTE.predict(
    sentence,
    load_model=model_path,
    device='cpu',
)





Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertAdapterModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertAdapterModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertAdapterModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Show the word pieces, the predicted class per piece, and the raw scores, one per line.
print(word_pieces, predictions, outputs, sep='\n')

['atmosphere', 'is', 'chill', 'and', 'cool', 'but', 'the', 'staff', 'is', 'also', 'really', 'friendly', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
tensor([[[ 3.0768,  2.1194, -4.5677],
         [ 6.4338, -1.8679, -3.4791],
         [ 6.6185, -2.8331, -2.7230],
         [ 5.9619, -1.4387, -3.2626],
         [ 6.8212, -3.0068, -2.5609],
         [ 6.1326, -0.9728, -3.8838],
         [ 5.4816, -1.2545, -3.4891],
         [ 3.4995,  2.2640, -5.2561],
         [ 1.5518,  2.4109, -3.4966],
         [ 6.3532, -1.8145, -3.5202],
         [ 6.8627, -2.2187, -3.2864],
         [ 6.9981, -2.7160, -2.9055],
         [ 5.9484, -1.3807, -3.5187]]])
