In [1]:
!nvidia-smi

Tue Jun 28 13:23:24 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
! pip install -q datasets
! pip install torch==1.6.0
! pip install transformers
! pip install SentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import pandas as pd
import numpy as np
import os
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/kaggle//us-patent/train.csv')
eval_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/kaggle//us-patent/test.csv')

In [5]:
df['input'] = 'TEXT1:' + df.context + '; TEXT2:' + df.target + '; ANC1:' + df.anchor
df.head()

Unnamed: 0,id,anchor,target,context,score,input
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,TEXT1:A47; TEXT2:abatement of pollution; ANC1:...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,TEXT1:A47; TEXT2:act of abating; ANC1:abatement
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,TEXT1:A47; TEXT2:active catalyst; ANC1:abatement
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,TEXT1:A47; TEXT2:eliminating process; ANC1:aba...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,TEXT1:A47; TEXT2:forest region; ANC1:abatement


In [6]:
from datasets import Dataset,DatasetDict
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})

In [7]:
from transformers import AlbertTokenizer
import torch

albert_model = 'tals/albert-base-mnli'
tokenizer = AlbertTokenizer.from_pretrained(albert_model)

In [8]:
tokenizer.tokenize("A platypus is an ornithorhynchus anatinus.")

['▁a',
 '▁platy',
 'pus',
 '▁is',
 '▁an',
 '▁',
 'ornith',
 'o',
 'rhynch',
 'us',
 '▁an',
 'at',
 'inus',
 '.']

In [9]:
tokenizer("A platypus is an ornithorhynchus anatinus.")

{'input_ids': [2, 21, 26138, 8032, 25, 40, 13, 22210, 111, 29507, 267, 40, 721, 9585, 9, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
def tok_func(x): return tokenizer(x['input'])

In [11]:
tok_ds = ds.map(tok_func, batched = True)



  0%|          | 0/37 [00:00<?, ?ba/s]

In [12]:
tok_ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 36473
})

In [13]:
tok_ds = tok_ds.rename_columns({'score':'labels'})

In [14]:
from numpy.random import normal,seed,uniform
np.random.seed(2022)
train_and_val = tok_ds.train_test_split(0.25, seed = 2022)
train_and_val

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

In [15]:
eval_df['input'] = 'TEXT1:' + df.context + '; TEXT2:' + df.target + '; ANC1:' + df.anchor
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched = True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
eval_ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 36
})

In [17]:
import torch.nn as nn
import torch.utils.data as Data
import torch.optim as optim

class MyDataSet(Data.Dataset):
  def __init__(self, data, labels):
    self.data = data
    self.labels = labels
    self.tokenizer = AlbertTokenizer.from_pretrained(albert_model)

  def __getitem__(self, idx):
    text = self.data[idx]
    label = self.labels[idx]
    inputs = self.tokenizer(text, return_tensors = 'pt', padding = 'max_length', max_length = 30, truncation = True)
    input_ids = inputs.input_ids.squeeze(0)
    token_type_ids = inputs.token_type_ids.squeeze(0)
    attention_mask = inputs.attention_mask.squeeze(0)
    return input_ids, token_type_ids, attention_mask, label

  def __len__(self):
    return len(self.data)

In [18]:
data = tok_ds['input']
label = tok_ds['labels']

dataset = MyDataSet(data, label)
dataloader = Data.DataLoader(dataset, batch_size = 128, shuffle = True)

In [19]:
data[:5]

['TEXT1:A47; TEXT2:abatement of pollution; ANC1:abatement',
 'TEXT1:A47; TEXT2:act of abating; ANC1:abatement',
 'TEXT1:A47; TEXT2:active catalyst; ANC1:abatement',
 'TEXT1:A47; TEXT2:eliminating process; ANC1:abatement',
 'TEXT1:A47; TEXT2:forest region; ANC1:abatement']

In [22]:
label[:5]

[0.5, 0.75, 0.25, 0.5, 0.0]

In [23]:
import torch.nn as nn
from transformers import AlbertModel
class MyModel(nn.Module):
  def __init__(self):
    super(MyModel, self).__init__()
    self.albert = AlbertModel.from_pretrained(albert_model)
    self.linear = nn.Linear(768, 3)

  def forward(self, input_ids, token_type_ids, attention_mask):
    output = self.albert(input_ids, token_type_ids, attention_mask).pooler_output
    output = self.linear(output)

    return output

In [24]:
device = torch.device('cuda')
model = MyModel().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 1e-8)

Some weights of the model checkpoint at tals/albert-base-mnli were not used when initializing AlbertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
for ep in range(8):
  
  for input_ids, token_type_ids, attention_mask, label in dataloader:
    input_ids, token_type_ids, attention_mask, label = input_ids.to(device), token_type_ids.to(device), attention_mask.to(device), label.to(device)
    label = label.long()
    pred = model(input_ids, token_type_ids, attention_mask)
    loss = loss_fn(pred, label)
    print(loss.item())

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

1.1875933408737183
1.1893223524093628
1.137373685836792
1.1700977087020874
1.1547333002090454
1.1382195949554443
1.1523511409759521
1.1631666421890259
1.1569130420684814
1.1788380146026611
1.150728464126587
1.1895352602005005
1.1685689687728882
1.189612627029419
1.1509522199630737
1.157251238822937
1.1945807933807373
1.1825227737426758
1.161388635635376
1.1716880798339844
1.1522777080535889
1.178099274635315
1.1727304458618164
1.1901851892471313
1.1515142917633057
1.1508690118789673
1.1344910860061646
1.1754119396209717
1.179215908050537
1.1511051654815674


KeyboardInterrupt: ignored