In [1]:
import os

In [2]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from utils import clean_text, le


In [3]:
SAMPLE_SEED = 42     # fixed seed so the subsamples are reproducible across runs
TRAIN_FRAC = 0.02    # fraction of the filtered training split kept for experiments
TEST_FRAC = 0.2      # fraction of the filtered test split kept for evaluation
NO_GOLD_LABEL = '-'  # gold_label value of the rows this notebook discards


def load_snli_split(path, frac=None, seed=SAMPLE_SEED):
    """Load one SNLI JSON-lines split and drop rows without a usable label.

    Parameters
    ----------
    path : str
        Path to a ``snli_1.0_*.jsonl`` file (one JSON record per line).
    frac : float or None
        If given, keep only this fraction of the filtered rows, drawn with
        ``DataFrame.sample``. ``None`` keeps every filtered row in file order.
    seed : int
        ``random_state`` used for the subsample.

    Returns
    -------
    pandas.DataFrame
        The filtered (and optionally subsampled) rows with a fresh 0..n-1 index.
    """
    frame = pd.read_json(path, lines=True)
    # Rows whose gold_label is '-' carry no class to learn from or score against
    # (per the SNLI README this marks examples without annotator consensus).
    frame = frame[frame['gold_label'] != NO_GOLD_LABEL]
    if frac is not None:
        frame = frame.sample(frac=frac, random_state=seed)
    return frame.reset_index(drop=True)


train_df = load_snli_split('data/snli_1.0_train.jsonl', frac=TRAIN_FRAC)
test_df = load_snli_split('data/snli_1.0_test.jsonl', frac=TEST_FRAC)
# The dev split gets the same label filter as train/test so that all three
# frames share one label set; it is not subsampled.
dev_df = load_snli_split('data/snli_1.0_dev.jsonl')

In [4]:
# Ask torch.hub for the 'tokenizer' entry point of the transformers hub repo,
# forwarding the checkpoint name 'bert-base-cased' to it.
hub_repo = 'huggingface/pytorch-transformers'
tokenizer = torch.hub.load(hub_repo, 'tokenizer', 'bert-base-cased')

Using cache found in C:\Users\statix/.cache\torch\hub\huggingface_pytorch-transformers_main
  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Run clean_text over both sentence columns of the train and test frames,
# writing the cleaned strings back into the same columns.
for frame in (train_df, test_df):
    for column in ('sentence1', 'sentence2'):
        frame.loc[:, column] = frame.loc[:, column].apply(clean_text)

In [6]:
# Make sure the per-split directories exist; existing ones are left alone.
for split_dir in ('data/train', 'data/test'):
    os.makedirs(split_dir, exist_ok=True)

In [8]:
from dataset import Dataset, DataLoader

In [9]:
# Wrap the cleaned training frame in the Dataset class imported from the
# `dataset` module. The cell's log output shows that constructing it touches
# the torch.hub cache — presumably it loads its own tokenizer; confirm in dataset.
train_set = Dataset(train_df)

Using cache found in C:\Users\statix/.cache\torch\hub\huggingface_pytorch-transformers_main


In [10]:
# Inspect the first example. Per the output below it is a 4-tuple of int32
# tensors: three length-50 vectors (unpacked later as token ids, segment ids
# and attention mask) plus a scalar label.
# NOTE(review): the segment-id vector stays 1 over the zero-padded positions;
# presumably harmless because the attention mask is 0 there — confirm in Dataset.
train_set[0]

(tensor([  101,   170,  1372,  1104,  1685,  3287,  1107,  2221,  5947, 11449,
          2661,  1106,  5152,  1154,   170,  4528,  1106,  1147,  5947, 24165,
           102,  1103,  3287,  1132,  1773,  1107,  1103,  5282,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        dtype=torch.int32),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1], dtype=torch.int32),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0], dtype=torch.int32),
 tensor(0, dtype=torch.int32))

In [5]:
# Iterate the training set in shuffled batches of two examples.
train_loader = DataLoader(
    train_set,
    batch_size=2,
    shuffle=True,
)

In [6]:
from models import BertModel

model = BertModel()

# Smoke test: push only the first batch through the network, then stop.
# The label `y` is unpacked but not used by the forward pass.
for batch in train_loader:
    tokens_tensor, segments_tensors, attention_tensor, y = batch
    output = model(tokens_tensor, segments_tensors, attention_tensor)
    break

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


In [8]:
# Display the result of the single forward pass: one row per example in the
# batch of two, three values per row. The grad_fn in the output shows that
# softmax is the model's final operation.
output

tensor([[0.3755, 0.3991, 0.2253],
        [0.3758, 0.4155, 0.2086]], grad_fn=<SoftmaxBackward0>)