# Сборка датасета

Датасет представлен в трёх файлах: `dev.jsonl`, `train.jsonl` и `test.jsonl`. Тестовый я пока не брал, все остальные сложил в одну папку, далее будет их чтение и превращение в человеческий формат.

In [124]:
from pathlib import Path
import pytorch_lightning as pl
import torch
import json
import os
import iobes
import pandas as pd
import re

from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from pytorch_lightning import Trainer

from nltk.data import load
from nltk.tokenize import TreebankWordTokenizer

from transformers import BertForTokenClassification, AdamW, get_linear_schedule_with_warmup, logging
from tokenizers import BertWordPieceTokenizer
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

In [15]:

class RuNNEPreprocessor:
    def __init__(self, DATASET_PATH=Path("public_dat")):
        # self.dev = pd.read_json(f'{DATASET_PATH}/dev.jsonl', lines=True)
        # self.test = pd.read_json(f'{DATASET_PATH}/test.jsonl', lines=True)
        self.train = pd.read_json(DATASET_PATH / 'train.jsonl', lines=True)
        self.train['sentences'] = self.train.sentences.apply(lambda x: x.replace('«', '\"').replace('»', '\"'))

    def get_pos(self, sent: str, tokens: list[str]):
        pos = []
        start = 0
        for token in tokens:
            t_start = sent.find(token, start)
            if t_start != -1:
                pos.append((t_start, t_start + len(token) - 1))
                start = t_start + len(token)
        return pos
    
    def label_preprocessing(self):
        self.all_labels = set(self.train.ners.apply(lambda x: [y[2] for y in x]).sum())
        self.dual_labels = {'B-' + x for x in self.all_labels}.union({'I-' + x for x in self.all_labels})
        print(len(self.all_labels), self.all_labels)
        self.label2id = {v: k for k, v in enumerate(self.dual_labels)}
        self.id2label = {k: v for k, v in enumerate(self.dual_labels)}
    
    def tokenizer_transform(self):

        tokenizer = TreebankWordTokenizer()
        punctuation_patterns = [
            (re.compile(r'(\w)\.'), '\1 \.'),
            (re.compile(r'([^\s])(-|\|)([^\s])'), r'\1 \2 \3'),
            (re.compile(r'(\d+):(\d)+'), r'\1 : \2')
        ]

        for pattern, replacement in punctuation_patterns:
            if (pattern, replacement) not in tokenizer.PUNCTUATION:
                tokenizer.PUNCTUATION.append((pattern, replacement))

        def tokenize_(sentence, tokenizer):
            tokens_ = tokenizer.tokenize(sentence)
            spans_ = tokenizer.span_tokenize(sentence)

            tokens_with_spans = [(token, start, end - 1) for token, (start, end) in zip(tokens_, spans_)]

            return tokens_with_spans
        def get_spans(sentence, tokenizer):
            spans_ = tokenizer.span_tokenize(sentence)
            spans = [(start, end - 1) for start, end in spans_]
            return spans
        
        self.train['tokens'] = self.train.sentences.apply(lambda x: tokenize_(x, tokenizer))
        self.train['spans'] = self.train.sentences.apply(lambda x: get_spans(x, tokenizer))

    def map_ner(self, ner: tuple[int, int, str], spans: list[tuple[int, int]]):
        def left_binary_search(spans, ner):
            left, right = 0, len(spans)
            while right - left > 1:
                mid = (right + left) >> 1
                if spans[mid][0] <= ner[0]:
                    left = mid
                else:
                    right = mid

            return left
        
        def right_binary_search(spans, ner):
            left, right = -1, len(spans) -1
            while right - left > 1:
                mid = (right + left) >> 1
                if spans[mid][1] >= ner[1]:
                    right = mid
                else:
                    left = mid
            return right

        left_span = left_binary_search(spans, ner)
        right_span = right_binary_search(spans, ner)
        
        assert left_span <= right_span, (left_span, spans[left_span], right_span, spans[right_span], ner)
        return (left_span, right_span), (spans[left_span][0], spans[right_span][1])
        
    def span_ners(self):
        self.train['span_ners'] = self.train.apply(lambda x: [self.map_ner(y, x['spans']) for y in x['ners']], axis=1)

    def create_labels(self):
        def create_labels_(ners, spans):
            labels = [[0 for _ in self.dual_labels] for _ in spans]
            for n in ners:
                (i, j), _ = self.map_ner(n, spans)
                labels[i][self.label2id['B-' + n[2]]] = 1
                for k in range(i + 1, j + 1):
                    labels[k][self.label2id['I-' + n[2]]] = 1
            return labels

        def label_tokens(row):
            return create_labels_(row.ners, row.spans)

        self.train['labels'] = self.train.apply(label_tokens, axis=1)


    def preprocess_data(self):
        self.label_preprocessing()
        self.tokenizer_transform()
        self.span_ners()
        self.create_labels()
        return self.train

# Пример использования
preprocessor = RuNNEPreprocessor()
train = preprocessor.preprocess_data()
train

Unnamed: 0,ners,sentences,id,tokens,spans,span_ners,labels
0,"[[0, 5, CITY], [16, 23, PERSON], [34, 41, PERS...",Бостон взорвали Тамерлан и Джохар Царнаевы из ...,0,"[(Бостон, 0, 5), (взорвали, 7, 14), (Тамерлан,...","[(0, 5), (7, 14), (16, 23), (25, 25), (27, 32)...","[((0, 0), (0, 5)), ((2, 2), (16, 23)), ((5, 5)...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,"[[21, 28, PROFESSION], [53, 67, ORGANIZATION],...",Умер избитый до комы гитарист и сооснователь г...,1,"[(Умер, 0, 3), (избитый, 5, 11), (до, 13, 14),...","[(0, 3), (5, 11), (13, 14), (16, 19), (21, 28)...","[((4, 4), (21, 28)), ((9, 10), (53, 67)), ((16...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,"[[0, 4, PERSON], [37, 42, COUNTRY], [47, 76, O...",Путин подписал распоряжение о выходе России из...,2,"[(Путин, 0, 4), (подписал, 6, 13), (распоряжен...","[(0, 4), (6, 13), (15, 26), (28, 28), (30, 35)...","[((0, 0), (0, 4)), ((5, 5), (37, 42)), ((7, 9)...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,"[[0, 11, PERSON], [36, 47, PROFESSION], [49, 6...",Бенедикт XVI носил кардиостимулятор\nПапа Римс...,3,"[(Бенедикт, 0, 7), (XVI, 9, 11), (носил, 13, 1...","[(0, 7), (9, 11), (13, 17), (19, 34), (36, 39)...","[((0, 1), (0, 11)), ((4, 5), (36, 47)), ((6, 7...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,"[[0, 4, PERSON], [17, 29, ORGANIZATION], [48, ...",Обама назначит в Верховный суд латиноамериканк...,4,"[(Обама, 0, 4), (назначит, 6, 13), (в, 15, 15)...","[(0, 4), (6, 13), (15, 15), (17, 25), (27, 29)...","[((0, 0), (0, 4)), ((3, 4), (17, 29)), ((6, 6)...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
...,...,...,...,...,...,...,...
514,"[[42, 46, COUNTRY], [82, 87, COUNTRY], [104, 1...",Глава Малайзии: мы не хотим противостоять Кита...,514,"[(Глава, 0, 4), (Малайзии, 6, 13), (:, 14, 14)...","[(0, 4), (6, 13), (14, 14), (16, 17), (19, 20)...","[((7, 7), (42, 46)), ((13, 13), (82, 87)), ((1...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
515,"[[1, 4, PRODUCT], [31, 33, FACILITY], [35, 44,...","""Союз"" впервые пристыковался к МКС за 6 часов\...",515,"[(``, 0, 0), (Союз, 1, 4), ('', 5, 5), (впервы...","[(0, 0), (1, 4), (5, 5), (7, 13), (15, 27), (2...","[((1, 1), (1, 4)), ((6, 6), (31, 33)), ((7, 9)...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
516,"[[0, 4, PERSON], [8, 12, PERSON], [45, 52, AGE...",Трамп и Путин сделали совместное заявление к 7...,516,"[(Трамп, 0, 4), (и, 6, 6), (Путин, 8, 12), (сд...","[(0, 4), (6, 6), (8, 12), (14, 20), (22, 31), ...","[((0, 0), (0, 4)), ((2, 2), (8, 12)), ((7, 9),...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
517,"[[0, 9, NATIONALITY], [58, 72, PERSON], [101, ...",Российский магнат устроил самую дорогую свадьб...,517,"[(Российский, 0, 9), (магнат, 11, 16), (устрои...","[(0, 9), (11, 16), (18, 24), (26, 30), (32, 38...","[((0, 0), (0, 9)), ((8, 9), (58, 72)), ((12, 1...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,..."


In [16]:
t = train.copy()
t.tokens = t.tokens.apply(lambda x: [y[0] for y in x])
t = t[['tokens', 'labels']]

all_tokens = t.tokens.sum()
all_labels = t.labels.sum()

len(all_tokens), len(all_labels)

(135505, 135505)

In [17]:
WINDOW = 32

sequences, labels = [], []

for t in range(len(all_tokens) - WINDOW):
    sequences.append(all_tokens[t:t+WINDOW])
    labels.append(all_labels[t:t+WINDOW])
    
len(sequences), len(labels)

(135473, 135473)

In [18]:
batched = pd.DataFrame([*zip(sequences, labels)], columns=['tokens', 'label'])
batched.head()

Unnamed: 0,tokens,label
0,"[Бостон, взорвали, Тамерлан, и, Джохар, Царнае...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,"[взорвали, Тамерлан, и, Джохар, Царнаевы, из, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
2,"[Тамерлан, и, Джохар, Царнаевы, из, Северного,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,..."
3,"[и, Джохар, Царнаевы, из, Северного, Кавказа, ...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,"[Джохар, Царнаевы, из, Северного, Кавказа, 19,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [19]:
train_split = batched.sample(frac=0.9)
val_split = batched[~batched.index.isin(train_split.index)]

len(train_split), len(val_split), len(train)

(121926, 13547, 519)

In [20]:
token_label = train_split.copy()


In [21]:
# from datasets import DatasetDict, Dataset

# dataset = DatasetDict({
#     'train': Dataset.from_pandas(train_split.reset_index(drop=True)),
#     'val': Dataset.from_pandas(val_split.reset_index(drop=True))
# })

# dataset

In [22]:
from transformers import AutoTokenizer, BertModel, BertConfig

bert_tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
import torch
import torch.nn as nn

class BertForNER(BertModel):
    def __init__(self, config: BertConfig):
        super().__init__(config)
        self.classifier_head = nn.Linear(config.hidden_size, 58)
        self.__pos_weight = torch.full((1, 1, 58), 58)
        
    def forward(self, return_loss = True, **kwargs):
        labels = kwargs.pop('labels', None)
        kwargs.pop('output_hidden_states', None)
        
        output = super().forward(**kwargs, return_dict=True, output_hidden_states=True)
        preds = self.classifier_head(output.hidden_states[-1])
        
        output['predictions'] = preds
        if labels is not None:
            loss = torch.nn.functional.binary_cross_entropy_with_logits(
                preds,
                labels,
                pos_weight=self.__pos_weight.to(preds.device)
            )     
        else:
            loss = None       
    
        return loss, output if labels is not None and return_loss else output

In [24]:
model = BertForNER.from_pretrained('DeepPavlov/rubert-base-cased')

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForNER: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForNER from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNER from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForNER were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and

In [25]:

def prepare_dataset(batch):
    tokens = [bert_tokenizer.convert_tokens_to_ids(x) for x in batch['tokens']]
        
    input_ids = torch.tensor(tokens, dtype=torch.long)
    labels = torch.tensor(batch['label'], dtype=torch.float32)
    
    return {
        'input_ids': input_ids,
        'labels': labels
    }    

In [35]:
# dataset.map(prepare_dataset, batched=True, batch_size=256).remove_columns('tokens').save_to_disk('./dataset/tokenized')

Map:   0%|          | 0/121926 [00:00<?, ? examples/s]

Map: 100%|██████████| 121926/121926 [09:06<00:00, 223.05 examples/s]
Map: 100%|██████████| 13547/13547 [01:26<00:00, 156.45 examples/s]
Saving the dataset (6/6 shards): 100%|██████████| 121926/121926 [00:06<00:00, 18711.70 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 13547/13547 [00:00<00:00, 37625.06 examples/s]


In [28]:
from datasets import DatasetDict

ds = DatasetDict.load_from_disk('./dataset/tokenized')

In [29]:
ds = ds.remove_columns('label')

In [26]:
%env CUDA_VISIBLE_DEVICES=1,2,3

env: CUDA_VISIBLE_DEVICES=1,2,3


In [26]:
import numpy as np 
def compute_metrics(outputs):
    predictions: np.ndarray = outputs.predicitions
    labels: np.ndarray = outputs.labels
    
    predictions = predictions.flatten()
    labels = predictions.flatten()
    accuracy = ((predictions == 1) & (labels == 1)).sum() / max(1, (predictions == 1).sum())

    return {
        'accuracy': accuracy
    }

In [52]:
from tqdm import tqdm

device = 'cuda:7' if torch.cuda.is_available() else 'cpu'
model.to(device)
epochs = 3
accuracy, recall, precision, n = 0, 0, 0, 0
for epoch in tqdm(range(epochs)):
    for batch in tqdm(ds['val']):
        tokens = torch.tensor(batch['input_ids'], dtype=torch.long, device=device).unsqueeze(0)
        labels = torch.tensor(batch['labels'], dtype=torch.long, device=device).unsqueeze(0)
        outputs = model(input_ids=tokens)[1].predictions

        
        tags = (torch.nn.functional.sigmoid(outputs).squeeze() > .5).to(torch.long)
        tp = ((labels == 1) & (tags == 1)).sum()
        tn = ((labels == 0) & (tags == 0)).sum()
        fp = ((labels == 0) & (tags == 1)).sum()
        fn = ((labels == 1) & (tags == 0)).sum()
        precision += tp / (tp + fp)
        recall += tp / (tp + fn)
        accuracy += (tp + fp) / labels.numel()
        n += 1
    
accuracy / n, precision / n, recall / n

100%|██████████| 13547/13547 [25:05<00:00,  9.00it/s]
100%|██████████| 13547/13547 [28:49<00:00,  7.83it/s]
100%|██████████| 13547/13547 [25:16<00:00,  8.93it/s]
100%|██████████| 3/3 [1:19:11<00:00, 1583.96s/it]


(tensor(0.4728), tensor(0.0082), tensor(nan))

In [121]:
import torch

def convert_to_submit(labels: list[torch.Tensor], spans: list[tuple[int, int]]):
    # labels of shape (sequence_length, num_classes), binary tensor (0 or 1)
    # spans are pairs (begin, end)
    # assert len(labels) == len(spans)
    
    start_label_ids = {v for k, v in label2id.items() if k.startswith('B-')}
    # segment_label_ids = {v for k, v in label2id if k.startswith('I-')}
    
    ners = []
    current_ners = []
    for i, label in enumerate(labels):
        index = torch.arange(58)
        predicted = index[label == 1].tolist()
        expanded = [False] * len(current_ners)
        new_ners = []
        for p in predicted:
            if p not in start_label_ids:
                for j, c in enumerate(current_ners):
                    if c[0] == id2label[p][2:]:
                        # expanding
                        c = (c[0], c[1], spans[i][1])
                        expanded[j] = True
            else:
                new_ners.append((id2label[p][2:], *spans[i]))
        
        ners.extend([(c[1], c[2], c[0]) for j, c in enumerate(current_ners) if not expanded[j]])
        current_ners = [c for j, c in enumerate(current_ners) if expanded[j]]
        current_ners += new_ners
        
    return ners
        

In [86]:
text = train.sentences.iloc[0][:256]
tokens = tokenizer.tokenize(text)
spans = [*tokenizer.span_tokenize(text)]
bert_tokens = list(map(bert_tokenizer.convert_tokens_to_ids, tokens))

text, tokens, spans, bert_tokens

('Бостон взорвали Тамерлан и Джохар Царнаевы из Северного Кавказа\n\n19 апреля 2013 года в пригороде Бостона  проходит спецоперация по поимке 19-летнего Джохара Царнаева, подозреваемого в теракте на Бостонском марафоне 15 апреля и в смертельном ранении полицей',
 ['Бостон',
  'взорвали',
  'Тамерлан',
  'и',
  'Джохар',
  'Царнаевы',
  'из',
  'Северного',
  'Кавказа',
  '19',
  'апреля',
  '2013',
  'года',
  'в',
  'пригороде',
  'Бостона',
  'проходит',
  'спецоперация',
  'по',
  'поимке',
  '19',
  '-',
  'летнего',
  'Джохара',
  'Царнаева',
  ',',
  'подозреваемого',
  'в',
  'теракте',
  'на',
  'Бостонском',
  'марафоне',
  '15',
  'апреля',
  'и',
  'в',
  'смертельном',
  'ранении',
  'полицей'],
 [(0, 6),
  (7, 15),
  (16, 24),
  (25, 26),
  (27, 33),
  (34, 42),
  (43, 45),
  (46, 55),
  (56, 63),
  (65, 67),
  (68, 74),
  (75, 79),
  (80, 84),
  (85, 86),
  (87, 96),
  (97, 104),
  (106, 114),
  (115, 127),
  (128, 130),
  (131, 137),
  (138, 140),
  (140, 141),
  (141, 1

In [87]:
outputs = model(input_ids=torch.tensor(bert_tokens, dtype=torch.long).unsqueeze(0))[1].predictions
outputs.shape

torch.Size([1, 39, 58])

In [88]:
tags = (torch.nn.functional.sigmoid(outputs) > 0.69).to(torch.long)
tags.shape

torch.Size([1, 39, 58])

In [89]:
(tags == 1).sum()

tensor(36)

In [95]:
tags.squeeze().size(), spans

(torch.Size([291, 58]),
 [(0, 6),
  (7, 15),
  (16, 23),
  (24, 31),
  (32, 36),
  (37, 38),
  (39, 45),
  (46, 49),
  (50, 57),
  (58, 62),
  (63, 72),
  (73, 79),
  (80, 82),
  (83, 88),
  (89, 98),
  (99, 104),
  (105, 107),
  (108, 115),
  (116, 117),
  (118, 124),
  (124, 125),
  (126, 134),
  (135, 139),
  (140, 144),
  (145, 146),
  (147, 153),
  (154, 162),
  (163, 169),
  (170, 185),
  (186, 192),
  (193, 200),
  (201, 206),
  (206, 207),
  (208, 220),
  (221, 226),
  (227, 236),
  (237, 243),
  (243, 244),
  (245, 256),
  (257, 262),
  (263, 270),
  (271, 278),
  (279, 285),
  (286, 294),
  (294, 295),
  (297, 304),
  (305, 311),
  (312, 313),
  (314, 321),
  (322, 326),
  (327, 337),
  (338, 339),
  (340, 347),
  (347, 348),
  (349, 354),
  (355, 363),
  (364, 366),
  (367, 380),
  (381, 390),
  (391, 399),
  (400, 401),
  (402, 404),
  (405, 412),
  (412, 413),
  (414, 415),
  (416, 424),
  (425, 426),
  (427, 432),
  (432, 433),
  (433, 439),
  (440, 451),
  (452, 456),
  

In [91]:
preds = convert_to_submit(tags.squeeze(), spans)

In [70]:
print([(x[0], text[x[1]:x[2]]) for x in preds])

[('STATE_OR_PROVINCE', 'Бостон'), ('ORGANIZATION', 'взорвали'), ('ORGANIZATION', 'Тамерлан'), ('PERCENT', 'и'), ('ORGANIZATION', 'и'), ('TIME', 'Джохар'), ('FAMILY', 'из'), ('ORGANIZATION', '19'), ('COUNTRY', 'пригороде'), ('MONEY', '-'), ('PENALTY', 'марафоне')]


In [73]:
test_dataset = pd.read_json('public_dat/test.jsonl', lines=True)
test_dataset

Unnamed: 0,senences,id
0,Владелец «Бирмингема» получил шесть лет тюрьмы...,584
1,Акция протеста на Майдане Независимости объявл...,585
2,Фольксваген может перейти под контроль Порше \...,586
3,В Москве покажут фильмы Чарли Чаплина с живой ...,587
4,Чулпан Хаматова сыграет главную роль в фильме ...,588
...,...,...
60,ОБСЕ назвала референдум о статусе Крыма незако...,644
61,Египетского студента могут выслать из страны з...,645
62,Геннадий Онищенко отправлен в отставку\nГеннад...,646
63,Племянник Алишера Усманова разбился в ДТП\nВид...,647


In [122]:
ans = []
for id, text in zip(test_dataset.id, test_dataset.senences):
    tokens = tokenizer.tokenize(text)
    spans = [*tokenizer.span_tokenize(text)]
    bert_tokens = list(map(bert_tokenizer.convert_tokens_to_ids, tokens))
    if len(bert_tokens)>512:
        bert_tokens=bert_tokens[:512]
    outputs = model(input_ids=torch.tensor(bert_tokens, dtype=torch.long).unsqueeze(0))[1].predictions
    tags = (torch.nn.functional.sigmoid(outputs) > 0.69).to(torch.long)
    # print(tags)
    # print(type(spans))
    preds = convert_to_submit(tags.squeeze(), spans)
    ans.append({'id':id, 'ners': preds})

print(ans)



[{'id': 584, 'ners': [(9, 21, 'TIME'), (9, 21, 'COUNTRY'), (9, 21, 'ORGANIZATION'), (22, 29, 'PERCENT'), (22, 29, 'ORGANIZATION'), (30, 35, 'TIME'), (36, 39, 'PERCENT'), (40, 46, 'PERCENT'), (40, 46, 'TIME'), (126, 127, 'TIME'), (145, 148, 'TIME'), (145, 148, 'ORGANIZATION'), (149, 157, 'TIME'), (169, 178, 'TIME'), (179, 190, 'TIME'), (197, 207, 'TIME'), (208, 213, 'TIME'), (214, 221, 'TIME'), (222, 226, 'TIME'), (228, 234, 'TIME'), (235, 240, 'TIME'), (240, 241, 'PERCENT'), (242, 247, 'TIME'), (248, 249, 'TIME'), (248, 249, 'ORGANIZATION'), (250, 259, 'TIME'), (260, 270, 'TIME'), (271, 272, 'TIME'), (271, 272, 'STATE_OR_PROVINCE'), (273, 279, 'TIME'), (280, 283, 'TIME'), (285, 291, 'TIME'), (292, 294, 'TIME'), (292, 294, 'PENALTY'), (294, 295, 'PERCENT'), (294, 295, 'TIME'), (296, 297, 'TIME'), (298, 303, 'TIME'), (304, 309, 'TIME'), (304, 309, 'CRIME'), (310, 319, 'PERCENT'), (310, 319, 'TIME'), (320, 330, 'TIME'), (331, 333, 'TIME'), (350, 352, 'TIME'), (374, 381, 'TIME'), (403, 405

In [123]:
import json

with open('test.jsonl', 'w', encoding='utf-8') as f:
    for item in ans:
        # Convert the item to a JSON string and write it to the file
        f.write(json.dumps(item) + '\n')

In [119]:
ans.to_json('test.jsonl')

AttributeError: 'list' object has no attribute 'to_json'