In [1]:
import os
import re
import nltk
import json
import pickle
from tqdm import tqdm
import numpy as np
import torch
import sqlite3
import pandas as pd
import random
import transformers
import logging

In [2]:
import itertools
import sys
if '/home/ryparmar/experimental-martin/pretraining/src/' not in sys.path:
    sys.path.append('/home/ryparmar/experimental-martin/pretraining/src')

import util, io_util, eval
from torch.utils.data import DataLoader, TensorDataset
from model import Encoder as Model

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
class Config:
    def __init__(self):
        self.mode = 'finetuning'
        self.task = 'BFS+ICT'
        self.claims_path = "/mnt/data/factcheck/CTK/dataset/v2.1/nfc" #"/mnt/data/factcheck/CTK/par5/ctk-data"
        self.articles_path = "/mnt/data/factcheck/CTK/par5/interim/ctk_filtered.db"
        self.articles_chunks_path = "/mnt/data/factcheck/ict_chunked_data/ids-chunks-288-finetuning-ctk_filtered.pkl"

#         self.claims_path = "/mnt/data/factcheck/fever/data-cs/fever-data"
#         self.claims_path = "/home/ryparmar/fever-cs-deepl"
#         self.articles_path = "/mnt/data/factcheck/fever/data-cs/fever/fever.db"
#         self.articles_chunks_path = '/mnt/data/factcheck/ict_chunked_data/ids-chunks-288-finetuning-fever.pkl'
        self.model_weight = "/home/ryparmar/trained_models/debug.w"
        self.bert_model = "bert-base-multilingual-cased"
        self.learning_rate = 5e-6
        self.max_seq = 288
        self.epoch = 1
        self.bs = 32
        self.test_bs = 64
        self.remove_percent = 0.9
        self.use_cuda = True if torch.cuda.is_available() else False
        self.devices = "0" if torch.cuda.is_available() else ""
        self.continue_training = "/home/ryparmar/trained_models/mbert_pre_ctk_10ep_ict+bfs_1e-5_288"
#         "/home/ryparmar/trained_models/mbert_wiki_pre_10ep-bfs_10ep-ict_1e-5_288"  #False
        self.logger = logging.getLogger(__name__)
    def add(self, name, val):
        if name == 'cls_token_id':
            self.cls_token_id = val
        if name == 'pad_token_id':
            self.pad_token_id = val
        if name == 'device':
            self.device = val
        
config = Config()

In [5]:
def optimizer_to(optim, device):
    for param in optim.state.values():
        # Not sure there are any global tensors in the state dict
        if isinstance(param, torch.Tensor):
            param.data = param.data.to(device)
            if param._grad is not None:
                param._grad.data = param._grad.data.to(device)
        elif isinstance(param, dict):
            for subparam in param.values():
                if isinstance(subparam, torch.Tensor):
                    subparam.data = subparam.data.to(device)
                    if subparam._grad is not None:
                        subparam._grad.data = subparam._grad.data.to(device)


def instantiate_model(config, tokenizer):
    configure_devices(config)
    model = Model(config)
    optimizer = transformers.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=0)
    metrics = None

    if config.continue_training:
        state_dict = torch.load(config.continue_training, map_location='cpu')
        model.load_state_dict(state_dict['model'])
        if 'optimizer_state_dict' in state_dict:
            optimizer.load_state_dict(state_dict['optimizer_state_dict'])
        
        try:
            print(f"Loaded model:\nEpochs: {state_dict['epoch']}\nLoss: {state_dict['loss']}\n", 
                  f"Recall: {state_dict['rec']}\nMRR: {state_dict['mrr']}")
        except:
            pass
        
    if config.use_cuda:
        model = model.cuda()
        optimizer_to(optimizer, config.device)
        model = torch.nn.DataParallel(model, device_ids=config.devices)
    return model, optimizer, metrics


def configure_devices(config):
    config.devices = [int(device) for device in range(torch.cuda.device_count())]
    config.device = config.devices[0] if config.use_cuda else "cpu"


def get_loader(data, batch_size):
    data = TensorDataset(data)
    return DataLoader(data,
                      batch_size=batch_size,
                      shuffle=True,
                      sampler=None, drop_last=True)

def ids2docs(ids, id2doc: dict):
    return [id2doc[int(i)] for i in ids]

In [6]:
tokenizer = transformers.BertTokenizerFast.from_pretrained("bert-base-multilingual-cased")

config.add('cls_token_id', tokenizer.encode(tokenizer.cls_token, add_special_tokens=False)[0])
config.add('pad_token_id', tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0])

In [7]:
model, optimizer, metrics = instantiate_model(config, tokenizer)
loss_fn = torch.nn.CrossEntropyLoss()

Loaded model:
Epochs: [0, 1, 2, 3]
Loss: [0.6182721752268022, 0.4800995283004084, 0.4311975429328025, 0.4015651965180047]
 Recall: [0.517402, 0.433843, 0.493099, 0.392289]
MRR: [0.29268, 0.217989, 0.262663, 0.177827]


In [8]:
metrics = eval.Metrics(metrics)

In [9]:
def tok(x):
    print(tokenizer.convert_ids_to_tokens(x))

# Main

In [10]:
doc_chunks = util.make_chunks(config.articles_path, tokenizer, config, save_chunks=True)
articles_ids = util.get_par_ids(doc_chunks) if 'CTK' in config.articles_path else list(doc_chunks.keys())

In [14]:
s = 0
for k,v in doc_chunks.items():
    s += len(v)

In [15]:
s

13619573

In [12]:
len(doc_chunks)

2507454

In [11]:
# doc_chunks['T201602040298002'].keys()

In [12]:
# tok(doc_chunks['T201602040298002'][7])

In [13]:
doc_chunks, chunks_mask = util.process_chunks(doc_chunks, config)
dev_chunks, dev_chunks_mask, dev_articles_ids = doc_chunks, chunks_mask, articles_ids

Padding chunks...: 100%|██████████| 2507454/2507454 [15:12<00:00, 2746.92it/s] 


In [14]:
# doc_chunks['T201602040298002'].keys()

In [15]:
# tok(doc_chunks['T201602040298002'][7])

In [16]:
claims_dev, evidence_dev, labels_dev = util.load_claims('dev', config)
claims_train, evidence_train, labels_train = util.load_claims('train', config)
claims_train, evidence_train, labels_train = util.remove_unverifiable_claims(claims_train,
                                                                            evidence_train,
                                                                            labels_train, config)
claims_dev, evidence_dev, labels_dev = util.remove_unverifiable_claims(claims_dev,
                                                                        evidence_dev,
                                                                        labels_dev, config)

claims_train, \
evidence_train, \
labels_train = util.remove_invalid_claims(claims_train, evidence_train, labels_train, articles_ids, config)

claims_dev, \
evidence_dev, \
labels_dev = util.remove_invalid_claims(claims_dev, evidence_dev, labels_dev, articles_ids, config)

claims_train, claims_train_mask = util.process_claims(claims_train, tokenizer, config, _pad_max=True)
claims_dev, claims_dev_mask = util.process_claims(claims_dev, tokenizer, config, _pad_max=True)

Validating evidences: 1it [00:00,  3.76it/s]

Loaded 300 claims from dev split.
Loaded 2124 claims from train split.


Validating evidences: 1763it [03:22,  8.71it/s]
Validating evidences: 218it [00:24,  8.77it/s]


In [17]:
# doc_chunks['Astronomie']

In [18]:
claims_train.shape

torch.Size([1712, 288])

# Evaluation Initial

In [19]:
id2doc = {i: doc_id for i, (doc_id, _) in enumerate(doc_chunks.items())} if isinstance(doc_chunks, dict) else []
    
loader = (get_loader(torch.tensor([i for i in range(len(claims_train))]), config.bs) if config.mode == 'finetuning'
          else get_loader(torch.tensor([i for i in range(len(doc_chunks))]), config.bs))

In [20]:
import random

In [21]:
def get_sample_keys(keys: list, sample=0.01):
        keys = list(dev_chunks.keys())
        return random.sample(keys, round(len(keys)*sample))

def get_subset(d: dict, keys: list):
    return {k: d[k] for k in keys}
    
sample_keys = get_sample_keys(list(dev_chunks.keys()))

In [22]:
eval_claim_embed, eval_doc_embed = eval.evaluation_preprocessing(claims_dev, claims_dev_mask, 
                                                                    get_subset(dev_chunks, sample_keys), 
                                                                    get_subset(dev_chunks_mask, sample_keys), 
                                                                    model, config)

Generating chunks embeddings...: 100%|██████████| 25075/25075 [00:00<00:00, 50592.66it/s]
Embedding given chunks...: 100%|██████████| 2125/2125 [11:33<00:00,  3.06it/s]
Embedding given chunks...: 100%|██████████| 4/4 [00:01<00:00,  3.74it/s]


In [23]:
# eval_claim_embed, eval_doc_embed = eval.evaluation_preprocessing(claims_dev, claims_dev_mask, 
#                                                                     dev_chunks_s, dev_chunks_mask_s, model, config)
# # del doc_chunks, chunks_mask

In [24]:
# eval_claim_embed, eval_doc_embed = np.load('eval_claim_embed_deepl.npy'), np.load('eval_doc_embed_deepl.npy')

In [25]:
# np.save('claim_embeddings_ctk', claim_embeddings)

In [26]:
precision, recall, f1, mrr = eval.retriever_score(eval_doc_embed, dev_articles_ids, eval_claim_embed, 
                                                evidence_dev, labels_dev, config, k=20)
print(f"F1: {f1}\tPrecision@{20}: {precision}\tRecall@{20}: {recall}\tMRR@{20}: {mrr}")

Calculating evaluation metrics: 208it [05:15,  1.51s/it]

F1: 0.0	Precision@20: 0.0	Recall@20: 0.0	MRR@20: 0.0





# Training

In [21]:
loader = ( get_loader(torch.tensor([i for i in range(len(claims_train))]), config.bs) 
               if config.mode == 'finetuning'
               else get_loader(torch.tensor([i for i in range(len(doc_chunks))]), config.bs))

id2doc = {i: doc_id for i, (doc_id, _) in enumerate(doc_chunks.items())} if isinstance(doc_chunks, dict) else []

In [22]:
def get_rand_chunk_id(chunks: list, ):
    """Returns a random chunk id from a list of chunks."""
    if len(chunks) > 0:
        rand_chunk_id = random.sample(range(len(chunks)), 1)
        return rand_chunk_id[0]
    else:
        print("ERROR -- EMPTY CHUNKS ON INPUT!")

def split_par_id(doc_id):
    tmp = doc_id.split('_')
    assert len(tmp) == 2
    return tmp[0], tmp[1] 
    
def bfs_finetuning_contexts(doc_chunks, doc_chunks_mask, evidence, is_ctk):
    docs_pad, docs_mask = [], []
    for ev_id in evidence:
        if is_ctk:
            doc_id, chunk_id = split_par_id(ev_id) 
        else:
            ev_id, 0
        print(doc_id, chunk_id)
        rand_chunk_id = get_rand_chunk_id(list(doc_chunks[doc_id].keys()))
        docs_pad.append(doc_chunks[doc_id][rand_chunk_id])
        docs_mask.append(doc_chunks_mask[doc_id][rand_chunk_id])
    return docs_pad, docs_mask


def ict_finetuning_contexts(doc_chunks, doc_chunks_mask, evidence, is_ctk):
    docs_pad, docs_mask = [], []
    for ev_id in evidence:
        if is_ctk:
            doc_id, chunk_id = split_par_id(ev_id) 
        else:
            ev_id, 0
        print(doc_id, chunk_id)
        docs_pad.append(doc_chunks[doc_id][int(chunk_id)])
        docs_mask.append(doc_chunks_mask[doc_id][int(chunk_id)])
    return docs_pad, docs_mask

def get_finetuning_batch(batch_ids, claims, claims_mask, evidence, doc_chunks, doc_chunks_mask, doc_ids, config):
    """
        Note: some negatives could be positives, as some claims have multiple supporting documents,
        but that's being ignored here for convenience.
    """
    claims_pad = [claims[i] for i in batch_ids]
    claims_mask = [claims_mask[i] for i in batch_ids]
    evidence = [evidence[i] for i in batch_ids]  # document ids
    # doc_batch_ids = [doc_ids[i] for i in batch_ids]

    # get documents
    # 1 evidence = 1 doc id, see remove_unverifiable_claims
    is_ctk = 'CTK' in config.articles_path
    if config.task.upper() == 'BFS':  # works only for CTK data
        docs_pad, docs_mask = bfs_finetuning_contexts(doc_chunks, doc_chunks_mask, evidence, is_ctk)
    elif config.task.upper() == 'ICT':
        docs_pad, docs_mask = ict_finetuning_contexts(doc_chunks, doc_chunks_mask, evidence, is_ctk)
    else:
        bfs = random.randint(0, 1)  # randomly choose between bfs / ict
        docs_pad, docs_mask = (bfs_finetuning_contexts(doc_chunks, doc_chunks_mask, evidence, is_ctk)
                               if bfs else 
                               ict_finetuning_contexts(doc_chunks, doc_chunks_mask, evidence, is_ctk))

    c_p = torch.stack(claims_pad, axis=0).to(torch.int64).to(config.device)
    c_m = torch.stack(claims_mask, axis=0).to(torch.int64).to(config.device)
    d_p = torch.stack(docs_pad, axis=0).to(torch.int64).to(config.device)
    d_m = torch.stack(docs_mask, axis=0).to(torch.int64).to(config.device)
    return c_p, c_m, d_p, d_m

In [32]:
batch = next(iter(loader))
batch = batch[0]
query, query_mask, \
context, context_mask = get_finetuning_batch(batch, claims_train, claims_train_mask, evidence_train,
                                                pad_doc_chunks, chunks_mask, articles_ids, config)
print(f"{query.shape} {context.shape}")
print(f"{query} {context}")

T200609190631901 2
20020512E01126 6
T201306190678401 3
20021121E03327 5
T201602020591003 1
20020510E05757 2
T200705160360901 2
20041130E05667 12
T201802060970201 7
20001120F02543 1
20051013F01195 6
T201308200032101 1
T200609190631901 1
20010601E01470 4
T200603070946601 2
T201608170388501 5
T201008200353401 1
20041215F00629 1
T201602170965801 4
20051004F01647 0
T201512260018201 8
T201512260018201 7
20021118E02122 1
20050128F00337 5
T200708090259701 4
T200801070228401 1
T201208100100702 8
T200603070946601 2
T200710230390401 6
20051107F00813 1
T200805090117001 5
T200710220515903 15
torch.Size([32, 288]) torch.Size([32, 288])
tensor([[   101,  23104,  10921,  ...,      0,      0,      0],
        [   101,    139,  85430,  ...,      0,      0,      0],
        [   101,    148,  16852,  ...,      0,      0,      0],
        ...,
        [   101,    152,  70673,  ...,      0,      0,      0],
        [   101,  35248, 104693,  ...,      0,      0,      0],
        [   101,  46361,  48832,  ...

In [23]:
epoch_num = -1
for epoch_num in range(config.epoch):
    model.train()
    batch_num = len(loader)
    num_training_examples, running_loss = 0, 0.0
    for batch in tqdm(loader, total=batch_num):
        optimizer.zero_grad()
        batch = batch[0]
        num_training_examples += batch.size(0)
        if config.mode == 'finetuning':
            query, query_mask, \
            context, context_mask = get_finetuning_batch(batch, claims_train, claims_train_mask, evidence_train,
                                                            doc_chunks, chunks_mask, articles_ids, config)
        else:
            query, query_mask, \
            context, context_mask = util.get_pretraining_batch(ids2docs(batch, id2doc), doc_chunks, 
                                                                        tokenizer, config)

        query_cls_out = model(x=query, x_mask=query_mask)
        context_cls_out = model(x=context, x_mask=context_mask)
        logit = torch.matmul(query_cls_out, context_cls_out.transpose(-2, -1))
        correct_class = torch.tensor([i for i in range(len(query))]).long().to(config.device)
        loss = loss_fn(logit, correct_class)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * batch.size(0)
        epoch_avg_loss = running_loss / num_training_examples

        print(f"epoch_avg_loss: {epoch_avg_loss}\nrunning_loss: {running_loss}")

  0%|          | 0/28 [00:00<?, ?it/s]

T201503290538903 1
T201306190678401 3
20051013F01195 6
20050128F00337 5
T201602040298002 7
20030130E01450 1
20020618F00388 1
20030307E00262 2
20030427E00825 4
T200805090117001 5
20010220F01423 4
20030612F02104 1
20051107F00813 1
T201609260734602 1
T200709190711502 7
T201210160213901 1
20050711F01819 5
20050601F01415 3
T201306130258101 1
T201003150613402 3
20011220E01142 4
T200803070230601 3
T200609190631901 1
T201406160291201 6
20030612F02104 2
20030427E00825 4
T201512260018201 1
T200805190455401 3
20010711F01495 1
T200705160360901 1
20060118F01474 2
T201306190678401 3


  4%|▎         | 1/28 [03:38<1:38:27, 218.79s/it]

epoch_avg_loss: 0.9218634366989136
running_loss: 29.499629974365234
T201510070563901 2
T200805190455401 1
T201710180270101 2
T200709190711502 9
T201210160213901 4
20040213E02401 5
20021121E03327 5
T201604210827502 1
20001120F02543 1
20020624F03306 2
T200603070906101 3
T201510070563901 2
20010311E01129 3
T201509140642301 4
T200609190631901 2
20040111E00954 1
T201004280441801 3
20040111E00954 1
T200710230390401 2
20000612F00401 5
20030611F01030 4
T201611231017401 4
20030427E00825 2
T200603070946601 5
T200603070946601 2
20020723F01050 7
20010101E00080 6
T201511090723301 1
20060113E00817 3
20020422F01825 1
20010510F01902 1
T201004210977301 10


  4%|▎         | 1/28 [07:19<3:17:54, 439.80s/it]


KeyboardInterrupt: 

# Evaluation

In [29]:
type(claims_dev)

torch.Tensor

In [36]:
list(dev_chunks.keys())[0]

'T201608150566901'

In [37]:
type(dev_chunks['T201608150566901'])

dict

In [45]:
embeddings, doc_ids, par_ids = [], [], []
chunks, masks = dev_chunks, chunks_mask
if isinstance(dev_chunks, dict) and isinstance(chunks_mask, dict):
    chunks, masks = [], []
    for doc_id, doc in tqdm(dev_chunks.items(), desc='Generating chunks embeddings...'):
        doc_ids.append(doc_id)
        if isinstance(doc, list):
            chunks.append(util.flatten_list(doc))
            masks.append(util.flatten_list(chunks_mask[doc_id]))
        elif isinstance(doc, dict):
            for par_id, par in doc.items():
                par_ids.append(par_id)
                if isinstance(par, list):
                    chunks.append(util.flatten_list(par))
                    masks.append(util.flatten_list(chunks_mask[doc_id][par_id]))
                else:
                    chunks.append(torch.flatten(par))
                    masks.append(torch.flatten(chunks_mask[doc_id][par_id]))
        else:
            chunks.append(torch.flatten(doc))
            masks.append(torch.flatten(chunks_mask[doc_id]))
elif isinstance(dev_chunks, list) and isinstance(masks, list):
    chunks = torch.stack(chunks, axis=0)
    masks = torch.stack(masks, axis=0)

# embeddings = encode_chunks(chunks, masks, model, batch_size)

Generating chunks embeddings...: 100%|██████████| 2507454/2507454 [00:39<00:00, 62906.39it/s]


In [58]:
chunks = torch.stack(chunks, axis=0)
masks = torch.stack(masks, axis=0)
    

In [59]:
type(chunks)#[0]

torch.Tensor

In [60]:
chunks.shape

torch.Size([13619573, 288])

In [65]:
claim_embeddings = eval.encode(claims_dev, claims_dev_mask, model, batch_size=config.test_bs)

Embedding given chunks...: 100%|██████████| 3/3 [02:27<00:00, 49.20s/it]


In [61]:
document_embeddings = eval.encode_chunks(chunks, masks, model, 32)

Embedding given chunks...:   0%|          | 2/425612 [01:14<4418:45:07, 37.38s/it]


KeyboardInterrupt: 

In [40]:
torch.flatten(dev_chunks['T201608150566901'][0])

tensor([  101, 85566, 91931, 20819,   190, 10730, 33302, 33705, 14590, 10419,
        10132, 79420, 81592, 10147,   280, 89424, 10126, 21484, 10545, 20639,
        11163, 38320,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [34]:
# Evaluation
eval_claim_embed, eval_doc_embed = eval.evaluation_preprocessing(claims_dev, claims_dev_mask, 
                                                                dev_chunks, dev_chunks_mask, model, config)

precision, recall, f1, mrr = eval.retriever_score(eval_doc_embed, dev_articles_ids, eval_claim_embed, 
                                                evidence_dev, labels_dev, config, k=20)
print(f"F1: {f1}\tPrecision@{20}: {precision}\tRecall@{20}: {recall}\tMRR@{20}: {mrr}")

Generating chunks embeddings...:   0%|          | 0/2507454 [00:00<?, ?it/s]


TypeError: flatten(): argument 'input' (position 1) must be Tensor, not dict

In [None]:
# print(type(sdev_ch), type(sdev_m))

In [None]:
# eval_claim_embeddings, eval_document_embeddings = eval.evaluation_preprocessing(claims_dev, claims_dev_mask, 
#                                                                                 sdev_ch, sdev_m, model, config)

In [None]:
# print(eval_claim_embeddings.shape, eval_document_embeddings.shape)

In [None]:
# kk = 20
# precision, recall, f1, mrr = eval.retriever_score(eval_document_embeddings,dev_articles_ids, eval_claim_embeddings, 
#                                             evidence_dev, labels_dev, config, k=kk)
# print(f"F1: {f1}\tPrecision@{kk}: {precision}\tRecall@{kk}: {recall}\tMRR@{kk}: {mrr}")

# Paragraph len distribution

In [None]:
# path = "/mnt/data/factcheck/fever/data-cs/fever/fever.db"
path = "/mnt/data/factcheck/CTK/par5/interim/ctk_filtered.db"

In [None]:
# process abstract wiki
pars, par_ids = io_util.load_db(path)  # returns paragraphs and paragraph ids
docs = util.transform_fever_wiki(pars, par_ids)  # group paragraphs of articles
docs_tokenized = util.tokenize_documents(docs, tokenizer)

In [None]:
# process full len wiki
wiki_json = io_util.load_json(path)
docs = transform_wiki(wiki_json)
docs_tokenized = tokenize_documents(docs, tokenizer)

In [None]:
# process ctk
pars, par_ids = io_util.load_db(path)  # returns paragraphs and paragraph ids
pars, par_ids = util.remove_invalid_pars(pars, par_ids)
docs = util.transform_ctk(pars, par_ids)  # group paragraphs for each article
# generate at once
docs_tokenized = util.tokenize_documents(docs, tokenizer)

In [None]:
for doc_id, doc in docs_tokenized.items():
    for par_id, par in doc.items():
#         print(par)
        for sent in par:
            print(len(sent), sent)
        print(sum([len(sent) for sent in par]))
        break
    break

In [None]:
lens = [sum([len(sent) for sent in par]) for doc_id, doc in docs_tokenized.items() for par_id, par in doc.items()]

In [None]:
len(lens)

In [None]:
lens_doc = [sum([len(sent) for sent in par for _,par in doc.items()]) for doc_id, doc in docs_tokenized.items()]

In [None]:
len(docs_tokenized)

In [None]:
len(lens_doc)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import seaborn as sns

sns.set_style("white")


plt.figure(figsize=(8,6))
plt.xlabel('# tokens', fontsize=12)
plt.title(f'Distribution by a number of tokens (subwords) in paragraph\nCTK')
plt.hist(
    lens, 25,
    histtype='bar',
    facecolor='black',
    alpha=0.5,
    range=(0, 600),)

plt.savefig("ctk-par-len-dist-600.svg", format="svg")

# plt.legend()
# plt.show()