In [1]:
from nlp import load_dataset
dataset = load_dataset('wikipedia', '20200501.en')
dataset = dataset["train"]

In [2]:
import logging
import math
import os
from dataclasses import dataclass, field
from typing import Optional

from transformers import (
    CONFIG_MAPPING,
    MODEL_WITH_LM_HEAD_MAPPING,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    HfArgumentParser,
    LineByLineTextDataset,
    PreTrainedTokenizer,
    TextDataset,
    Trainer,
    TrainingArguments,
    set_seed,
)

In [3]:
config = AutoConfig.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelWithLMHead.from_pretrained("gpt2")

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [5]:
dataset[3]

{'title': 'Kalitta',
 'text': 'Kalitta may refer to:\n\nConnie Kalitta (born 1938), a retired American drag racer and CEO of the eponymous Kallita Air.\nDoug Kalitta (born 1964), an American drag racer, nephew of Connie Kalitta and owner of Kalitta Charters.\nScott Kalitta (1962-2008), an American drag racer and son of Connie Kalitta.\nKalitta Air, a cargo airline flying Boeing 747 aircraft.\nKalitta Charters, a cargo airline flying medium-sized aircraft.'}

In [6]:
query = "Salesforce is a tech company with headquarter in California"

In [7]:
# from tqdm import tqdm
# data_collections = []
# for data in tqdm(dataset):
#     if "salesforce" in data["text"].lower():
#         data_collections.append(data)

In [8]:
import torch
# torch.save(data_collections, "/export/home/Experiments/20200701/data_collections.tmp")
data_collections = torch.load("/export/home/Experiments/20200701/data_collections.tmp")

In [9]:
len(data_collections)

686

In [10]:
# Why did it fail when feeding text directly?
# dataset = dataset.map(lambda entry, index: tokenize_to_features(entry, index), with_indices=True)

In [11]:
from typing import List, Dict
block_size = tokenizer.max_len - tokenizer.num_special_tokens_to_add(pair=False)


class LazyTextDataset(torch.utils.data.dataset.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i) -> torch.Tensor:
        instance = self.dataset[i]
        example = tokenize_to_features(instance)["examples"]
        return torch.tensor(example, dtype=torch.long)



def tokenize_to_features(entry: Dict[str, str]) -> List[int]:
    examples = []
    tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(entry["text"]))
    # Truncate in block of block_size
    for i in range(0, len(tokenized_text) - block_size + 1, block_size):
        examples.append(
            tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
        )
    
    try:
        remaining_text = tokenized_text[i + block_size:]
    except UnboundLocalError:
        if len(tokenized_text) >= block_size:
            raise ValueError
        remaining_text = tokenized_text

    num_tokens_to_pad = block_size - len(remaining_text)
    remaining_padded_text = remaining_text + [
        tokenizer.eos_token_id for _ in range(num_tokens_to_pad)]
    examples.append(remaining_padded_text)

    # print(len(tokenized_text[i + block_size: ]))
    return {"examples": examples, "text": entry["text"], "title": entry["title"]}


def create_data_loader(dataset, batch_size, collate_fn):
    batch = []
    cumulative_batch_size = 0
    for i in range(len(dataset)):
        batch.extend([
            x.squeeze(dim=0)
            for x in dataset[i].split(1, dim=0)
        ])
        cumulative_batch_size += dataset[i].shape[0]
        if cumulative_batch_size >= batch_size:
            if batch_size == 1:
                for sub_batch in batch:
                    yield collate_fn([sub_batch])
            else:
                yield collate_fn(batch)
            batch = []
            cumulative_batch_size = 0
            
            
def clip_gradient_norm_(gradients, max_norm: float = 1.0):
    if max_norm is None:
        return gradients

    total_norm = torch.norm(torch.stack(
        [torch.norm(grad, 2) for grad in gradients]), 2)
    clip_coef = max_norm / (total_norm + 1e-6)

    for grad in gradients:
        grad.detach().mul_(clip_coef)

In [12]:
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler, Sampler, SequentialSampler
tokenizer._pad_token = tokenizer.eos_token
wrapped_dataset = LazyTextDataset(dataset)
sampler = SequentialSampler(wrapped_dataset)
wrapped_relevant_data = LazyTextDataset(data_collections)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False)
relevant_data_loader = create_data_loader(
    wrapped_relevant_data,
    batch_size=1,
    collate_fn=data_collator)
# data_loader = create_data_loader(
#     wrapped_dataset,
#     batch_size=32,
#     collate_fn=data_collator)
# torch.tensor(tokenize_to_features(wrapped_dataset.dataset[133353])["examples"]).shape

In [13]:
test_input = tokenize_to_features({
    "text": query,
    "title": None})

test_input = data_collator([torch.tensor(test_input["examples"], dtype=torch.long)])
test_input["input_ids"] = test_input["input_ids"].view(-1, 1024)
test_input["labels"] = test_input["labels"].view(-1, 1024)

In [14]:
test_input

{'input_ids': tensor([[44490,  3174,   318,  ..., 50256, 50256, 50256]]),
 'labels': tensor([[44490,  3174,   318,  ...,  -100,  -100,  -100]])}

In [15]:
model.cuda();

In [16]:
from influence_utils.influence import *

In [17]:
# influences = compute_influences(
#     n_gpu=1,
#     device=torch.device("cuda"),
#     model=model,
#     test_inputs=test_input,
#     batch_train_data_loader=create_data_loader(
#         wrapped_dataset,
#         batch_size=1,
#         collate_fn=data_collator),
#     instance_train_data_loader=create_data_loader(
#         wrapped_dataset,
#         batch_size=1,
#         collate_fn=data_collator)
# )

In [18]:
params_filter = None
weight_decay_ignores = None
weight_decay = None
weight_decay_ignores = None
if params_filter is None:
    params_filter = [
        "bert.pooler.dense.weight",
        "bert.pooler.dense.bias"]

if weight_decay_ignores is None:
    weight_decay_ignores = [
        "bias",
        "LayerNorm.weight"]

data_loader = create_data_loader(
        wrapped_relevant_data,
        batch_size=1,
        collate_fn=data_collator)
# cache_file = os.path.join(cache_dir, "s_test.cached.pkl")
# if not os.path.exists(cache_file):
s_test = compute_s_test(
    n_gpu=1,
    device=torch.device("cuda"),
    model=model,
    test_inputs=test_input,
    train_data_loaders=[data_loader],
    params_filter=params_filter,
    weight_decay=weight_decay,
    weight_decay_ignores=weight_decay_ignores,
    scale=10000,
    num_samples=None)

62.77 | 0.00: : 1581it [10:14,  2.57it/s]


In [19]:
torch.save(s_test, "/export/home/Experiments/20200701/s_test.relevant.tmp")
# s_test = torch.load("/export/home/Experiments/20200701/s_test.relevant.tmp")

In [20]:
influences = []
train_inputs_collections = []
for train_inputs in tqdm(relevant_data_loader):
    grad_z = compute_gradients(
        n_gpu=1,
        device=torch.device("cuda"),
        model=model,
        inputs=train_inputs,
        params_filter=params_filter,
        weight_decay=weight_decay,
        weight_decay_ignores=weight_decay_ignores)
    
    # experimental_clip_gradient_norm_(grad_z, max_norm=1.0)
    with torch.no_grad():
        raise ValueError("Negative sign probably missing?")
        influence = [
            torch.sum(x * y)
            for x, y in zip(grad_z, s_test)]

    influences.append(sum(influence).item())
    train_inputs_collections.append(train_inputs)

1581it [03:18,  7.96it/s]


In [21]:
import numpy as np

In [22]:
np.argsort(influences)[-10:]

array([ 600,  215, 1376, 1100,   67,  132,  673,   62,  848, 1501])

In [25]:
print(tokenizer.decode(train_inputs_collections[848]["input_ids"].cpu().detach().numpy().squeeze()))

An application service provider (ASP) is a business providing computer-based services to customers over a network; such as access to a particular software application (such as customer relationship management) using a standard protocol (such as HTTP).

The need for ASPs has evolved from the increasing costs of specialized software that have far exceeded the price range of small to medium-sized businesses. As well, the growing complexities of software have led to huge costs in distributing the software to end-users. Through ASPs, the complexities and costs of such software can be cut down. In addition, the issues of upgrading have been eliminated from the end-firm by placing the onus on the ASP to maintain up-to-date services, 24 x 7 technical support, physical and electronic security and in-built support for business continuity and flexible working.

The importance of this marketplace is reflected by its size., estimates of the United States market ranged from 1.5 to 4 billion dollars.

In [None]:
Salesforce is a tech company with headquarter in California

In [61]:
ds = []
for i, d in enumerate(data_collections):
    if ("salesforce" in d["text"].lower() and "california" in d["text"].lower() and "headquarter" in d["text"].lower() and "tech" in d["text"].lower()
        and "company" in d["text"].lower()):
        ds.append((i, d))
    
    if "salesforce" in d["title"].lower():
        print(i)

79
137
315
317
398
403
595


In [70]:
np.argsort(influences)[403]

352

In [67]:
[i for i in range(len(ds)) if "salesforce" in ds[i][1]["title"].lower()]

[49, 50]