In [18]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import BertModel, BertTokenizer
from tqdm import tqdm
import scipy.stats

# --- Configuration -----------------------------------------------------------
# CSV files read by load_dataset(); the loader reads their 'text' column.
# NOTE(review): the saved output of this cell reports loading from
# '../data/mini_train_data.csv', which does not match TRAIN_PATH below -- the
# recorded output appears to predate this value; re-run the cell to refresh it.
TRAIN_PATH = '../data/Patent14K/train.csv'
TEST_PATH = '../data/Patent14K/test.csv' # is not used

# Model identifier handed to BertTokenizer/BertModel.from_pretrained().
MODEL_NAME = "anferico/bert-for-patents"

# Pooling strategy used by sents_to_vecs(); exactly one line should be active.
POOLING = 'first_last_avg'
# POOLING = 'last_avg'
# POOLING = 'last2avg'

# When True, the driver code computes a whitening kernel/bias from the training vectors.
USE_WHITENING = True
# Number of kernel columns kept by compute_kernel_bias() (output dimensionality).
N_COMPONENTS = 384
# Tokenizer truncation length (passed as max_length).
MAX_LENGTH = 512

# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def load_dataset(path):
    """Read the CSV file at ``path`` and return its ``text`` column as a list.

    The first three characters of every entry are dropped before the entry
    is added to the returned list.
    """
    frame = pd.read_csv(path)
    return [text[3:] for text in frame['text']]

def build_model(name):
    """Load the pretrained tokenizer and BERT encoder identified by ``name``.

    The encoder is moved to ``DEVICE`` before being returned.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(name)
    encoder = BertModel.from_pretrained(name).to(DEVICE)
    return bert_tokenizer, encoder


def sents_to_vecs(sents, tokenizer, model):
    """Encode every sentence into a single pooled vector.

    Each sentence is tokenized and run through ``model`` on its own (one
    forward pass per sentence, gradients disabled). The token dimension is
    then mean-pooled according to the module-level ``POOLING`` setting:

    * ``'first_last_avg'`` -- ``hidden_states[-1] + hidden_states[1]``
    * ``'last_avg'``       -- ``hidden_states[-1]``
    * ``'last2avg'``       -- ``hidden_states[-1] + hidden_states[-2]``

    Note that the two-layer variants sum the layers (they are not halved)
    before averaging over tokens.

    Args:
        sents: Sized iterable of sentences. Each element is presumably a
            single string; only row 0 of the model output is kept per element.
        tokenizer: Callable returning a mapping of tensors (``return_tensors="pt"``).
        model: Callable returning an object with a ``hidden_states`` sequence.

    Returns:
        ``np.ndarray`` with one row per sentence.

    Raises:
        Exception: if ``POOLING`` is not one of the names listed above.
    """
    vecs = []
    with torch.no_grad():
        for sent in tqdm(sents):
            encoded = tokenizer(sent, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH)
            # Move every tensor the tokenizer produced to the target device.
            # Iterating over the returned keys (instead of naming
            # 'token_type_ids' explicitly) avoids a KeyError with tokenizers
            # that do not emit that field.
            inputs = {key: tensor.to(DEVICE) for key, tensor in encoded.items()}

            hidden_states = model(**inputs, return_dict=True, output_hidden_states=True).hidden_states

            if POOLING == 'first_last_avg':
                output_hidden_state = (hidden_states[-1] + hidden_states[1]).mean(dim=1)
            elif POOLING == 'last_avg':
                output_hidden_state = (hidden_states[-1]).mean(dim=1)
            elif POOLING == 'last2avg':
                output_hidden_state = (hidden_states[-1] + hidden_states[-2]).mean(dim=1)
            else:
                raise Exception("unknown pooling {}".format(POOLING))

            # Batch size is 1 here, so keep only the first (and only) row.
            vecs.append(output_hidden_state.cpu().numpy()[0])
    assert len(sents) == len(vecs)
    return np.array(vecs)


def calc_spearmanr_corr(x, y):
    """Return the Spearman rank-correlation coefficient between ``x`` and ``y``."""
    result = scipy.stats.spearmanr(x, y)
    return result.correlation


def compute_kernel_bias(vecs, n_components):
    """Compute the whitening kernel and bias.

    The final transform is ``y = (x + bias).dot(kernel)``.

    Args:
        vecs: Sequence of 2-D arrays (rows are samples); they are concatenated
            along axis 0 before the statistics are computed.
        n_components: Number of leading components (kernel columns) to keep.
            ``None`` keeps all of them.

    Returns:
        ``(kernel, bias)`` where ``kernel`` has shape ``(dim, n_components)``
        and ``bias`` is the negated mean with shape ``(1, dim)``.
    """
    vecs = np.concatenate(vecs, axis=0)
    mu = vecs.mean(axis=0, keepdims=True)
    cov = np.cov(vecs.T)
    u, s, _ = np.linalg.svd(cov)
    # The whitening matrix is U @ diag(1 / sqrt(s)). Because U is orthogonal,
    # this equals inv((U @ diag(sqrt(s))).T), but scaling the columns directly
    # avoids an explicit matrix inversion, and truncating first means tiny or
    # zero trailing singular values (rank-deficient covariance) never enter
    # the computation as long as n_components does not exceed the rank.
    W = u[:, :n_components] / np.sqrt(s[:n_components])
    return W, -mu


def transform_and_normalize(vecs, kernel, bias):
    """Apply the affine transform, then scale every row to unit L2 norm.

    The transform ``(vecs + bias).dot(kernel)`` is applied only when both
    ``kernel`` and ``bias`` are given; if either is ``None`` the vectors are
    normalized as they are.
    """
    has_transform = kernel is not None and bias is not None
    projected = (vecs + bias).dot(kernel) if has_transform else vecs
    row_norms = (projected ** 2).sum(axis=1, keepdims=True) ** 0.5
    return projected / row_norms


def normalize(vecs):
    """Scale every row of ``vecs`` to unit L2 norm."""
    squared_sums = (vecs ** 2).sum(axis=1, keepdims=True)
    row_norms = squared_sums ** 0.5
    return vecs / row_norms


print(f"Configs: {MODEL_NAME}-{POOLING}-{USE_WHITENING}-{N_COMPONENTS}.")

# Load the training sentences used to estimate the whitening statistics.
a_sents_train = load_dataset(TRAIN_PATH)
print("Loading {} training samples from {}".format(len(a_sents_train), TRAIN_PATH))


# Instantiate the tokenizer and encoder.
tokenizer, model = build_model(MODEL_NAME)
print("Building {} tokenizer and model successfuly.".format(MODEL_NAME))

print("Transfer sentences to BERT vectors.")

# Default to "no whitening" so that `kernel` and `bias` are always defined.
# transform_and_normalize() treats None as "skip the transform"; without this
# default, code that uses these names raises NameError when USE_WHITENING is False.
kernel, bias = None, None

if USE_WHITENING:
    # Encode the training sentences (one forward pass per sentence).
    a_vecs_train = sents_to_vecs(a_sents_train, tokenizer, model)

    print("Compute kernel and bias.")
    kernel, bias = compute_kernel_bias([
        a_vecs_train
    ], n_components=N_COMPONENTS)



#     print("Results:")
#     test_sims = (a_vecs_test * b_vecs_test).sum(axis=1)
#     print(u'Spearmanr corr in Testing set：%s' % calc_spearmanr_corr(scores_test, test_sims))



Configs: anferico/bert-for-patents-first_last_avg-True-384.
Loading 14000 training samples from ../data/mini_train_data.csv


Some weights of the model checkpoint at anferico/bert-for-patents were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Building anferico/bert-for-patents tokenizer and model successfuly.
Transfer sentences to BERT vectors.


100%|██████████| 14000/14000 [05:55<00:00, 39.39it/s]


Compute kernel and bias.


In [20]:
# Sanity check: shapes of the whitening kernel and bias computed by the cell above.
kernel.shape,bias.shape

((1024, 384), (1, 1024))

In [22]:
# Smoke test: apply y = (x + bias).dot(kernel) to a dummy batch of 4 vectors
# of width 1024 and inspect the resulting shape.
(np.ones((4,1024))+bias).dot(kernel).shape

(4, 384)