## Paper [SPECTER](https://github.com/allenai/specter) vector

### Papers that still need vectors

In [None]:
import pandas as pd
import simplejson as json

def write_todo():
    """Write the title/abstract records of papers that have no vector yet.

    Reads the ``MAGPaperID`` column of ``dataset/authorship.csv`` and the
    ``ids`` column of ``results/paper_specter.parquet`` (both compared as
    strings), then copies every line of ``results/mag_title_abstract.json``
    whose paper id is in the first set but not in the second to
    ``specter_todo.json``. Each input line must be a JSON array of exactly
    three items ``[pubid, title, abstract]``.

    The shape of both tables and the number of missing ids are printed.
    """
    # ``read_csv`` selects columns with ``usecols``; it has no ``columns``
    # keyword (that spelling belongs to ``read_parquet``) and raises TypeError.
    authorship_df = pd.read_csv('dataset/authorship.csv', dtype=str, usecols=['MAGPaperID'])
    print(authorship_df.shape)
    mag_paper_specter = pd.read_parquet('results/paper_specter.parquet', columns=['ids'])
    mag_paper_specter['ids'] = mag_paper_specter.ids.astype(str)
    print(mag_paper_specter.shape)
    todo = set(authorship_df['MAGPaperID']) - set(mag_paper_specter['ids'])
    print(len(todo))
    # Context managers close both files even when a line fails to parse.
    with open('results/mag_title_abstract.json') as fin, open('specter_todo.json', 'w') as fout:
        for line in fin:
            pubid, _title, _abstract = json.loads(line)
            # NOTE(review): ``todo`` holds strings, so this membership test
            # assumes the ids in the JSON file are strings too -- confirm.
            if pubid in todo:
                fout.write(line)

# Run the selection step; it writes specter_todo.json in the working directory.
write_todo()

In [None]:
import os
import pickle
import simplejson as json

from transformers import AutoTokenizer, AutoModel

def embed_batch(tokenizer, model, pickler, pubids, batch):
    """Embed one batch of texts and dump a ``[pubid, vector]`` record per text.

    Args:
        tokenizer: Callable invoked as ``tokenizer(batch, padding=True,
            truncation=True, return_tensors='pt', max_length=512)``; its
            result is unpacked as keyword arguments for *model*.
        model: Callable whose result exposes ``last_hidden_state``, a tensor
            indexed here as ``[:, 0, :]``.
        pickler: Object with a ``dump`` method (a ``pickle.Pickler``) that
            receives one record per text.
        pubids: Paper ids, parallel to *batch*.
        batch: Texts to embed.
    """
    # Imported here so the function is self-contained; the notebook cell does
    # not import torch itself.
    import torch

    inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Inference only: skip autograd bookkeeping so no graph is built and
    # activations are freed right away.
    with torch.no_grad():
        result = model(**inputs)
    # Keep the hidden state at position 0 of every sequence (presumably the
    # [CLS] token used as the document embedding -- confirm against the model
    # card) and move it to host memory as a NumPy array.
    vec = result.last_hidden_state[:, 0, :].detach().cpu().numpy()
    for pubid, row in zip(pubids, vec):
        pickler.dump([pubid, row])

def embed_chunk(out_path, already):
    """Embed every paper in ``specter_todo.json`` that is not in *already*.

    Each line of ``specter_todo.json`` must be a JSON array
    ``[pubid, title, abstract]``. Title and abstract are joined with a single
    space, embedded in batches of 8 with the ``allenai/specter`` model, and
    appended to *out_path* as pickled ``[pubid, vector]`` records.

    Args:
        out_path: Pickle file to append to (opened in ``'ab'`` mode).
        already: Container of paper ids to skip.

    Returns:
        The number of papers embedded by this call.
    """
    batch_size = 8
    # Must be a multiple of ``batch_size``: the counter only ever takes
    # multiples of ``batch_size`` inside the loop.
    report_every = 1000
    tokenizer = AutoTokenizer.from_pretrained('allenai/specter')
    model = AutoModel.from_pretrained('allenai/specter')
    pubids, batch = [], []
    cnt = 0
    # Context managers guarantee the appended records are flushed and both
    # handles are released, also when embedding fails part-way through.
    with open(out_path, 'ab') as fout, open('specter_todo.json') as fin:
        pickler = pickle.Pickler(fout)
        for line in fin:
            pubid, title, abstract = json.loads(line)
            if pubid in already:
                continue
            pubids.append(pubid)
            # A JSON ``null`` title/abstract becomes an empty string instead
            # of crashing the concatenation.
            batch.append((title or '') + ' ' + (abstract or ''))
            if len(pubids) == batch_size:
                embed_batch(tokenizer, model, pickler, pubids, batch)
                pubids, batch = [], []
                cnt += batch_size
                # Report only right after the counter changed; the carriage
                # return keeps the progress on a single output line.
                if cnt % report_every == 0:
                    print(cnt, end='')
                    print('\r', end='')
        # Flush the final, possibly shorter, batch.
        if pubids:
            embed_batch(tokenizer, model, pickler, pubids, batch)
            cnt += len(pubids)
    return cnt

def embed_all():
    """Embed all pending papers, resuming from a previous partial run.

    If ``results/paper_specter_2.pkl`` already exists, the first item of every
    pickled record in it is collected so those papers are skipped. The number
    of ids found is printed, then ``embed_chunk`` appends the remaining
    papers to the same file.
    """
    out_path = 'results/paper_specter_2.pkl'
    already = set()
    if os.path.exists(out_path):
        # ``with`` releases the handle even if a record fails to unpickle.
        with open(out_path, 'rb') as fin:
            unpickler = pickle.Unpickler(fin)
            while True:
                try:
                    data = unpickler.load()
                except EOFError:
                    # End of the record stream.
                    break
                already.add(data[0])
    print(len(already))
    embed_chunk(out_path, already)

# Run the embedding step; records are appended to results/paper_specter_2.pkl.
embed_all()

### Split the files

In [None]:
import numpy as np
import os
import pandas as pd
import pickle

from os.path import isfile, join

In [None]:
def load_paper_specter():
    """Load every paper-vector file found in ``results/`` into one DataFrame.

    Regular files whose name contains ``paper_specter`` are read in sorted
    name order:

    * ``.parquet`` files are read as-is, with the ``ids`` column cast to str;
    * ``.pkl`` files are read as a stream of pickled ``(pub_id, vector)``
      records and turned into ``embedding`` / ``ids`` columns.

    Files with any other extension are ignored.

    Returns:
        The concatenation of all loaded tables with a fresh integer index.

    Raises:
        ValueError: From ``pd.concat`` when no matching file was loaded.
    """
    results = []
    # ``os.listdir`` returns names in arbitrary order; sorting keeps the row
    # order of the result reproducible between runs and machines.
    names = sorted(
        f for f in os.listdir('results/')
        if isfile(join('results/', f)) and 'paper_specter' in f
    )
    for name in names:
        print(name)
        in_path = join('results/', name)
        if in_path.endswith('.parquet'):
            df = pd.read_parquet(in_path)
            df['ids'] = df.ids.astype(str)
            results.append(df)
        elif in_path.endswith('.pkl'):
            rows = []
            # ``with`` closes the file even if a record fails to unpickle.
            with open(in_path, 'rb') as fin:
                unpickler = pickle.Unpickler(fin)
                while True:
                    try:
                        pub_id, vec = unpickler.load()
                    except EOFError:
                        # End of the record stream.
                        break
                    rows.append([vec, pub_id])
            results.append(pd.DataFrame(rows, columns=['embedding', 'ids']))
    return pd.concat(results, ignore_index=True)

In [None]:
def split_paper_specter():
    """Split the paper vectors into ten pickle files under ``dataset/``.

    Loads all vectors with ``load_paper_specter``, cuts the table into ten
    contiguous chunks of near-equal size and writes chunk ``i`` to
    ``dataset/paper_specter_<i>.pkl`` as a stream of pickled
    ``[pub_id, vector]`` records. Only papers whose id appears in the
    ``MAGPaperID`` column of ``dataset/authorship.csv`` are written.

    The table shape, the number of distinct authorship paper ids and the
    number of records written are printed.
    """
    mag_paper_specter = load_paper_specter()
    print(mag_paper_specter.shape)
    # Only one column is needed, so do not parse the rest of the CSV.
    mag_pub_ids = set(
        pd.read_csv('dataset/authorship.csv', dtype=str, usecols=['MAGPaperID'])['MAGPaperID'])
    print(len(mag_pub_ids))
    # Slice by position instead of calling ``np.array_split`` on the
    # DataFrame: that call goes through the deprecated ``DataFrame.swapaxes``.
    # The chunk sizes are the same ones ``array_split`` produces: the first
    # ``extra`` chunks hold one row more than the others.
    num_files = 10
    base, extra = divmod(len(mag_paper_specter), num_files)
    cnt = 0
    start = 0
    for idx in range(num_files):
        stop = start + base + (1 if idx < extra else 0)
        df_i = mag_paper_specter.iloc[start:stop]
        start = stop
        with open('dataset/paper_specter_%d.pkl' % idx, 'wb') as fout:
            pickler = pickle.Pickler(fout)
            for pub_id, vec in zip(df_i['ids'], df_i['embedding']):
                if pub_id in mag_pub_ids:
                    pickler.dump([pub_id, vec])
                    cnt += 1
    print(cnt) # 16942415

# Run the split step; it writes dataset/paper_specter_0.pkl .. paper_specter_9.pkl.
split_paper_specter()

## Author SPECTER vector

In [None]:
import numpy as np
import os
import pandas as pd
import pickle

from os.path import isfile, join

def load_paper_specter():
    """Load the split paper-vector pickles from ``dataset/`` into a DataFrame.

    Every regular file in ``dataset/`` whose name contains
    ``paper_specter_`` is read, in sorted name order, as a stream of pickled
    two-item records.

    Returns:
        A DataFrame with one row per record and the columns ``MAGPaperID``
        and ``SpecterVector``; it has no rows when no file matches.
    """
    rows = []
    names = sorted(
        f for f in os.listdir('dataset/')
        if isfile(join('dataset/', f)) and 'paper_specter_' in f
    )
    for name in names:
        print(name)
        # ``with`` closes the file even if a record fails to unpickle.
        with open(join('dataset/', name), 'rb') as fin:
            unpickler = pickle.Unpickler(fin)
            while True:
                try:
                    rows.append(unpickler.load())
                except EOFError:
                    # End of the record stream.
                    break
    return pd.DataFrame(rows, columns=['MAGPaperID', 'SpecterVector'])

def write_author_vector_specter():
    """Write one averaged paper vector per author.

    Loads the paper vectors with ``load_paper_specter``, groups the
    ``MAGPaperID`` values of ``dataset/authorship.csv`` by ``PID`` and, for
    every PID in ascending numeric order, dumps ``[pid, vector]`` to
    ``dataset/researcher_specter.pkl``, where ``vector`` is the element-wise
    mean of the vectors of that PID's papers.

    The table shape, the number of distinct paper ids, the number of PIDs and
    the number of records written are printed.

    Raises:
        KeyError: If an authorship paper id has no loaded vector.
        ValueError: If a PID cannot be converted with ``int``.
    """
    mag_paper_specter = load_paper_specter()
    print(mag_paper_specter.shape)
    # When an id occurs more than once, the last row wins.
    pubid_to_idx = {p: i for i, p in enumerate(mag_paper_specter.MAGPaperID)}
    print(len(pubid_to_idx))
    # Only the two columns used below are parsed.
    pid_to_mag_pubids = pd.read_csv(
        'dataset/authorship.csv', dtype=str, usecols=['PID', 'MAGPaperID']
    ).groupby('PID')['MAGPaperID'].apply(set).to_dict()
    print(len(pid_to_mag_pubids))
    # Pull the vector column out once; indexing the array per author avoids
    # building a temporary DataFrame with ``iloc`` on every iteration.
    vectors = mag_paper_specter.SpecterVector.values
    cnt = 0
    with open('dataset/researcher_specter.pkl', 'wb') as fout:
        pickler = pickle.Pickler(fout)
        for pid in sorted(pid_to_mag_pubids, key=int):
            # Sorted row positions fix the summation order of the mean.
            indices = sorted(pubid_to_idx[p] for p in pid_to_mag_pubids[pid])
            vec = np.vstack(vectors[indices]).mean(axis=0)
            pickler.dump([pid, vec])
            cnt += 1
    print(cnt) # 494455

# Run the aggregation step; it writes dataset/researcher_specter.pkl.
write_author_vector_specter()