## Extract title and abstract

In [1]:
import pandas as pd
import simplejson as json
import sqlite3

from os.path import join

# Directory holding the per-document-type SQLite files (paper_title_abs_<type>.db).
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
MAG_DIR = '/home/qke100/ke-data/dataset-MAG/'

In [2]:
# Read every column as text (dtype=str) so identifiers are not parsed as numbers.
authorship_df = pd.read_csv('dataset/authorship.csv', dtype=str)
authorship_df.shape

(26378069, 2)

In [3]:
def write_mag_paper_title_abstract(mag_pubids, out_path='results/mag_title_abstract.json', mag_dir=None):
    """Write title/abstract rows of the selected MAG papers to a JSON-lines file.

    For each MAG document type, opens the SQLite file
    ``paper_title_abs_<type>.db`` inside ``mag_dir``, scans the table
    ``paper_title_abs_<type>`` and writes ``[PaperId, OriginalTitle, abstract]``
    as one JSON array per line for every row whose ``PaperId`` is in
    ``mag_pubids``.

    Parameters
    ----------
    mag_pubids : set
        Paper ids to keep. Membership is tested once per database row, so a
        hashed container (set/frozenset) should be passed. The ids must compare
        equal to the ``PaperId`` values returned by SQLite (same type).
    out_path : str, optional
        Destination file; it is overwritten. Defaults to the path used by the
        rest of the notebook.
    mag_dir : str, optional
        Directory containing the SQLite files. Defaults to the module-level
        ``MAG_DIR``.

    Returns
    -------
    None
        The result is only written to ``out_path``.
    """
    if mag_dir is None:
        mag_dir = MAG_DIR
    print(len(mag_pubids))
    # '' selects paper_title_abs_.db / table paper_title_abs_
    # (presumably papers without a document type — TODO confirm).
    doc_types = ['BookChapter', 'Book', 'Conference', 'Dataset', 'Journal', '', 'Patent', 'Repository']
    # `with` guarantees the output file is flushed and closed even if a query fails.
    with open(out_path, 'w') as fout:
        for dt in doc_types:
            print(dt)
            db_path = join(mag_dir, 'paper_title_abs_%s.db' % dt)
            conn = sqlite3.connect(db_path)
            try:
                cur = conn.cursor()
                # The table name comes from the fixed list above, never from user input.
                cur.execute('SELECT PaperId, OriginalTitle, abstract FROM paper_title_abs_%s;' % dt)
                for fields in cur:
                    if fields[0] in mag_pubids:
                        fout.write(json.dumps(fields))
                        fout.write('\n')
                cur.close()
            finally:
                # Release the database handle even when the scan raises.
                conn.close()

# Pass the ids as a set: duplicates collapse and the per-row membership test is a hash lookup.
write_mag_paper_title_abstract(set(authorship_df.MAGPaperID))

16942415
BookChapter
Book
Conference
Dataset
Journal

Patent
Repository


## TFIDF

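Run the cells in this section from a terminal (for example as a standalone Python script) rather than inside the notebook; the values in the trailing comments are the outputs recorded from that run.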

### Paper vectors

In [None]:
import pandas as pd
import simplejson as json

from scipy.sparse import save_npz
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

In [None]:
def load_mag_paper_abs(path='results/mag_title_abstract.json'):
    """Load the JSON-lines title/abstract dump into a two-column DataFrame.

    Each input line is a JSON array whose first element is the paper id and
    whose remaining elements (title, abstract) are joined with single spaces
    into one text field.

    Parameters
    ----------
    path : str, optional
        JSON-lines file to read. Defaults to the file this notebook writes in
        the extraction step.

    Returns
    -------
    pandas.DataFrame
        Columns ``magpubid`` (first array element) and ``tit_abs`` (joined
        text), one row per input line, in file order.
    """
    records = []
    # `with` closes the file handle deterministically.
    with open(path) as fin:
        for line in fin:
            fields = json.loads(line)
            # JSON null decodes to None (e.g. a missing abstract); skip it so
            # str.join does not raise TypeError.
            text = ' '.join(part for part in fields[1:] if part is not None)
            records.append([fields[0], text])
    return pd.DataFrame(records, columns=['magpubid', 'tit_abs'])

# One row per paper: its id plus the concatenated title/abstract text.
mag_abs_df = load_mag_paper_abs()
mag_abs_df.shape # (16942415, 2)

In [None]:
# Two-stage text pipeline: raw term counts followed by TF-IDF re-weighting.
pipe = Pipeline([
    # Drop scikit-learn's built-in English stop words and terms found in fewer than 2 documents.
    ('count', CountVectorizer(stop_words='english', min_df=2)),
    # NOTE(review): the step name is spelled 'tfid'; left as-is because it is a runtime identifier.
    ('tfid', TfidfTransformer())
])

In [None]:
# Fit the vocabulary/IDF weights on all title+abstract strings and transform them;
# matrix rows follow the row order of mag_abs_df.
mag_abs_tfidf = pipe.fit_transform(mag_abs_df.tit_abs)
mag_abs_tfidf.shape # (16942415, 2275293)

In [None]:
# Show the sparse-matrix summary; the lines below are the output recorded from the original run.
mag_abs_tfidf
# <16942415x2275293 sparse matrix of type '<class 'numpy.float64'>'
#         with 967345814 stored elements in Compressed Sparse Row format>

In [None]:
# Persist the paper ids, one per line (no index, no header), in the DataFrame's row order
# so that rows of the saved TF-IDF matrix can be mapped back to papers.
mag_abs_df[['magpubid']].to_csv('dataset/paper_tfidf_MAGPaperID.txt', index=False, header=False)

In [None]:
# Store the sparse paper-by-term TF-IDF matrix in SciPy's .npz format.
save_npz('dataset/paper_tfidf.npz', mag_abs_tfidf)

### Author vectors

In [None]:
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, load_npz, save_npz

In [None]:
# Map each author id (PID) to the set of MAG paper ids linked to it; all columns are read as text.
pid_to_mag_pubids = (
    pd.read_csv('dataset/authorship.csv', dtype=str)
    .groupby('PID')['MAGPaperID']
    .apply(set)
    .to_dict()
)
len(pid_to_mag_pubids) # 494455

In [None]:
# Paper ids, one per line; list position i corresponds to line i of the file.
# The `with` block closes the file handle instead of leaving it to the garbage collector.
with open('dataset/paper_tfidf_MAGPaperID.txt') as fin:
    mag_pubids = [line.strip() for line in fin]
len(mag_pubids) # 16942415

In [None]:
# Reload the sparse paper-by-term TF-IDF matrix saved in the previous section.
mag_abs_tfidf = load_npz('dataset/paper_tfidf.npz')
mag_abs_tfidf.shape # (16942415, 2275293)

In [None]:
def get_author_vector_tfidf(pid_to_pubids=None, paper_ids=None, paper_tfidf=None):
    """Average the TF-IDF rows of each author's papers into one sparse vector.

    Authors are processed in ascending numeric order of their id. Authors with
    no paper present in ``paper_ids`` are skipped. Only strictly positive
    averaged weights are stored in the result.

    The averaging is done with a single sparse matrix product
    (``averaging_matrix . paper_tfidf``) instead of densifying one full-width
    vector per author, which keeps memory and run time proportional to the
    number of stored elements.

    Parameters
    ----------
    pid_to_pubids : dict, optional
        Maps author id (string convertible with ``int``) to an iterable of
        paper ids. Defaults to the module-level ``pid_to_mag_pubids``.
    paper_ids : sequence, optional
        Paper id of every row of ``paper_tfidf``, in row order. Defaults to
        the module-level ``mag_pubids``.
    paper_tfidf : scipy.sparse matrix, optional
        Paper-by-term matrix. Defaults to the module-level ``mag_abs_tfidf``.

    Returns
    -------
    pids : list
        Author ids that received a vector, sorted numerically.
    author_tfidf : scipy.sparse.csr_matrix
        Shape ``(len(pids), paper_tfidf.shape[1])``; row ``i`` is the mean of
        the paper rows belonging to ``pids[i]``.
    """
    if pid_to_pubids is None:
        pid_to_pubids = pid_to_mag_pubids
    if paper_ids is None:
        paper_ids = mag_pubids
    if paper_tfidf is None:
        paper_tfidf = mag_abs_tfidf

    pubid_to_idx = {p: i for i, p in enumerate(paper_ids)}

    # Build the author-by-paper averaging matrix directly in CSR form:
    # row = author, column = paper row index, value = 1 / (#papers of the author).
    pids = []
    indptr = [0]
    paper_rows = []
    weights = []
    for pid in sorted(pid_to_pubids, key=int):
        rows = sorted({pubid_to_idx[p] for p in pid_to_pubids[pid] if p in pubid_to_idx})
        if not rows:
            continue
        pids.append(pid)
        paper_rows.extend(rows)
        weights.extend([1.0 / len(rows)] * len(rows))
        indptr.append(len(paper_rows))

    averaging = csr_matrix(
        (np.array(weights, dtype=np.float64),
         np.array(paper_rows, dtype=np.int64),
         np.array(indptr, dtype=np.int64)),
        shape=(len(pids), paper_tfidf.shape[0]))
    author_tfidf = csr_matrix(averaging.dot(paper_tfidf))

    # Keep only strictly positive weights; anything else (zeros, negatives, NaN) is dropped.
    author_tfidf.data[~(author_tfidf.data > 0)] = 0
    author_tfidf.eliminate_zeros()
    author_tfidf.sort_indices()
    return pids, author_tfidf

# pids[i] is the author id of row i of pid_vector_tfidf.
pids, pid_vector_tfidf = get_author_vector_tfidf()

In [None]:
# Number of authors that received a vector.
len(pids) # 494455

In [None]:
# Show the sparse-matrix summary; the lines below are the output recorded from the original run.
pid_vector_tfidf
# <494455x2275293 sparse matrix of type '<class 'numpy.float64'>'
#         with 530705946 stored elements in Compressed Sparse Row format>

In [None]:
# Persist the author ids, one per line (no index, no header), in the list's order
# so that rows of the saved author matrix can be mapped back to authors.
pd.Series(pids).to_csv('dataset/researcher_tfidf_PID.txt', index=False, header=False)

In [None]:
# Store the sparse author-by-term TF-IDF matrix in SciPy's .npz format.
save_npz('dataset/researcher_tfidf.npz', pid_vector_tfidf)