In [1]:
import bigjson
# Open the raw DBLP v12 dump in binary mode and hand the stream to bigjson.
# NOTE(review): bigjson appears to parse lazily from the open stream, so `j` is
# presumably unusable once this `with` block has closed the file — confirm.
# No active code visible in this notebook reads `j` afterwards.
with open('../data/DBLP_v12/dblp.v12.json', 'rb') as f:
    j = bigjson.load(f)

In [2]:
def inverted_index_to_text(inverted_index, inverted_index_length):
    """Rebuild a space-separated text from a word -> positions mapping.

    Args:
        inverted_index: mapping whose keys are words and whose values are
            iterables of the integer positions at which that word occurs.
        inverted_index_length: number of token slots in the rebuilt text.

    Returns:
        The words placed at their positions and joined with single spaces.
        Slots that no word claims remain empty strings, so they show up as
        consecutive spaces in the result.

    Raises:
        IndexError: if a position does not fit in ``inverted_index_length``.
    """
    # One slot per token position, initially empty.
    slots = [''] * inverted_index_length

    for word, where in inverted_index.items():
        for index in where:
            slots[index] = word

    return ' '.join(slots)

# Toy inverted index: four words, each occurring at exactly one position.
inverted_index = dict(
    hello=[2],
    world=[3],
    foo=[0],
    bar=[1],
)

# Rebuild the four-token text from the index and show it.
documents = inverted_index_to_text(inverted_index, 4)
print(documents)

foo bar hello world


In [3]:
def fos_to_text(fos):
    """Join the ``'name'`` field of every entry in ``fos``, one per line.

    Args:
        fos: iterable of mappings, each providing a ``'name'`` key.

    Returns:
        The names in input order separated by newlines; an empty string
        when ``fos`` is empty.
    """
    return '\n'.join(entry['name'] for entry in fos)


In [4]:
from tqdm import tqdm
import pandas as pd
import os
import pickle


# Presumably meant to hold the per-paper text assembled by the disabled loop
# below; nothing active in this cell fills it.
papers_text = {}

# Disabled one-off pass over the raw dump. For each of the 4894081 records it
# concatenates title, year, citation count, doc type, publisher and, when
# present, the abstract rebuilt from its inverted index and the fields of study.
# NOTE(review): as written, element_str is built but never stored in
# papers_text, so the pickle.dump on the last line would write an empty dict.
# Fix that before re-enabling this code.
# with open('../data/DBLP_v12/dblp.v12.json', 'rb') as f:
#     j = bigjson.load(f)
#     for count in tqdm(range(4894081), total=4894081):
#         element = j[count]
#         element_str = element['title']
#         element_str += '\n' + 'Year: ' + str(element['year'])
#         element_str += '\n' + '#Citations: '  + str(element['n_citation'])
#         element_str += '\n' + element['doc_type']
#         element_str += '\n' + element['publisher']
#         if 'indexed_abstract' in element:  
#             element_str += '\n' + inverted_index_to_text(element['indexed_abstract']['InvertedIndex'], element['indexed_abstract']['IndexLength'])
#         if 'fos' in element:
#             element_str += '\n' + fos_to_text(element['fos'])

#     # papers_references.to_feather("data/DBLP_v12/papers_references.feather")
# pickle.dump(papers_text, open("../data/DBLP_v12/papers_text.pkl", "wb"))

In [5]:
# Read each split and rename its columns to the user/item vocabulary used by
# the rest of the notebook: 'id' -> 'user_id', 'references' -> 'item_id'.
with open('../data/DBLP_v12/papers_train.feather', 'rb') as f:
    train_papers = pd.read_feather(f).rename(
        columns={'id': 'user_id', 'references': 'item_id'}
    )

with open('../data/DBLP_v12/papers_test.feather', 'rb') as f:
    test_papers = pd.read_feather(f).rename(
        columns={'id': 'user_id', 'references': 'item_id'}
    )

# Train rows followed by test rows; each frame keeps its own index labels.
ratings = pd.concat([train_papers, test_papers])

ratings

Unnamed: 0,user_id,item_id,rating
0,1674,1535888970,1
1,1674,1992876689,1
2,1674,1993710814,1
3,1674,2035653341,1
4,1674,2043970887,1
...,...,...,...
13975185,99905544,1562217771,1
13975186,99905544,1625504505,1
13975187,99905544,2015022770,1
13975188,999847014,1978183200,1


In [6]:
# Dense zero-based index for every distinct user_id, numbered in order of
# first appearance in `ratings`.
user_list = list(ratings['user_id'].drop_duplicates())
user2id = dict(zip(user_list, range(len(user_list))))

# Same construction for item_id. The two numberings are built independently,
# so a given raw id may get a different index in each mapping.
item_list = list(ratings['item_id'].drop_duplicates())
item2id = dict(zip(item_list, range(len(item_list))))

In [7]:
# pickle.dump(user2id, open("../data/DBLP_v12/user2id.pkl", "wb"))
# pickle.dump(item2id, open("../data/DBLP_v12/item2id.pkl", "wb"))

In [8]:
# papers_references: user index -> set of item indices appearing with it in `ratings`.
# all_papers: every user index seen while scanning `ratings`.
# NOTE(review): all_papers only ever receives user2id indices, never item2id
# ones — confirm that is the intended candidate pool for later sampling.
papers_references = {}
all_papers = set()
for row in tqdm(ratings.itertuples(), total=len(ratings)):
    user_id, item_id = user2id[row.user_id], item2id[row.item_id]
    # setdefault creates the per-user set the first time the user is seen.
    papers_references.setdefault(user_id, set()).add(item_id)
    all_papers.add(user_id)


100%|██████████| 42368924/42368924 [01:02<00:00, 673680.48it/s]


In [11]:
import random, heapq

# Materialise the collected ids as a plain list (element order follows the
# iteration order of the existing container).
all_papers = [*all_papers]

class ListSet(list):
    """
    A list that also maintains an item -> position index (``idx_of``) so it
    supports constant-time membership, add and remove while staying indexable.

    Derived from: https://stackoverflow.com/a/15993515

    Initialization:
        Same as with list (basically same as with set):
        (including sets, lists, tuples,
        generator expressions and range() expressions.)
        Repeated items in the input are dropped; the first occurrence keeps
        its place.

    Setlike behaviors:
        for x in S:
        if x in S:  This is constant time.
        S.add(x)     "    "    "      "
        S.remove(x)  "    "    "      "
        len(s)       "    "    "      "
        print(S), str(S), repr(S)
        copy(S), deepcopy(S), list(S), set(S)  copy or cast.

    Listlike behaviors use internal list positions:
        S[3], S[-1]  # Returns an item.
        S[10:20]     # Returns a list, not ListSet.
        random.choice(S)  This is constant time.
        S += [4, 5, 6]
        S.append(7), same as S.add(7).
        len(s) in constant time.
        x = S.pop()  # Pop last element from internal list.
        y = S.pop(3) # Pop S[3] from internal list.

    Caveat:
        Mutators inherited unchanged from list (extend, insert, sort,
        reverse, item/slice assignment, del) do not update ``idx_of`` and
        must not be used on a ListSet.
    """
    def __init__(self, *args, idx_of=None):
        """
        Build the ListSet from the same arguments ``list`` accepts.

        ``idx_of`` is an optional pre-built item -> position mapping that
        must already match the items; it is copied, not shared. When it is
        missing or empty the index is rebuilt from the items.
        """
        if idx_of:
            # Trusted fast path (used by copy()): items and index already agree.
            super().__init__(*args)
            self.idx_of = idx_of.copy()
        else:
            # dict.fromkeys drops repeated items while keeping first-seen
            # order. Without this, duplicates would stay in the list but be
            # indexed only once, corrupting len() and remove().
            super().__init__(dict.fromkeys(list(*args)))
            self.idx_of = {item: i
                           for (i, item) in enumerate(self)}

    def add(self, item):
        """ Append ``item`` unless it is already a member. """
        if item not in self.idx_of:
            super().append(item)
            self.idx_of[item] = len(self) - 1

    def append(self, item):
        """
        append and += always append to the internal list,
        but remove and pop from other than the end
        can change the internal list's order.
        """
        self.add(item)

    def copy(self):
        """ Return a shallow copy of the ListSet. """
        return ListSet(self, idx_of=self.idx_of)

    def __copy__(self):
        """
        Support copy.copy(S).

        The default list-subclass protocol restores ``idx_of`` first and then
        re-appends every item through append(), which skips them all as
        "already present" and yields an empty list with a full index.
        """
        return self.copy()

    def __deepcopy__(self, memo):
        """ Support copy.deepcopy(S); same pitfall as __copy__. """
        from copy import deepcopy
        clone = ListSet(deepcopy(item, memo) for item in self)
        memo[id(self)] = clone
        return clone

    def list(self):
        """ Return the items as a plain list (a copy). """
        return super().copy()

    def __iadd__(self, items):
        """ self += items """
        for item in items:
            self.add(item)
        return self

    def remove(self, element):
        """
        Remove an element from a set; it must be a member.

        If the element is not a member, raise a KeyError.
        """
        try:
            position = self.idx_of.pop(element)
        except (KeyError, TypeError):
            # TypeError covers unhashable elements, which cannot be members;
            # they keep surfacing as KeyError. Unlike a bare except, this
            # does not intercept KeyboardInterrupt or SystemExit.
            raise KeyError(element) from None

        # Fill the hole with the last item so removal needs no shifting.
        last_item = super().pop()
        if position != len(self):
            self[position] = last_item
            self.idx_of[last_item] = position

    def pop(self, i=-1):
        """ Remove by internal list position. """
        item = self[i]
        self.remove(item)
        return item

    def __contains__(self, item):
        return item in self.idx_of

    def _str_body(self):
        return ", ".join(repr(item) for item in self)

    def __repr__(self):
        return "ListSet([" + self._str_body() + "])"

    def __str__(self):
        if self:
            return "{" + self._str_body() + "}"
        else:
            return "ListSet()"

# Wrap the id list in a ListSet so it can be passed to random.sample as a
# sequence while membership tests stay constant-time.
all_papers = ListSet(all_papers)

In [12]:
# Quick probe: random.sample on the ListSet hands back a plain list.
# Note that this draw also advances the global `random` state.
type(random.sample(all_papers, 100))

list

In [13]:
# Each paper gets 54 sampled negatives: the export cells further down take the
# first 4 for the training set ([:4]) and the rest for the test set ([4:]).
# NOTE(review): the candidates come from all_papers, which is filled with
# user2id indices, yet they are filtered against item2id indices and later
# written out as item ids — confirm the two index spaces are meant to mix.
min_len = 54
repeat_cases = []      # papers whose draw left fewer than min_len candidates
paper_negatives = {}   # paper index -> list of sampled negative ids
for paper in tqdm(all_papers, total=len(all_papers)):
    # Draw 100 distinct candidates and discard the ones this paper references.
    random_papers = set(random.sample(all_papers, 100))
    negatives = random_papers - papers_references[paper]
    if len(negatives) < min_len:
        # Too few candidates survived: the paper is recorded and the short
        # candidate set is stored as it is.
        # NOTE(review): nothing visible here re-draws for these papers.
        repeat_cases.append(paper)
    else:
        # random.sample requires a sequence: passing a set was deprecated in
        # Python 3.9 and raises TypeError from 3.11, so convert explicitly.
        negatives = random.sample(tuple(negatives), min_len)
    paper_negatives.setdefault(paper, []).extend(negatives)
    

100%|██████████| 2794154/2794154 [03:32<00:00, 13122.41it/s]


In [46]:
# Inspect the training split after the column rename.
# NOTE(review): the recorded output contains a negative item_id
# (-2134741067), which looks like an integer overflow upstream — confirm.
train_papers

Unnamed: 0,user_id,item_id,rating
0,1674,1535888970,1
1,1674,1992876689,1
2,1674,1993710814,1
3,1674,2035653341,1
4,1674,2043970887,1
...,...,...,...
28393729,99905544,-2134741067,1
28393730,999847014,1940933138,1
28393731,999847014,1975620021,1
28393732,999847014,2097595927,1


In [14]:
# Flatten the training split into three parallel lists (user index, item
# index, label) and pickle them.
# NOTE: rebinding `ratings` here replaces the DataFrame built when the splits
# were loaded, so the id-mapping cell cannot be re-run after this one without
# reloading the data.
users, items, ratings = [], [], []
for row in tqdm(train_papers.itertuples(), total=train_papers.shape[0]):
    user_id = user2id[row.user_id]
    item_id = item2id[row.item_id]
    # The observed (user, item) pair with its stored rating.
    users.append(int(user_id))
    items.append(int(item_id))
    ratings.append(int(row.rating))
    # The first four sampled negatives of this user, labelled 0. The same
    # four are emitted again for every row of the same user.
    for i in paper_negatives[user_id][:4]:
        users.append(int(user_id))
        items.append(int(i))
        ratings.append(0)

# `with` guarantees each file is flushed and closed once its dump finishes.
with open('../data/DBLP_v12/train_users_4.pkl', 'wb') as out_file:
    pickle.dump(users, out_file)
with open('../data/DBLP_v12/train_items_4.pkl', 'wb') as out_file:
    pickle.dump(items, out_file)
with open('../data/DBLP_v12/train_ratings_4.pkl', 'wb') as out_file:
    pickle.dump(ratings, out_file)

100%|██████████| 28393734/28393734 [01:06<00:00, 427685.84it/s]


In [15]:
# Sanity check: the three parallel training lists must have the same length.
len(users), len(items), len(ratings)

(141968670, 141968670, 141968670)

In [16]:
# Flatten the test split into three parallel lists (user index, item index,
# label) and pickle them. This overwrites the training lists built earlier.
users, items, ratings = [], [], []
for row in tqdm(test_papers.itertuples(), total=test_papers.shape[0]):
    user_id = user2id[row.user_id]
    item_id = item2id[row.item_id]
    # The observed (user, item) pair with its stored rating.
    users.append(int(user_id))
    items.append(int(item_id))
    ratings.append(int(row.rating))
    # Every sampled negative after the first four (50 when the full 54 were
    # drawn), labelled 0; the first four are the ones used for training.
    for i in paper_negatives[user_id][4:]:
        users.append(int(user_id))
        items.append(int(i))
        ratings.append(0)

# `with` guarantees each file is flushed and closed once its dump finishes.
with open('../data/DBLP_v12/test_users_50.pkl', 'wb') as out_file:
    pickle.dump(users, out_file)
with open('../data/DBLP_v12/test_items_50.pkl', 'wb') as out_file:
    pickle.dump(items, out_file)
with open('../data/DBLP_v12/test_ratings_50.pkl', 'wb') as out_file:
    pickle.dump(ratings, out_file)

100%|██████████| 13975190/13975190 [02:55<00:00, 79751.40it/s]


In [17]:
# Sanity check: the three parallel test lists must have the same length.
len(users), len(items), len(ratings)

(712734690, 712734690, 712734690)