In [15]:
import codecs
from utils.general import *
from utils.data_helper import *
from utils.task_helper import *
import pickle
import time
import os
from datetime import datetime  
import random
import numpy as np
import math
from data_preparation.user import *

## Prepare paper-related files

In [4]:
# Root folders of the pipeline: raw inputs are read from InFile_dir and
# every generated artifact is written under OutFile_dir.
InFile_dir = 'data_folder/raw'
OutFile_dir = 'data_folder/my'
create_dir(OutFile_dir)

Path_PaperTitleAbs_bySentence = os.path.join(
    InFile_dir, 'PaperTitleAbs_bySentence.txt'
)
Path_PaperFeature = os.path.join(OutFile_dir, 'paper_feature.txt')

# Forwarded to gen_paper_content as `doc_len`; presumably the maximum number
# of words kept per paper -- TODO confirm against the helper.
max_word_size_per_paper = 15

# The lookup tables start empty. word2idx / entity2idx are rebound to the
# values returned by gen_paper_content; relation2idx is only consumed by the
# knowledge-graph cell further down.
word2idx, entity2idx, relation2idx = {}, {}, {}

# Only the "Title" field is requested; the result is written to
# Path_PaperFeature.
word2idx, entity2idx = gen_paper_content(
    Path_PaperTitleAbs_bySentence,
    Path_PaperFeature,
    word2idx,
    entity2idx,
    field=["Title"],
    doc_len=max_word_size_per_paper,
)


loading file PaperTitleAbs_bySentence.txt...
loading line: 880000, time elapses: 10.4s  
parsing into feature file  ...
parsed paper count: 110000, time elapses: 0.5s 


In [5]:
# Knowledge-graph inputs/outputs: relations are read from the raw
# RelatedFieldOfStudy file and the results go to a dedicated "KG" sub-folder.
Path_RelatedFieldOfStudy = os.path.join(InFile_dir, 'RelatedFieldOfStudy.txt')
OutFile_dir_KG = os.path.join(OutFile_dir, 'KG')

# Pickle destinations for the vocabularies; they are only written by a later
# cell, after all generators have run.
word2idx_filename = os.path.join(OutFile_dir, 'word2idx.pkl')
entity2idx_filename = os.path.join(OutFile_dir, 'entity2idx.pkl')

create_dir(OutFile_dir_KG)

# NOTE(review): entity2idx / relation2idx are passed by reference and the
# return value is ignored, so the helper presumably updates them in place --
# confirm in gen_knowledge_relations.
gen_knowledge_relations(
    Path_RelatedFieldOfStudy,
    OutFile_dir_KG,
    entity2idx,
    relation2idx,
)

processing file RelatedFieldOfStudy.txt... done.


In [6]:
# Write the sentence collection derived from the title/abstract file.
# word2idx is handed to the helper; whether it is extended there is not
# visible from this cell -- TODO confirm.
Path_SentenceCollection = os.path.join(OutFile_dir, 'sentence.txt')
gen_sentence_collection(Path_PaperTitleAbs_bySentence, Path_SentenceCollection, word2idx)

# Persist the vocabularies: word2idx as a pickle plus a text dump
# (word2id.tsv), entity2idx as a pickle only.
with open(word2idx_filename, 'wb') as word_pkl:
    pickle.dump(word2idx, word_pkl)

word2id_tsv_path = os.path.join(OutFile_dir, 'word2id.tsv')
dump_dict_as_txt(word2idx, word2id_tsv_path)

with open(entity2idx_filename, 'wb') as entity_pkl:
    pickle.dump(entity2idx, entity_pkl)

loading file PaperTitleAbs_bySentence.txt...
loading line: 880000, time elapses: 9.8s 

## Prepare user-related files

In [9]:

# Start of the user-side timing. `_t0` is read again by the later cell that
# generates the experiment splits and prints the elapsed time, so this cell
# has to run before that one.
_t0 = time.time()

# Raw inputs (under InFile_dir) and the generated author -> referenced-papers
# table (under OutFile_dir).
Path_PaperReference = os.path.join(InFile_dir, 'PaperReferences.txt')
Path_PaperAuthorAffiliations = os.path.join(InFile_dir, 'PaperAuthorAffiliations.txt')
Path_Papers = os.path.join(InFile_dir, 'Papers.txt')
Path_Author2ReferencePapers = os.path.join(OutFile_dir, 'Author2ReferencePapers.tsv')

# Load the three lookup tables that get_author_reference_list combines.
author2paper_list = load_author_paperlist(Path_PaperAuthorAffiliations)
paper2date = load_paper_date(Path_Papers)
paper2reference_list = load_paper_reference(Path_PaperReference)

author2reference_list = get_author_reference_list(author2paper_list, paper2reference_list, paper2date)

output_author2reference_list(
    author2reference_list,
    Path_Author2ReferencePapers
)

# Folder that receives the DKN training files written by later cells.
# Fix: the original called create_dir(OutFile_dir_KG) here, which re-created
# the knowledge-graph folder and never created the DKN folder.
OutFile_dir_DKN = os.path.join(OutFile_dir, 'DKN-training-folder')
create_dir(OutFile_dir_DKN)


loading PaperAuthorAffiliations.txt...
loading Papers.txt...
loading PaperReferences.txt...
parsing user's reference list ...
parsed user count: 430000, time elapses: 4.0s 
outputing author reference list


In [14]:
# Generate two experiment splits from the same author -> reference table:
# a 'small' one (item_ratio=0.1) and a 'full' one (item_ratio=1.0).
# Both are written into OutFile_dir_DKN; the tag presumably distinguishes
# the output file names -- TODO confirm in gen_experiment_splits.
for _split_ratio, _split_tag in ((0.1, 'small'), (1.0, 'full')):
    gen_experiment_splits(
        Path_Author2ReferencePapers,
        OutFile_dir_DKN,
        Path_PaperFeature,
        item_ratio=_split_ratio,
        tag=_split_tag,
    )

# `_t0` was set in the earlier cell that builds the author reference list,
# so the reported time also includes any idle time between running the two
# cells.
_t1 = time.time()
print('time elapses for user is : {0:.1f}s'.format(_t1 - _t0))

expanding user behaviors...
processing user number : 287000, time elapses: 1.7s done. Sample number in train / valid / test is 146426 / 7894 / 7894
Negative sampling for train...
Negative sampling for validation...
Negative sampling for test...
done.
expanding user behaviors...
processing user number : 287000, time elapses: 8.6s done. Sample number in train / valid / test is 1782333 / 125010 / 125010
Negative sampling for train...
Negative sampling for validation...
Negative sampling for test...
done.
time elapses for user is : 11422.0s


### Prepare item-to-item (item2item) files

In [18]:
def _write_filtered_pair_counts(out_path, pair2cnt, valid_items):
    """Write `first,second,count` rows for pairs whose two members are both valid.

    Args:
        out_path: Destination CSV path; the file is overwritten.
        pair2cnt: Mapping from an indexable pair (pair[0], pair[1]) to a count.
        valid_items: Container supporting `in`; a row is written only when
            both members of the pair are contained in it.
    """
    with open(out_path, 'w') as wt:
        for pair, cnt in pair2cnt.items():
            if pair[0] in valid_items and pair[1] in valid_items:
                wt.write('{0},{1},{2}\n'.format(pair[0], pair[1], cnt))


# Derive the item2item folder from OutFile_dir instead of repeating the
# literal path, so it follows any change of the output root.
OutFile_dir_item2item = os.path.join(OutFile_dir, 'item2item')
create_dir(OutFile_dir_item2item)

# Items loaded from the paper feature file; used below to filter pairs.
item_set = load_has_feature_items(Path_PaperFeature)

# Two pair -> count tables are produced from the reference file.
Path_PaperReference = os.path.join(InFile_dir, 'PaperReferences.txt')
pair2CocitedCnt, pair2CoReferenceCnt = gen_paper_cocitation(Path_PaperReference)

Path_paper_pair_cocitation = os.path.join(OutFile_dir_item2item, 'paper_pair_cocitation_cnt.csv')
Path_paper_pair_coreference = os.path.join(OutFile_dir_item2item, 'paper_pair_coreference_cnt.csv')

# Same filtering/writing logic for both tables (previously two copy-pasted loops).
_write_filtered_pair_counts(Path_paper_pair_cocitation, pair2CocitedCnt, item_set)
_write_filtered_pair_counts(Path_paper_pair_coreference, pair2CoReferenceCnt, item_set)



loading PaperReferences.txt...
process paper num 53400 / 53452...time elapses: 8.7s	Done.
process paper num 73600 / 73699...time elapses: 48.4s	Done.


In [19]:

# Inputs for the same-author pair generation. Path_Papers, paper2date and
# Path_PaperAuthorAffiliations are re-derived here, so this cell does not
# rely on the user-preparation cell for them.
Path_PaperAuthorAffiliations = os.path.join(InFile_dir, 'PaperAuthorAffiliations.txt')
Path_Papers = os.path.join(InFile_dir, 'Papers.txt')
Path_FirstAuthorPaperPair = os.path.join(OutFile_dir_item2item, 'paper_pair_cofirstauthor.csv')

paper2date = load_paper_date(Path_Papers)

# NOTE: this rebinds author2paper_list using a different loader than the
# user-preparation cell (load_paper_author_relation rather than
# load_author_paperlist); it also yields the paper -> author-set mapping.
author2paper_list, paper2author_set = load_paper_author_relation(Path_PaperAuthorAffiliations)

# Pairs are written to Path_FirstAuthorPaperPair; item_set is passed along,
# presumably to restrict the pairs to items with features -- TODO confirm.
first_author_pairs = gen_paper_pairs_from_same_author(
    author2paper_list,
    paper2author_set,
    paper2date,
    Path_FirstAuthorPaperPair,
    item_set,
)



loading Papers.txt...
loading PaperAuthorAffiliations.txt...
process author num 435800 / 435822...time elapses: 1.0s

In [24]:

# Combine the three item-pair files into train/valid files inside the DKN
# folder (the item2item_train.txt / item2item_valid.txt read below).
_item_pair_files = [
    Path_paper_pair_cocitation,
    Path_FirstAuthorPaperPair,
    Path_paper_pair_coreference,
]
split_train_valid_file(_item_pair_files, OutFile_dir_DKN)

# Run negative sampling for each split, reading the pair file and writing
# the corresponding *_instances.txt file.
for _pairs_name, _instances_name in (
    ('item2item_train.txt', 'item2item_train_instances.txt'),
    ('item2item_valid.txt', 'item2item_valid_instances.txt'),
):
    gen_negative_instances(
        item_set,
        os.path.join(OutFile_dir_DKN, _pairs_name),
        os.path.join(OutFile_dir_DKN, _instances_name),
        9,  # presumably the number of negatives per positive pair -- TODO confirm
    )


negative sampling for file item2item_train.txt...
process line num 182400 / 182410...time elapses: 3.5s	done.
negative sampling for file item2item_valid.txt...
process line num 45700 / 45740...time elapses: 0.9s	done.
