In [None]:
import pathlib
import numpy as np
import scipy.sparse
import scipy.io
import os
import sys
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
# import utils.preprocess
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as sklearn_stopwords
from nltk import word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from collections import Counter
# Fetch the NLTK resources this notebook's imports rely on
# (WordNet for the lemmatizer, plus the stop-word corpus).
for _nltk_resource in ('wordnet', 'stopwords'):
    nltk.download(_nltk_resource)

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as loading finishes or fails, instead of being left to the garbage
    collector.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only point this at files from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


dataset = _load_pickle('raw/AMiner/dblp_preprocessed_dataset.pkl')
train_test_idx = _load_pickle('raw/AMiner/Train_Test_indices.pkl')
# Alternative inputs (V2.2 files), kept for reference:
# dataset = _load_pickle('raw/AMiner/dblp_preprocessed_dataset_V2.2.pkl')
# train_test_idx = _load_pickle('raw/AMiner/Train_Test_indices_V2.2.pkl')
docID_venue = _load_pickle('raw/AMiner/documentID_venue.pkl')

In [None]:
def _empty_table(*column_names):
    """Return a DataFrame with the given columns and no rows."""
    return pd.DataFrame(columns=list(column_names))


# Entity tables (one row per author / paper / term / conference).
author_label = _empty_table('author_id', 'label', 'author_name')
papers = _empty_table('paper_id', 'paper_title')
terms = _empty_table('term_id', 'term')
confs = _empty_table('conf_id', 'conf')

# Relation tables (one row per paper-to-entity link).
paper_author = _empty_table('paper_id', 'author_id')
paper_conf = _empty_table('paper_id', 'conf_id')
paper_term = _empty_table('paper_id', 'term_id')

In [None]:
# Cleaning venue names
# Canonical short names; a raw venue string is replaced by the name it contains.
publication_list = [
    'sigmod', 'vldb', 'icde', 'icdt', 'edbt', 'pods', 'kdd',
    'www', 'sdm', 'pkdd', 'icdm', 'cikm', 'aaai', 'icml',
    'ecml', 'colt', 'uai', 'soda', 'focs', 'stoc', 'stacs',
]

for i, record in enumerate(docID_venue):
    lowered_venue = record[1].lower()
    # Substring match against every canonical name. When several names match
    # (e.g. 'kdd' is contained in 'pkdd'), the one listed last wins.
    matching_names = [pub for pub in publication_list if pub in lowered_venue]
    if matching_names:
        docID_venue[i][1] = matching_names[-1]

In [None]:
# ------------------------------------------------------------------
# Build the entity and relation tables from the loaded records.
#
# Rows are collected in plain Python lists and each DataFrame is created once
# at the end. Growing a frame with DataFrame.append inside a loop copies the
# whole frame on every call (quadratic time), and DataFrame.append no longer
# exists in pandas >= 2.0. Rebuilding the frames from scratch also makes this
# cell safe to re-run without duplicating rows.
# ------------------------------------------------------------------
authors_counter = Counter()
terms_counter = Counter()

paper_author_rows = []
paper_term_rows = []
paper_rows = []

for record in dataset:
    paper_id = record[0]
    # Column indices of the non-zero entries in the record's skill and author
    # vectors. Presumably record[1] / record[2] are single-row scipy sparse
    # matrices -- TODO confirm against the code that produced the pickle.
    skillIdx = record[1].todense().nonzero()[1]
    terms_counter.update(skillIdx)
    authorIdx = record[2].todense().nonzero()[1]
    authors_counter.update(authorIdx)

    paper_author_rows.extend((paper_id, authorId) for authorId in authorIdx)
    paper_term_rows.extend((paper_id, skillId) for skillId in skillIdx)
    # Titles are not available in this dataset; 'na' is a placeholder.
    paper_rows.append((paper_id, 'na'))

paper_author = pd.DataFrame(paper_author_rows, columns=['paper_id', 'author_id'])
paper_term = pd.DataFrame(paper_term_rows, columns=['paper_id', 'term_id'])
papers = pd.DataFrame(paper_rows, columns=['paper_id', 'paper_title'])

# One row per distinct author; label (-1) and name ('na') are placeholders.
unique_authors_idx = list(authors_counter.keys())
author_label = pd.DataFrame(
    [(author_id, -1, 'na') for author_id in unique_authors_idx],
    columns=['author_id', 'label', 'author_name'],
)

# One row per distinct term; the term text is a placeholder.
unique_terms_idx = list(terms_counter.keys())
terms = pd.DataFrame(
    [(term_id, 'na') for term_id in unique_terms_idx],
    columns=['term_id', 'term'],
)

# Conferences: count venue names, then number them in first-seen order.
conf_counter = Counter(record[1] for record in docID_venue)
venues = list(conf_counter.keys())
conf_confID = {venue: i for i, venue in enumerate(venues)}
confs = pd.DataFrame(
    [(conf_id, venue) for venue, conf_id in conf_confID.items()],
    columns=['conf_id', 'conf'],
)

# Link every (paper, venue) record to its conference id.
paper_conf = pd.DataFrame(
    [(record[0], conf_confID[record[1]]) for record in docID_venue],
    columns=['paper_id', 'conf_id'],
)

In [None]:
# Recount term / author occurrences and additionally record how wide each
# record's author vector is (its number of columns).
authors_counter = Counter()
terms_counter = Counter()
author_size_counter = Counter()
for record in dataset:
    skillIdx = record[1].todense().nonzero()[1]
    terms_counter.update(skillIdx)

    # Densify the author vector once and reuse it for both the non-zero
    # column lookup and the width, instead of calling todense() twice.
    author_dense = record[2].todense()
    authorIdx = author_dense.nonzero()[1]
    authors_counter.update(authorIdx)
    author_size_counter.update([author_dense.shape[1]])

In [None]:
# Quick sanity check: number of distinct authors, then the distribution of
# author-vector widths seen across the records.
num_unique_authors = len(unique_authors_idx)
print(num_unique_authors)
print(author_size_counter)

In [None]:
# author_label = pd.read_csv('raw/AMiner/author_label.txt', sep='\t', header=None, names=['author_id', 'label', 'author_name'], keep_default_na=False, encoding='utf-8')
# paper_author = pd.read_csv('raw/AMiner/paper_author.txt', sep='\t', header=None, names=['paper_id', 'author_id'], keep_default_na=False, encoding='utf-8')
# paper_conf = pd.read_csv('raw/AMiner/paper_conf.txt', sep='\t', header=None, names=['paper_id', 'conf_id'], keep_default_na=False, encoding='utf-8')
# paper_term = pd.read_csv('raw/AMiner/paper_term.txt', sep='\t', header=None, names=['paper_id', 'term_id'], keep_default_na=False, encoding='utf-8')
# papers = pd.read_csv('raw/AMiner/paper.txt', sep='\t', header=None, names=['paper_id', 'paper_title'], keep_default_na=False, encoding='cp1252')
# terms = pd.read_csv('raw/AMiner/term.txt', sep='\t', header=None, names=['term_id', 'term'], keep_default_na=False, encoding='utf-8')
# confs = pd.read_csv('raw/AMiner/conf.txt', sep='\t', header=None, names=['conf_id', 'conf'], keep_default_na=False, encoding='utf-8')

In [None]:
def _offset_index(entity_ids, offset):
    """Map each id to ``offset + position`` in iteration order."""
    return {entity_id: offset + position for position, entity_id in enumerate(entity_ids)}


def _build_edges(relation_df, src_col, src_map, dst_col, dst_map):
    """Translate a two-column relation table into (source, target) serial-id tuples.

    Raises KeyError if a row references an id missing from its mapping.
    """
    return [
        (src_map[src], dst_map[dst])
        for src, dst in zip(relation_df[src_col], relation_df[dst_col])
    ]


def _save_edges(edge_list, out_dir, file_name):
    """Write ``edge_list`` to ``out_dir/file_name`` as a source,target CSV.

    The frame is built straight from the list of tuples, so an empty edge list
    still produces a valid (header-only) file.
    """
    edge_df = pd.DataFrame(edge_list, columns=['source', 'target'])
    edge_df.to_csv(os.path.join(out_dir, file_name), index=False)


# ======================================================
# Keep only rows that reference known authors / papers / terms
# ======================================================
authors = author_label['author_id'].to_list()
paper_author = paper_author[paper_author['author_id'].isin(authors)].reset_index(drop=True)
valid_papers = paper_author['paper_id'].unique()
print('Number of papers :', len(valid_papers))

papers = papers[papers['paper_id'].isin(valid_papers)].reset_index(drop=True)
paper_conf = paper_conf[paper_conf['paper_id'].isin(valid_papers)].reset_index(drop=True)
# NOTE: this prints the number of paper-conference rows that survived the filter.
print('Number of papers :', len(paper_conf))

paper_term = paper_term[paper_term['paper_id'].isin(valid_papers)].reset_index(drop=True)
valid_terms = paper_term['term_id'].unique()
terms = terms[terms['term_id'].isin(valid_terms)].reset_index(drop=True)

# Sort each entity table by id so serial ids are assigned deterministically.
author_label = author_label.sort_values('author_id').reset_index(drop=True)
papers = papers.sort_values('paper_id').reset_index(drop=True)
terms = terms.sort_values('term_id').reset_index(drop=True)
confs = confs.sort_values('conf_id').reset_index(drop=True)

print('Number of conferences ', len(confs))
print('Number of authors ', len(author_label))
print('Number of terms ', len(terms))
print('Number of papers ', len(papers))

authors_list = list(author_label['author_id'])
papers_list = list(papers['paper_id'])
term_list = list(terms['term_id'])
conf_list = list(confs['conf_id'])
dim = len(authors_list) + len(papers_list) + len(term_list) + len(conf_list)
print(' Total entities :: ', dim)

# ======================================================
# Serial ids: one contiguous range laid out as
# [authors | papers | terms | conferences]
# ======================================================
paper_offset = len(authors_list)
term_offset = paper_offset + len(papers_list)
conf_offset = term_offset + len(term_list)

author_id_mapping = _offset_index(authors_list, 0)
paper_id_mapping = _offset_index(papers_list, paper_offset)
term_id_mapping = _offset_index(term_list, term_offset)
conf_id_mapping = _offset_index(conf_list, conf_offset)

type_dict = {'author': author_id_mapping, 'paper': paper_id_mapping, 'term': term_id_mapping, 'conf': conf_id_mapping}
# Build one frame per domain and concatenate once (DataFrame.append was
# removed in pandas 2.0).
entity_id_map = pd.concat(
    [
        pd.DataFrame(
            {
                'domain': [_type] * len(_dict),
                'entity_id': list(_dict.keys()),
                'serial_id': list(_dict.values()),
            },
            columns=['domain', 'entity_id', 'serial_id'],
        )
        for _type, _dict in type_dict.items()
    ],
    ignore_index=True,
)

# ======================================================
# Save data
# ======================================================
data_save_path = 'processed_data/'
os.makedirs(data_save_path, exist_ok=True)
entity_id_map.to_csv(os.path.join(data_save_path, 'entity_id_mapping.csv'))

# Create graph data: one single-column CSV of serial ids per node type.
nodes_author_df = pd.DataFrame(data={'author': list(author_id_mapping.values())})
nodes_paper_df = pd.DataFrame(data={'paper': list(paper_id_mapping.values())})
nodes_term_df = pd.DataFrame(data={'term': list(term_id_mapping.values())})
nodes_conf_df = pd.DataFrame(data={'conf': list(conf_id_mapping.values())})

nodes_author_df.to_csv(os.path.join(data_save_path, 'nodes_author.csv'), index=False)
nodes_paper_df.to_csv(os.path.join(data_save_path, 'nodes_paper.csv'), index=False)
nodes_term_df.to_csv(os.path.join(data_save_path, 'nodes_term.csv'), index=False)
nodes_conf_df.to_csv(os.path.join(data_save_path, 'nodes_conf.csv'), index=False)

# Edge lists: the paper is always the source; the target is the linked
# author / term / conference.
PA_edge_list = _build_edges(paper_author, 'paper_id', paper_id_mapping, 'author_id', author_id_mapping)
_save_edges(PA_edge_list, data_save_path, 'PA_edges.csv')

PT_edge_list = _build_edges(paper_term, 'paper_id', paper_id_mapping, 'term_id', term_id_mapping)
_save_edges(PT_edge_list, data_save_path, 'PT_edges.csv')

PC_edge_list = _build_edges(paper_conf, 'paper_id', paper_id_mapping, 'conf_id', conf_id_mapping)
_save_edges(PC_edge_list, data_save_path, 'PC_edges.csv')

In [None]:
# Average number of edges per endpoint, for each relation.
# PA / PT edges are grouped by their first element (the paper), PC edges by
# their second element (the conference).
ap_counter = Counter(paper for paper, _author in PA_edge_list)
print(np.mean(list(ap_counter.values())))

tp_counter = Counter(paper for paper, _term in PT_edge_list)
print(np.mean(list(tp_counter.values())))

pc_counter = Counter(conf for _paper, conf in PC_edge_list)
print(np.mean(list(pc_counter.values())))

In [None]:
# ==============================
# Create data for HIN2Vec
# ==============================

# One row per edge: (node1, node2, rel), where rel identifies the relation:
# 0 = paper-author, 1 = paper-term, 2 = paper-conference.
# All rows are gathered first and the frame is built once. Appending edge by
# edge copies the frame on every call, and DataFrame.append was removed in
# pandas 2.0.
hin2vec_rows = [
    (edge[0], edge[1], rel)
    for rel, edge_list in enumerate((PA_edge_list, PT_edge_list, PC_edge_list))
    for edge in edge_list
]
df = pd.DataFrame(hin2vec_rows, columns=['node1', 'node2', 'rel'])
  

In [None]:
# Force integer columns before writing, so ids are not serialised as floats
# or objects, then save the HIN2Vec edge file without the index column.
for _column in ('node1', 'node2', 'rel'):
    df[_column] = df[_column].astype(int)

fpath = os.path.join(data_save_path, 'hin2vec_dblp_input.txt')
df.to_csv(fpath, sep=',', index=None)

In [None]:
# Display the HIN2Vec edge table (columns node1, node2, rel) as the cell output.
df

In [None]:
# Preview the first five rows of the entity-id -> serial-id mapping table.
entity_id_map.head(5)

In [None]:
# Spot-check the mapping row with index label 14000.
# NOTE(review): 14000 is a hard-coded label; .loc raises KeyError if the
# mapping has no such row -- confirm it is valid for the dataset in use.
entity_id_map.loc[14000]