# Pre-processing and Statistical Overview of CORA Dataset

## Preparations

### Parameters

In [1]:
dataset_path = './data/'

### Libraries

In [2]:
import pandas as pd
import numpy as np
import os
import sys
import _pickle as pickle
import pdb
import re

xml_begin = re.compile('<(\w+)>')
xml_end = re.compile('</(\w+)>')

### Custom Libraries

## Processing

In [3]:
output = []
with open(dataset_path + 'citations', 'r') as fid:
    for f in fid.readlines():
        output.append(f.strip().replace('\t', ','))

In [4]:
output_citations = {}
for element in output:
    if int(element.split(',')[0]) not in output_citations.keys():
        output_citations[int(element.split(',')[0])] = []
    output_citations[int(element.split(',')[0])].append(int(element.split(',')[1]))

In [5]:
output_citations[16]

[76972,
 32529,
 10392,
 213125,
 724,
 105808,
 268098,
 412687,
 18208,
 66517,
 10400,
 718421,
 18211,
 718423,
 8517,
 17,
 19,
 25255,
 25256,
 13070,
 18183,
 8981,
 10424,
 34828]

In [6]:
output_citations

{172005: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
 9351: [12, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26],
 116552: [27, 29, 30, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 47, 48],
 36213: [49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  59,
  60,
  61,
  62,
  63,
  6479,
  36213,
  1554,
  36224,
  251443,
  130062,
  19316,
  19318,
  40541,
  3318,
  98264,
  32956,
  3849,
  40549,
  2383,
  5949,
  35265,
  40552,
  251466,
  251467,
  94926,
  10105,
  3026,
  15371,
  6147,
  201247,
  2390,
  1888,
  222460,
  24662,
  59195],
 59444: [64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  79,
  80,
  83,
  84,
  85,
  87,
  88,
  89],
 53462: [90,
  91,
  92,
  93,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  109,
  110,
  111,
  112,
  113,
  258944,
  165924,
  2474,
  8459,
  7687,
  2502,
  12141,
  53462,
  4013,
  5903],
 39403: [114,
  115,
  116,
  117,
  118,
  120,
  121,
  122,
  12

In [7]:
print('number of keys: {}'.format(len(output_citations.keys())))

number of keys: 35788


In [8]:
#TODO: This gives error @shayan. Not sure what you are planning to do
#print('number of unique citeds (values): %d' % len(set(output_citations.values()))) 

In [9]:
classifications = []
with open(dataset_path + 'classifications', 'r') as fid:
    for f in fid.readlines():
        classifications.append(f.strip().replace('\t', ','))

In [10]:
labels = (set([k.split(',')[1] for k in classifications[:-1]]))

In [11]:
# demoing the values in the classification list, including the classes
classifications[:5]

['http:##www.isi.edu#sims#papers#94-sims-agents.ps,/Information_Retrieval/Retrieval/',
 'http:##www.cis.ohio-state.edu#~ren#tois.ps,/Information_Retrieval/Retrieval/',
 'ftp:##ftp.cs.umass.edu#pub#techrept#techreport#1996#UM-CS-1996-002.ps,/Information_Retrieval/Retrieval/',
 'http:##www.cs.cmu.edu#afs#cs#user#alex#docs#idvl#dl97.ps,/Information_Retrieval/Retrieval/',
 'http:##www.ri.cmu.edu#afs#cs#user#alex#docs#idvl#dl97.ps,/Information_Retrieval/Retrieval/']

In [12]:
# TODO @shayan: This is a bad way to build the map. because we have entries with "," which will lead to unexpected behaviour
# Example:
# ftp:##ftp.eecs.umich.edu#people#fessler#ps#93,isit,hero.ps.Z    /Artificial_Intelligence/Vision_and_Pattern_Recognition/
# You should probably change it for "labels" too above but looks like it is not used anywhere.
pruned_classes_for_ps = {}
with open(dataset_path + 'classifications', 'r') as fid:
    for f in fid.readlines():
        string_values = (f.strip().replace('\t', ',')).split(',') ### BAD
        if len(string_values) > 1:
            pruned_classes_for_ps[string_values[0]] = string_values[1]
            
pruned_classes_for_ps['ftp:##ftp.eecs.umich.edu#people#fessler#ps#93'] # outputs mscs.ps.Z which is wrong

'mscs.ps.Z'

### Processing xml info

In [13]:
def process_xml_info_in_papers_file(xml_info):
    output = {}
    missing = 'missing'
    tag = missing
    output[tag] = []
    tokens = xml_info.split()
    for token in tokens:
        tag_begin = xml_begin.match(token)
        tag_end = xml_end.match(token)

        if tag_begin:
            tag = tag_begin.group(1)
            output[tag] = []
        elif tag_end:
            assert tag_end.group(1) == tag, 'Bad XML nesting: %s != %s' % (tag_end.group(1), tag)
            tag = missing
        else:
            token = re.sub(r'\W+', '', token)
            token = token.lower()
            if len(token) > 0:
                output[tag].append(token)
    if 'address' in output.keys():
        output['address'] = [''.join(output['address'])]
    return output

In [14]:
def extract_paper_features_from_processed_xml_info(input_information):
    input_information
    features = []

    # preprocessings:
    if 'author' in input_information.keys():
        input_information['author'] = [e for e in input_information['author'] if len(e) > 2]

    input_information['title'] = [e for e in input_information['title'] if len(e) > 3]
    if 'author' in input_information.keys():
        for i in range(max(3, len(input_information['author']))):
            if i >= len(input_information['author']):
                features.append(
                    'author%d=missing' % i
                )
            else:
                features.append(
                    'author{}={}'.format(i, input_information['author'][i])
                )
    else:
        for i in range(3):
            features.append(
                    'author%d=missing' % i
                )


    
    for i in range(max(6, len(input_information['title']))):
        if i >= len(input_information['title']):
            features.append(
                'title%d=missing' % i
            )
        else:
            features.append(
                'title{}={}'.format(i, input_information['title'][i])
            )

    if 'publisher' in input_information.keys():   
        features.append(
            'publisher={}'.format(input_information['publisher'][0])
        )
    else:
        features.append(
            'publisher=missing'
        )

    if 'address' in input_information.keys():   
        features.append(
            'address={}'.format(input_information['address'][0])
        )
    else:
        features.append(
            'address=missing'
        )
    if 'year' in input_information.keys():   
        features.append(
            'year={}'.format(input_information['year'][0])
        )
    else:
        features.append(
            'year=missing'
        ) 
    return features

In [15]:
papers = []
features_for_id = {}
raw_xml_info_for_id = {}
with open(dataset_path + 'papers', 'r') as fid:
    for f in fid.readlines():
        elements = f.strip().split('\t')
        papers.append([int(elements[0]), elements[1]])
        if len(elements) == 3:
            raw_xml_info_for_id[int(elements[0])] = elements[2]
            features_for_id[int(elements[0])] = extract_paper_features_from_processed_xml_info(
                process_xml_info_in_papers_file(elements[2])
            )

In [16]:
# TODO: WHY IS THIS ASSERT FALSE? (according to the paper though, removing duplicates is good enough)
# @shayan. What do you mean here? See papers[:5] and notice that papers actually has duplicates 
# and there haven't been removed yet
len(set([f[0] for f in papers])) == len([f[0] for f in papers])

False

In [17]:
len(classifications[:-1])

30787

In [18]:
papers[1]

[16,
 'http:##www.cs.wisc.edu#~fischer#ftp#pub#tech-reports#ncstrl.uwmadison#CS-TR-90-907#CS-TR-90-907.ps.Z']

In [19]:
features_for_id[16]

['author0=dewitt',
 'author1=futtersack',
 'author2=maier',
 'author3=velez',
 'title0=study',
 'title1=three',
 'title2=alternative',
 'title3=workstationserver',
 'title4=architectures',
 'title5=objectoriented',
 'title6=database',
 'title7=systems',
 'publisher=missing',
 'address=brisbaneaustralia',
 'year=1990']

In [20]:
raw_xml_info_for_id[16]

'[DeWitt90] <author> D. DeWitt, P. Futtersack, D. Maier, F. Velez, </author> <title> "A Study of Three Alternative Workstation-Server Architectures for Object-Oriented Database Systems", </title> <booktitle> Proceedings of the 16th International Conferece on Very Large Data Bases, </booktitle> <address> Brisbane, Australia, </address> <month> August, </month> <year> 1990. </year>'

### Re-structuring

In [21]:
papers[2]

[18, 'ftp:##ftp.cs.purdue.edu#pub#hosking#papers#oopsla93.ps.gz']

In [22]:
output_dataset = {}

In [23]:
postscript_for_id = {}
for element in papers:
    postscript_for_id[element[0]] = element[1]

In [24]:
#TODO: because of the above todo to pruned_classes_for_ps there might be some changes here.
#Even if no code, output definitely changes
label_for_id = postscript_for_id
for id_value in postscript_for_id.keys():
    ps_file = postscript_for_id[id_value]
    if ps_file in pruned_classes_for_ps.keys():
        label_for_id[id_value] = pruned_classes_for_ps[ps_file]
    else:
        label_for_id[id_value] = 'NOLABEL'

In [25]:
edges = output_citations

In [26]:
output_dataset['postscript_for_id'] = postscript_for_id
output_dataset['label_for_id'] = label_for_id
output_dataset['edges'] = edges
output_dataset['features_for_id'] = features_for_id

In [27]:
pickle.dump(output_dataset, open(dataset_path + 'restructured_dataset.pkl', 'wb'))

In [28]:
edges[16]

[76972,
 32529,
 10392,
 213125,
 724,
 105808,
 268098,
 412687,
 18208,
 66517,
 10400,
 718421,
 18211,
 718423,
 8517,
 17,
 19,
 25255,
 25256,
 13070,
 18183,
 8981,
 10424,
 34828]