# Pre-processing and Statistical Overview of CORA Dataset

## Preparations

### Parameters

In [None]:
# Directory that holds the raw CORA files read below ('citations',
# 'classifications', 'papers') and that receives the pickled output.
# File names are appended with plain string concatenation, so the value
# must end with a path separator.
dataset_path = './data/'

### Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import sys
import _pickle as pickle
import pdb
import re

# Patterns for the pseudo-XML markers in the third column of the 'papers' file.
# Raw strings are used so that `\w` reaches the regex engine untouched; in a
# normal string literal it is an invalid escape sequence, which newer Python
# versions warn about.
# Both patterns are applied with .match(), i.e. anchored at the token start,
# and capture the tag name in group 1.
xml_begin = re.compile(r'<(\w+)>')   # opening marker, e.g. <author>
xml_end = re.compile(r'</(\w+)>')    # closing marker, e.g. </author>

### Custom Libraries

## Processing

### Building a table of output citations

In [None]:
# Collect every line of the 'citations' file with surrounding whitespace
# removed and tab separators turned into commas (CSV-like rows).
output = []
with open(dataset_path + 'citations', 'r') as fid:
    output.extend(row.strip().replace('\t', ',') for row in fid)

In [None]:
# Adjacency table built from the 'citations' file: the integer in the first
# column is the key, and the integers from the second column are accumulated
# (in file order, duplicates kept) in the list stored under that key.
output_citations = {}
with open(dataset_path + 'citations', 'r') as fid:
    for row in fid:
        columns = row.strip().split()
        first_id, second_id = int(columns[0]), int(columns[1])
        output_citations.setdefault(first_id, []).append(second_id)

To get the IDs of all documents cited by the document with ID 16, run the following command:

In [None]:
# Notebook display: the list of integer IDs stored under key 16
# (raises KeyError if 16 never appeared in the first column).
output_citations[16] 

In [None]:
# Number of distinct first-column IDs in the citation table.
print('number of keys: {}'.format(len(output_citations)))

### Building a table of PostScript file (ps) -> class

In [None]:
# Lookup table built from the 'classifications' file: first whitespace-separated
# field -> second field. Blank lines and lines whose first field is the literal
# "keywords" are skipped; a later line with the same first field overwrites an
# earlier one.
classifications = {}

with open(dataset_path + 'classifications', 'r') as fid:
    for row in fid:
        # str.split() with no argument already ignores leading/trailing whitespace.
        fields = row.split()
        if not fields or fields[0] == "keywords":
            continue
        classifications[fields[0]] = fields[1]

In [None]:
# Distinct class values present in the classifications table.
labels = set(classifications.values())
print(labels)
# Spot check: class recorded for one specific entry (KeyError if it is absent).
print(classifications['http:##www.isi.edu#sims#papers#94-sims-agents.ps'])

### Functions for processing XML info

In [None]:
def process_xml_info_in_papers_file(xml_info):
    """Group the tokens of a pseudo-XML string by the tag that encloses them.

    The string is split on whitespace and scanned token by token:

    * a token that starts with an opening marker (``<tag>``) makes ``tag`` the
      current tag and (re)initialises its token list to empty;
    * a token that starts with a closing marker (``</tag>``) must name the
      current tag and switches back to the fallback tag ``'missing'``;
    * any other token has all non-word characters removed, is lower-cased and,
      if anything is left, is appended to the current tag's list.

    Because markers are matched at the start of a token only, the whole token
    is consumed as a marker even if more text follows the ``>``.

    Args:
        xml_info: Text containing whitespace-separated words and tag markers.

    Returns:
        dict mapping tag name -> list of normalised tokens. The key
        ``'missing'`` is always present and collects tokens found outside any
        tag. If an ``'address'`` entry exists, its tokens are concatenated
        into a single string held in a one-element list.

    Raises:
        AssertionError: if a closing marker does not match the current tag.
    """
    fallback_tag = 'missing'
    fields = {fallback_tag: []}
    current_tag = fallback_tag

    for raw_token in xml_info.split():
        opening = xml_begin.match(raw_token)
        if opening:
            current_tag = opening.group(1)
            fields[current_tag] = []
            continue

        closing = xml_end.match(raw_token)
        if closing:
            closed_tag = closing.group(1)
            assert closed_tag == current_tag, 'Bad XML nesting: %s != %s' % (closed_tag, current_tag)
            current_tag = fallback_tag
            continue

        word = re.sub(r'\W+', '', raw_token).lower()
        if word:
            fields[current_tag].append(word)

    # An address is treated as one opaque value rather than a bag of words.
    if 'address' in fields:
        fields['address'] = [''.join(fields['address'])]
    return fields

In [None]:
def _tokens_longer_than(tokens, min_length):
    """Return a new list with the tokens whose length exceeds ``min_length``."""
    return [token for token in tokens if len(token) > min_length]


def _positional_features(prefix, values, minimum_slots):
    """Build ``<prefix><i>=<value>`` strings, padded with ``missing`` to ``minimum_slots``."""
    padding = ['missing'] * max(0, minimum_slots - len(values))
    return [
        '{}{}={}'.format(prefix, position, value)
        for position, value in enumerate(list(values) + padding)
    ]


def _first_value_feature(name, input_information):
    """Build ``<name>=<first value>``, or ``<name>=missing`` if absent or empty."""
    values = input_information.get(name)
    return '{}={}'.format(name, values[0] if values else 'missing')


def extract_paper_features_from_processed_xml_info(input_information):
    """Turn a tag -> tokens mapping into a flat list of ``name=value`` strings.

    The features are emitted in this order:

    1. ``author<i>=...`` for every author token longer than 2 characters,
       padded with ``author<i>=missing`` so that at least 3 are produced;
    2. ``title<i>=...`` for every title token longer than 3 characters,
       padded with ``title<i>=missing`` so that at least 6 are produced;
    3. ``publisher=...``, ``address=...`` and ``year=...`` using the first
       token of the corresponding entry, or ``missing``.

    Differences from the previous implementation: the input mapping is no
    longer modified (the filtered author/title lists used to be written back
    into it), a mapping without a ``'title'`` entry yields only padding
    instead of raising ``KeyError``, and an empty ``'publisher'``/``'year'``
    list yields ``missing`` instead of raising ``IndexError``.

    Args:
        input_information: dict mapping tag name -> list of string tokens.

    Returns:
        list of feature strings; always at least 12 entries long.
    """
    authors = _tokens_longer_than(input_information.get('author', []), 2)
    title_words = _tokens_longer_than(input_information.get('title', []), 3)

    features = _positional_features('author', authors, 3)
    features.extend(_positional_features('title', title_words, 6))
    for field_name in ('publisher', 'address', 'year'):
        features.append(_first_value_feature(field_name, input_information))
    return features

### Build a table for id -> features, id -> ps

In [None]:
# Three tables keyed by the integer ID in the first tab-separated column of
# the 'papers' file:
#   papers              -> list of second-column values, one per line with that ID
#   raw_xml_info_for_id -> third column of the FIRST line seen for that ID
#   features_for_id     -> feature strings derived from that third column
# The last two are only filled when the first line for an ID has exactly
# three columns; repeated IDs only extend the `papers` list.
papers = {}
features_for_id = {}
raw_xml_info_for_id = {}
with open(dataset_path + 'papers', 'r') as fid:
    for row in fid:
        columns = row.strip().split('\t')
        paper_id = int(columns[0])

        if paper_id in papers:
            papers[paper_id].append(columns[1])
            continue

        papers[paper_id] = [columns[1]]
        if len(columns) != 3:
            continue

        raw_xml_info_for_id[paper_id] = columns[2]
        parsed_info = process_xml_info_in_papers_file(columns[2])
        features_for_id[paper_id] = extract_paper_features_from_processed_xml_info(parsed_info)

In [None]:
# Notebook display: all second-column values recorded for ID 18.
papers[18]

In [None]:
# Notebook display: the feature strings extracted for ID 18.
features_for_id[18]

In [None]:
# Notebook display: the unprocessed third column stored for ID 18.
raw_xml_info_for_id[18]

### Re-structuring

In [None]:
# Container for the restructured tables; it is filled in and pickled by the
# cells that follow.
output_dataset = {}

In [None]:
# Assign one label per paper ID: the class of the first entry in papers[pid]
# that appears in `classifications`, or 'NOLABEL' when none of them does.
label_for_id = {}

for pid, candidate_files in papers.items():
    label_for_id[pid] = next(
        (classifications[name] for name in candidate_files if name in classifications),
        'NOLABEL',
    )

In [None]:
# Bundle the four tables under the keys used in the pickled file.
output_dataset.update(
    postscript_for_id=papers,
    label_for_id=label_for_id,
    edges=output_citations,
    features_for_id=features_for_id,
)

In [None]:
# Persist the restructured tables. The context manager closes (and therefore
# flushes) the file deterministically, even if pickling fails part-way,
# instead of relying on garbage collection of an anonymous file object.
with open(dataset_path + 'restructured_dataset_v2.pkl', 'wb') as pickle_file:
    pickle.dump(output_dataset, pickle_file)

In [None]:
# Report the number of entries in each table, one count per line.
for table_name in ('postscript_for_id', 'label_for_id', 'edges', 'features_for_id'):
    print(len(output_dataset[table_name]))

In [None]:
# Show what each table holds for ID 18, one table per line
# (a KeyError is raised at the first table that lacks this ID).
for table_name in ('postscript_for_id', 'label_for_id', 'edges', 'features_for_id'):
    print(output_dataset[table_name][18])