# Pre-processing and Statistical Overview of the CORA Dataset

## Preparations

### Parameters

In [None]:
# Directory holding the raw dataset files. Later cells build file paths by plain
# string concatenation (dataset_path + 'citations', ...), so keep the trailing slash.
dataset_path = './data/'

### Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import sys
import _pickle as pickle
import pdb
import re

# Patterns recognising an opening tag (<name>) and a closing tag (</name>) at the
# start of a token; group 1 captures the tag name.  Raw strings are used so that
# `\w` reaches the regex engine verbatim instead of being an invalid string escape.
xml_begin = re.compile(r'<(\w+)>')
xml_end = re.compile(r'</(\w+)>')

### Custom Libraries

## Processing

In [None]:
# Read the 'citations' file; every row is stripped and its tabs are turned into
# commas, giving one comma-separated string per row.
output = []
with open(dataset_path + 'citations', 'r') as fid:
    output.extend(row.strip().replace('\t', ',') for row in fid.readlines())

In [None]:
# Group the rows by their first column: {int(first column): [int(second column), ...]}.
output_citations = {}
for element in output:
    columns = element.split(',')
    key_id = int(columns[0])
    value_id = int(columns[1])
    output_citations.setdefault(key_id, []).append(value_id)

In [None]:
# Number of distinct first-column ids found in the citations file.
print('number of keys: {}'.format(len(output_citations)))

In [None]:
# How many distinct paper ids appear among the values, i.e. how many papers
# are cited by at least one paper.
unique_cited_ids = {item for sublist in output_citations.values() for item in sublist}
print('number of unique citeds (values): %d' % len(unique_cited_ids))

In [None]:
# Read the 'classifications' file; every row is stripped and its tabs are turned
# into commas, giving one comma-separated string per row.
classifications = []
with open(dataset_path + 'classifications', 'r') as fid:
    classifications.extend(row.strip().replace('\t', ',') for row in fid.readlines())

In [None]:
# Distinct values found in the second comma-separated field of every row except
# the last list entry.
# NOTE(review): tabs were replaced by commas when `classifications` was built, so
# a first column that itself contains commas makes field [1] ambiguous -- confirm.
labels = {k.split(',')[1] for k in classifications[:-1]}

In [None]:
# Preview the first five entries of the classification list (each entry is one
# row of the file with tabs replaced by commas), including the classes.
classifications[:5]

In [None]:
# Map the first tab-separated column of each 'classifications' row to its second
# column; rows with fewer than two columns are ignored.
pruned_classes_for_ps = {}
with open(dataset_path + 'classifications', 'r') as fid:
    for row in fid:
        columns = row.strip().split('\t')  # FIXME: flagged as "BAD" by the original author
        if len(columns) <= 1:
            continue
        pruned_classes_for_ps[columns[0]] = columns[1]

# Known issue recorded by the original author: this lookup outputs mscs.ps.Z, which is wrong.
pruned_classes_for_ps['ftp:##ftp.eecs.umich.edu#people#fessler#ps#93,isit,hero.ps.Z']

### Processing xml info

In [None]:
# Opening (<tag>) and closing (</tag>) markers, matched at the start of a token.
# Compiled next to the function so it does not rely on globals created in
# another notebook cell.
_XML_TAG_BEGIN = re.compile(r'<(\w+)>')
_XML_TAG_END = re.compile(r'</(\w+)>')


def process_xml_info_in_papers_file(xml_info):
    """Group the words of a tagged citation string by their enclosing tag.

    The string is split on whitespace.  A token starting with ``<tag>`` opens a
    section, a token starting with ``</tag>`` closes it, and every other token
    is stripped of non-word characters, lower-cased and appended to the list of
    the currently open tag.  Words outside any tag are collected under the key
    ``'missing'``, which is always present in the result.

    Args:
        xml_info: The tagged text, e.g. ``'<title> Some Title </title>'``.

    Returns:
        dict mapping tag name -> list of normalised words.  When an
        ``'address'`` entry exists, its words are concatenated into a single
        one-element list.

    Raises:
        AssertionError: If a closing tag does not match the open tag.
    """
    missing = 'missing'
    output = {missing: []}
    tag = missing
    for token in xml_info.split():
        tag_begin = _XML_TAG_BEGIN.match(token)
        tag_end = _XML_TAG_END.match(token)

        if tag_begin:
            tag = tag_begin.group(1)
            # setdefault (rather than assigning a fresh list) keeps the words
            # already collected if the same tag appears more than once.
            output.setdefault(tag, [])
        elif tag_end:
            assert tag_end.group(1) == tag, 'Bad XML nesting: %s != %s' % (tag_end.group(1), tag)
            tag = missing
        else:
            word = re.sub(r'\W+', '', token).lower()
            if word:
                output[tag].append(word)

    # The address is used as one opaque value, so its words are glued together.
    if 'address' in output:
        output['address'] = [''.join(output['address'])]
    return output

In [None]:
def _padded_features(prefix, tokens, minimum):
    """Return '<prefix><i>=<token>' strings, padded with 'missing' up to `minimum` entries."""
    padded = list(tokens) + ['missing'] * (minimum - len(tokens))
    return ['{}{}={}'.format(prefix, i, token) for i, token in enumerate(padded)]


def extract_paper_features_from_processed_xml_info(input_information):
    """Turn a tag -> words mapping into a flat list of 'name=value' feature strings.

    The output contains, in this order:
      * ``author<i>=...`` for every author word longer than 2 characters,
        padded with ``missing`` so that at least 3 entries are emitted;
      * ``title<i>=...`` for every title word longer than 3 characters,
        padded with ``missing`` so that at least 6 entries are emitted;
      * ``publisher=...``, ``address=...`` and ``year=...`` using the first
        word of the respective entry, or ``missing`` when the entry is absent
        or empty.

    Args:
        input_information: dict mapping tag name -> list of words.  It is not
            modified.

    Returns:
        list of feature strings.
    """
    # Filter into local lists instead of writing back into the caller's dict;
    # an absent 'author' or 'title' entry is treated as having no words.
    authors = [word for word in input_information.get('author', []) if len(word) > 2]
    title_words = [word for word in input_information.get('title', []) if len(word) > 3]

    features = _padded_features('author', authors, 3)
    features.extend(_padded_features('title', title_words, 6))

    for field in ('publisher', 'address', 'year'):
        values = input_information.get(field)
        # An entry that exists but holds no words is reported as missing rather
        # than failing on values[0].
        features.append('{}={}'.format(field, values[0] if values else 'missing'))
    return features

In [None]:
# Parse the tab-separated 'papers' file.
#   papers              -> [int id, second column] for every row
#   raw_xml_info_for_id -> id -> third column, only for rows with exactly 3 columns
#   features_for_id     -> id -> feature strings extracted from that third column
papers = []
features_for_id = {}
raw_xml_info_for_id = {}
with open(dataset_path + 'papers', 'r') as fid:
    for row in fid:
        elements = row.strip().split('\t')
        paper_id = int(elements[0])
        papers.append([paper_id, elements[1]])
        if len(elements) != 3:
            continue
        xml_text = elements[2]
        raw_xml_info_for_id[paper_id] = xml_text
        parsed_info = process_xml_info_in_papers_file(xml_text)
        features_for_id[paper_id] = extract_paper_features_from_processed_xml_info(parsed_info)

In [None]:
# Debugging check: are the ids in `papers` unique?
# TODO: the original author reports that this comes out False and asks why
# (according to the paper, removing duplicates is good enough).
# Reviewer reply recorded here: `papers` does contain duplicates (see papers[:5])
# and they have not been removed yet.
# Author's note: this document is not presentation-ready; these are debugging cells.
len({f[0] for f in papers}) == len(papers)

In [None]:
# Number of classification rows, leaving out the final list entry.
len(classifications[:-1])

In [None]:
# Inspect the second parsed row of the papers file: [int id, second column].
papers[1]

In [None]:
# Inspect the feature strings extracted for paper id 16.
features_for_id[16]

In [None]:
# Inspect the raw third column (tagged text) stored for paper id 16.
raw_xml_info_for_id[16]

### Re-structuring

In [None]:
# Inspect the third parsed row of the papers file: [int id, second column].
papers[2]

In [None]:
# Container for the restructured dataset; it is filled by later cells and then pickled.
output_dataset = {}

In [None]:
# id -> second column of the papers file; for a repeated id the last row wins.
postscript_for_id = {paper_id: ps_name for paper_id, ps_name in papers}

In [None]:
#TODO: because of the above todo to pruned_classes_for_ps there might be some changes here.
#Even if no code, output definitely changes

# id -> class label, looked up through the paper's postscript name; ids whose
# postscript name has no entry in pruned_classes_for_ps get 'NOLABEL'.
# A new dict is built on purpose: binding label_for_id to postscript_for_id itself
# would make both names refer to one dict, and writing the labels would then
# overwrite the postscript names.
label_for_id = {
    id_value: pruned_classes_for_ps.get(ps_file, 'NOLABEL')
    for id_value, ps_file in postscript_for_id.items()
}

In [None]:
# `edges` is another name for the same dict object as output_citations (no copy is made).
edges = output_citations

In [None]:
# Collect the four mappings under their names in the dataset container.
output_dataset.update(
    postscript_for_id=postscript_for_id,
    label_for_id=label_for_id,
    edges=edges,
    features_for_id=features_for_id,
)

In [None]:
# Persist the restructured dataset.  The context manager closes the file handle,
# so the pickle is fully flushed to disk even if a later cell fails.
with open(dataset_path + 'restructured_dataset.pkl', 'wb') as pkl_file:
    pickle.dump(output_dataset, pkl_file)

In [None]:
# Inspect the list of ids recorded against id 16 in the citations file.
edges[16]

In [None]:
# Report how many entries each part of the dataset holds.
for part_name in ('postscript_for_id', 'label_for_id', 'edges', 'features_for_id'):
    print(len(output_dataset[part_name]))

In [None]:
# Show what every part of the dataset stores for id 18.
for part_name in ('postscript_for_id', 'label_for_id', 'edges', 'features_for_id'):
    print(output_dataset[part_name][18])