# Pre-processing and Statistical Overview of the CORA Dataset

## Preparations

### Parameters

In [1]:
# Directory that holds the raw CORA files. Later cells build file paths by
# plain string concatenation, so the trailing slash must be kept.
dataset_path = "./data/"

### Libraries

In [60]:
import pandas as pd
import numpy as np
import os
import sys
import _pickle as pickle

### Custom Libraries

## Processing

In [5]:
# Load the raw citation lines: surrounding whitespace is stripped and tab
# separators are turned into commas, one string per line of the file.
with open(dataset_path + 'citations', 'r') as fid:
    output = [line.strip().replace('\t', ',') for line in fid]

In [6]:
# Build a mapping from the first id on each citation line to the second id.
# NOTE(review): a dict holds one value per key, so if the same first id appears
# on several lines only the last pair survives -- confirm whether losing the
# earlier pairs is intended before using this as an edge list.
output_citations = {}
for row in output:
    fields = row.split(',')
    second_id = int(fields[1])
    output_citations[int(fields[0])] = second_id

In [10]:
# Display the id -> id citation mapping built in the previous cell.
output_citations

{172005: 11,
 9351: 26,
 116552: 48,
 36213: 59195,
 59444: 89,
 53462: 5903,
 39403: 131,
 129532: 136,
 38318: 113,
 293149: 733703,
 201: 397331,
 1102349: 231,
 1008433: 1008433,
 1152067: 281,
 107173: 293,
 54784: 319,
 343: 1149,
 382: 389,
 404: 462729,
 424: 3254,
 267003: 444,
 65335: 459,
 58454: 476,
 838152: 490,
 309083: 495,
 1102364: 685100,
 113508: 586,
 589: 184063,
 645: 34610,
 57874: 688,
 610702: 9657,
 298870: 745,
 476718: 760,
 1102372: 794,
 23879: 808,
 57932: 815,
 837: 842,
 1102376: 863,
 1102377: 892,
 900: 17848,
 34355: 328498,
 3828: 952,
 6485: 976,
 62474: 34768,
 310929: 1005,
 1102386: 1014,
 1102387: 1017,
 1034: 1041,
 248841: 1044,
 18130: 1054,
 1152068: 1056,
 1067: 1067,
 1068: 241554,
 1102396: 915,
 163235: 1185,
 869983: 134251,
 1089479: 69619,
 1102400: 152226,
 67537: 1265,
 1102402: 72919,
 1102403: 1284,
 1102404: 1291,
 54664: 121067,
 57596: 1322,
 1152069: 1364,
 1102407: 1366,
 1102408: 1370,
 1102409: 1386,
 1402: 11205,
 1416: 

In [12]:
# Report how many distinct keys the citation mapping holds.
print('number of keys: {}'.format(len(output_citations)))

number of keys: 35788


In [14]:
# Report how many distinct values (cited ids) the citation mapping holds.
print(
    'number of unique citeds (values): %d'
    % len(set(output_citations.values()))
)

number of unique citeds (values): 21886


In [47]:
# Load the raw classification lines, stripped and with tabs turned into commas.
with open(dataset_path + 'classifications', 'r') as fid:
    classifications = [line.strip().replace('\t', ',') for line in fid]

In [48]:
# Distinct class labels: the second comma-separated field of every
# classification line except the final list entry.
# NOTE(review): the [:-1] slice presumably skips a trailing line without a
# label -- confirm against the data file.
labels = {entry.split(',')[1] for entry in classifications[:-1]}

In [49]:
# Preview the first five classification entries; after the tab-to-comma
# replacement each one reads "<postscript URL>,<class path>".
classifications[:5]

['http:##www.isi.edu#sims#papers#94-sims-agents.ps,/Information_Retrieval/Retrieval/',
 'http:##www.cis.ohio-state.edu#~ren#tois.ps,/Information_Retrieval/Retrieval/',
 'ftp:##ftp.cs.umass.edu#pub#techrept#techreport#1996#UM-CS-1996-002.ps,/Information_Retrieval/Retrieval/',
 'http:##www.cs.cmu.edu#afs#cs#user#alex#docs#idvl#dl97.ps,/Information_Retrieval/Retrieval/',
 'http:##www.ri.cmu.edu#afs#cs#user#alex#docs#idvl#dl97.ps,/Information_Retrieval/Retrieval/']

In [53]:
# Map each postscript file name (first field) to its class (second field).
# Lines that do not yield at least two comma-separated fields are skipped.
pruned_classes_for_ps = {}
with open(dataset_path + 'classifications', 'r') as fid:
    for line in fid:
        fields = line.strip().replace('\t', ',').split(',')
        if len(fields) < 2:
            continue
        pruned_classes_for_ps[fields[0]] = fields[1]

In [30]:
# Parse the papers file into [integer id, second field] pairs; the second
# field is later used as the postscript file name. Any further
# comma-separated fields on a line are ignored.
papers = []
with open(dataset_path + 'papers', 'r') as fid:
    for line in fid:
        fields = line.strip().replace('\t', ',').split(',')
        papers.append([int(fields[0]), fields[1]])

In [36]:
# Uniqueness check on paper ids: True only when no id occurs more than once.
# TODO: this evaluates to False, i.e. some ids repeat in `papers`. Find out why
# (presumably one id is listed once per postscript URL -- confirm). According
# to the paper, removing duplicates is good enough.
len({record[0] for record in papers}) == len(papers)

False

In [23]:
# Number of classification entries, leaving out the final list entry.
len(classifications[:-1])

30787

In [29]:
# Peek at the first paper record.
# NOTE(review): the output recorded beneath this cell is a single comma-joined
# string, while `papers` is built as [id, postscript] lists -- presumably the
# output predates the current parsing cell; re-run to refresh it.
papers[0]

'2,http:##dimacs.rutgers.edu#techps#1994#94-07.ps,[Gar] <author> M.R. Garey & D.S. Johnson, </author> <title> Computers and Intractibility: A Guide to the Theory of NP-Completeness, W.H. </title> <publisher> Freeman, </publisher> <address> New York, </address> <year> 1979. </year>'

### Re-structuring

In [37]:
# Container for everything that gets serialised at the end of the notebook.
output_dataset = dict()

In [46]:
# Index postscript file names by paper id. When an id occurs more than once in
# `papers`, the entry that comes last wins.
postscript_for_id = {record[0]: record[1] for record in papers}

In [55]:
# Map every paper id to the class of its postscript file, falling back to
# 'NOLABEL' when the file has no entry in `pruned_classes_for_ps`.
#
# A *new* dict is built on purpose. Binding `label_for_id` to
# `postscript_for_id` would make both names refer to the same object, so
# writing labels would overwrite the postscript file names (and a second run
# of this cell would then turn every label into 'NOLABEL').
label_for_id = {
    id_value: pruned_classes_for_ps.get(ps_file, 'NOLABEL')
    for id_value, ps_file in postscript_for_id.items()
}

In [57]:
# Expose the citation mapping under the name used in the output dataset.
# This is an alias, not a copy: `edges` and `output_citations` are the same dict.
edges = output_citations

In [58]:
# Collect the three restructured mappings under their output keys.
output_dataset.update({
    'postscript_for_id': postscript_for_id,
    'label_for_id': label_for_id,
    'edges': edges,
})

In [62]:
# Serialise the restructured dataset next to the raw files. The `with` block
# guarantees the file is flushed and closed even if pickling raises.
with open(dataset_path + 'restructured_dataset.pkl', 'wb') as pkl_file:
    pickle.dump(output_dataset, pkl_file)