* Goal: Preprocess the dblp.xml.gz dataset downloaded from https://dblp.uni-trier.de/xml/
* Approach: 
    - Unzip file 
    - Convert xml to JSON
    - Convert JSON to pandas
    - Keep only interesting rows and filter missing values
    - Serliaze dataframe for later use

### Open XML file containing the data

In [1]:
import gzip
from IPython.display import display, clear_output


contributions_xml = gzip.open('../00_Data/dblp.xml.gz','r')

### Convert XML file to JSON

In [2]:
types = ['article','inproceedings','www','proceedings','book','incollection','phdthesis','mastersthesis','www']
contributions =  {t:[] for t in types}
begin = False

for key,line in enumerate(contributions_xml):
    line = str(line)
    for t in types:
        if '</' + t in line:
            begin = False
            contributions[t].append(contribution)
            break
    
    if begin:
        if '<' in line:
            vals = line.replace('<','>').split('>')
            vals[1] = vals[1].split(' ')[0]
            if vals[1] in contribution:
                contribution[vals[1]] += ', ' + vals[2]
            else:
                contribution[vals[1]] = vals[2]
            
    for t in types:
        if '<' + t in line:
            begin = True
            contribution = {}
            break

    if key % 50000 == 0 and key != 0:
        clear_output(wait=True)
        print("Imported: {}".format(sum([len(contributions[t]) for t in types])))

Imported: 11616873


### Convert JSON to pandas

In [3]:
import pandas as pd

data = []
for t in types:
    data.extend(contributions[t])

df = pd.DataFrame.from_dict(data)

### Keep only columns ['title','author(s)','year'] and filter data

In [4]:
df = df[['author','title','year']].dropna()

### Serliaze dataframe for later usage

In [5]:
df.to_pickle('../00_Data/dblp.pickle')