In [1]:
import gc
import os
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm import *

%matplotlib inline

In [2]:
# Show up to 40 columns when a DataFrame is rendered.
pd.set_option("display.max_columns", 40)

In [3]:
# Directory holding the cleaned dump files. It can be overridden with the
# ECML_PKDD_ROOT environment variable so the notebook is not tied to one
# machine; the original location remains the default.
# os.path.join(..., "") guarantees a trailing separator, because later cells
# build file paths with plain string concatenation (ROOT + "file-name").
ROOT = os.path.join(
    os.environ.get(
        "ECML_PKDD_ROOT",
        "/media/felipe/SAMSUNG/ecml-pkdd-2009/train/2009-01-01_cleaned_post-core-2/",
    ),
    "",
)

Command used to clean the bibtex file and make it amenable to Unix-style parsing:
    
    cat bibtex | sed 's/\\n / /g' | tr -d '\r' | sed -e '1h;2,$H;$!d;g' -e 's/\\\n/ /g' | tr -s " " > bibtex-unix

## bibtex

In [4]:
# Load the cleaned, tab-separated bibtex dump into a DataFrame.
# Column names are supplied explicitly; with `names=` and no `header=`,
# pandas treats the first line of the file as data.
bib_path = ROOT + "bibtex-unix"
bibtex_df = pd.read_csv(
    bib_path,
    sep='\t',
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # `on_bad_lines='warn'` is the equivalent: malformed rows are skipped and
    # a warning is emitted for each of them.
    on_bad_lines='warn',
    names=[
        'content_id', 'journal', 'volume', 'chapter', 'edition', 'month', 'day',
        'booktitle', 'howPublished', 'institution', 'organization', 'publisher',
        'address', 'school', 'series', 'bibtexKey', 'url', 'type', 'description',
        'annote', 'note', 'pages', 'bKey', 'number', 'crossRef', 'misc',
        'bibtexAbstract', 'hash0', 'hash1', 'hash2', 'entrytype', 'title',
        'author', 'editor', 'year',
    ],
)

In [5]:
# Discard the hash columns and the fine-grained citation-detail columns.
# Rebinding the name (instead of `inplace=True`) together with
# `errors='ignore'` makes this cell idempotent: re-running it no longer
# raises KeyError because the columns were already removed.
bibtex_df = bibtex_df.drop(
    columns=[
        'hash0', 'hash1', 'hash2',
        'number', 'pages', 'volume', 'chapter', 'edition', 'month', 'day',
    ],
    errors='ignore',
)

In [6]:
# Preview the first five rows of the bibtex frame.
bibtex_df.head(n=5)

Unnamed: 0,content_id,journal,booktitle,howPublished,institution,organization,publisher,address,school,series,bibtexKey,url,type,description,annote,note,bKey,crossRef,misc,bibtexAbstract,entrytype,title,author,editor,year
0,687925,\N,\N,\N,\N,\N,American Mathematical Society,"Providence, Rhode Island",\N,American Mathematical Society Colloqium Publi,birkhoff40,\N,\N,references from my diploma thesis,\N,\N,\N,\N,\N,\N,book,Lattice Theory,Garrett Birkhoff,\N,1967
1,687933,Matematiceskij Sbornik,\N,\N,\N,\N,\N,\N,\N,\N,zaretski63,\N,\N,references from my diploma thesis,\N,(russisch),\N,\N,\N,\N,article,Die {H}albgruppe der binären {R}elationen,K. A. Zarecki,\N,1963
2,688157,\N,\N,\N,\N,\N,\N,\N,\N,\N,haveliwala03second,citeseer.ist.psu.edu/haveliwala03second.html,\N,kdd,\N,\N,\N,\N,text = {T. H. Haveliwala and S. D. Kamvar. The...,\N,misc,The second eigenvalue of the Google matrix,T. Haveliwala and S. Kamvar,\N,2003
3,688158,Computer Networks and ISDN Systems,\N,\N,\N,\N,\N,\N,\N,\N,brin98anatomy,\N,\N,kdd,\N,\N,\N,\N,doi = {10.1016/S0169-7552(98)00110-X},\N,article,{T}he {A}natomy of a {L}arge-{S}cale {H}yperte...,Sergey Brin and Lawrence Page,\N,1998
4,688159,IEEE Intelligent Systems,\N,\N,\N,\N,\N,\N,\N,\N,HAlani2003,\N,\N,kdd,\N,\N,\N,\N,doi = {10.1016/S0169-7552(98)00110-X},\N,article,{I}dentifying {C}ommunities of {P}ractice thro...,Harith Alani and Srinandan Dasmahapatra and Ki...,\N,2003


In [7]:
# Number of non-missing (non-NaN) entries in each column.
bibtex_df.notna().sum()

content_id        22846
journal           22788
booktitle         22773
howPublished      22838
institution       22845
organization      22845
publisher         22765
address           22831
school            22814
series            22795
bibtexKey         22844
url               22817
type              22846
description        9624
annote            22549
note              22811
bKey              22845
crossRef          22846
misc              13878
bibtexAbstract    22729
entrytype         22844
title             22844
author            22844
editor            22844
year              22844
dtype: int64

In [8]:
# Inspect the dtype pandas inferred for each column.
bibtex_df.dtypes

content_id         int64
journal           object
booktitle         object
howPublished      object
institution       object
organization      object
publisher         object
address           object
school            object
series            object
bibtexKey         object
url               object
type              object
description       object
annote            object
note              object
bKey              object
crossRef          object
misc              object
bibtexAbstract    object
entrytype         object
title             object
author            object
editor            object
year              object
dtype: object

In [9]:
# Summary statistics (count / unique / top / freq) restricted to the
# object-dtype columns.
bibtex_df.describe(include=[object])

Unnamed: 0,journal,booktitle,howPublished,institution,organization,publisher,address,school,series,bibtexKey,url,type,description,annote,note,bKey,crossRef,misc,bibtexAbstract,entrytype,title,author,editor,year
count,22788,22773,22838,22845,22845,22765,22831,22814,22795,22844,22817,22846,9624,22549,22811,22845,22846,13878,22729,22844,22844,22844,22844,22844
unique,1823,3347,402,277,82,1145,861,217,193,12311,5721,237,1733,153,468,230,240,7569,3246,776,10222,8957,2181,707
top,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N,\N,idsia,\N,\N,\N,\N,\N,\N,article,inproceedings,\N,\N,2006
freq,14899,14255,21986,22016,22655,13810,17312,22322,21514,370,12353,22344,2030,22256,21782,22522,22265,636,16479,8040,232,465,17917,2726


In [10]:
# Boolean mask with the same shape as bibtex_df: True wherever a cell holds
# the literal string '\N', False everywhere else.
# NOTE(review): '\N' is presumably the NULL marker of the database dump - confirm.
bools = bibtex_df.isin(['\\N'])

In [11]:
# Invert the mask: True wherever `bools` is False.
notbools = ~bools

In [12]:
# Repeat the object-column summary with every cell where `notbools` is False
# replaced by NaN, so those cells are excluded from count / unique / top / freq.
bibtex_df.where(notbools).describe(include=['object'])

Unnamed: 0,journal,booktitle,howPublished,institution,organization,publisher,address,school,series,bibtexKey,url,type,description,annote,note,bKey,crossRef,misc,bibtexAbstract,entrytype,title,author,editor,year
count,7889,8518,852,829,190,8955,5519,492,1281,22474,10464,502,8640,293,1029,323,581,13242,6250,22826,22837,22379,4927,22279
unique,1822,3346,401,276,81,1144,860,216,192,12310,5720,236,1732,152,467,229,239,7568,3245,775,10221,8956,2180,706
top,Neural Computation,WWW '06: Proceedings of the 15th international...,Paperback,IDSIA,{N}ature {P}ublishing {G}roup,ACM Press,"New York, NY, USA","New York, NY, USA",Lecture Notes in Computer Science,evr,http://arxiv.org/abs/cs.DL/0508082,http://semwiki.org/semwiki2006,idsia,"All references containing the term ""wiki"" from...",{\\em Togelius is working at IDSIA on SNF gran...,atick,http://data.semanticweb.org/conference/iswc-as...,comment = {alpha},"owner = {voelkel}, timestamp = {2006.06.14}",article,inproceedings,J. Schmidhuber,\,2006
freq,164,76,86,74,14,1113,1404,31,397,251,39,53,2030,42,38,6,45,165,49,8040,232,222,72,2726


## bookmark

Command used to clean the bookmark file and make it amenable to Unix-style parsing:
    
    cat bookmark | sed 's/\\n / /g' | tr -d '\r' | sed -e '1h;2,$H;$!d;g' -e 's/\\\n/ /g' | sed 's/\xe2\x80\x9c/"/g' |  tr -s " " > bookmark-unix

In [35]:
# Load the cleaned, tab-separated bookmark dump into a DataFrame.
# Column names are supplied explicitly; with `names=` and no `header=`,
# pandas treats the first line of the file as data.
bookmark_path = ROOT + "bookmark-unix"
bookmark_df = pd.read_csv(
    bookmark_path,
    sep='\t',
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # `on_bad_lines='warn'` is the equivalent: malformed rows are skipped and
    # a warning is emitted for each of them.
    on_bad_lines='warn',
    names=[
        'content_id', 'url_hash', 'url',
        'description', 'extended_description', 'date',
    ],
)

In [37]:
# Preview the first five rows of the bookmark frame.
bookmark_df.head(n=5)

Unnamed: 0,content_id,url_hash,url,description,extended_description,date
0,8,7edfc1f9560521e83bcf5a5768889c6c,http://jo.irisson.free.fr/bstdatabase/,LaTeX Bibliography Styles Database :: Search,,2005-12-13 08:42:37
1,11,e636edf2736cfc61897bf21039ffea1b,http://acmqueue.com/modules.php?name=Content&p...,Social Bookmarking in the Enterprise,,2005-12-07 09:08:51
2,12,2f87c060c8ada01d4500e8a27749dee8,http://www.cs.stir.ac.uk/~kjt/software/latex/s...,BibTeX Style Examples,,2005-12-06 13:57:37
3,13,bfb258bc024470f88f8d38c2c4d820ab,http://virtual.cvut.cz:8080/ksmsaWeb/browser/t...,The KSMSA Project - Ontology Browser,,2005-12-02 09:46:10
4,18,0f12bf6a77453df42bd6cf8ccb9ff10a,http://www.cs.utexas.edu/users/mfkb/related.html,KBS/Ontology Projects Worldwide,,2005-11-24 19:09:42
