In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import os
import pickle
import sklearn
import sys

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

In [3]:
%aimport data.load_bibsonomy
%aimport features.build_features
%aimport helpers.files,helpers.labels

In [4]:
from data.load_bibsonomy import load_bibtex_and_tags, load_bookmark_and_tags
from helpers.files import get_directory_name_from_hash
from helpers.labels import truncate_labels
from features.build_features import clean_text_bibtex,clean_text_bookmark

In [5]:
pd.options.display.max_columns = 40

In [34]:
ROOT = "/media/felipe/SAMSUNG/ecml-pkdd-2009/train/2009-01-01_cleaned_post-core-2/"
BIBTEX = ROOT+"bibtex-unix"
BOOKMARK = ROOT+"bookmark-unix"
TAS = ROOT+"tas"
INTERIM_DATA_ROOT = "../data/interim/ecml-pkdd-2009/"
MAX_NB_WORDS = 20000
LABELS_MIN_DOC_FRACTION = 0.001

In [35]:
if os.path.isfile(INTERIM_DATA_ROOT+"bibtex_docs_df.p"):
    bibtex_docs_df = pickle.load(open(INTERIM_DATA_ROOT+"bibtex_docs_df.p", "rb"))
else:
    bibtex_docs_df = load_bibtex_and_tags(BIBTEX,TAS)
    pickle.dump(bibtex_docs_df,open(INTERIM_DATA_ROOT+"bibtex_docs_df.p","wb"))

In [36]:
if os.path.isfile(INTERIM_DATA_ROOT+"bookmark_docs_df.p"):
    bookmark_docs_df = pickle.load(open(INTERIM_DATA_ROOT+"bookmark_docs_df.p", "rb"))
else:
    bookmark_docs_df = load_bookmark_and_tags(BOOKMARK,TAS)
    pickle.dump(bookmark_docs_df,open(INTERIM_DATA_ROOT+"bookmark_docs_df.p","wb"))

In [37]:
len(bibtex_docs_df),len(bookmark_docs_df)

(22846, 40291)

## bibtex

In [38]:
bibtex_docs_df.fillna('',inplace=True)

In [39]:
bibtex_docs_df['contents'] = bibtex_docs_df['title']+ ' ' + \
bibtex_docs_df['bibtexAbstract'] +' '+ \
bibtex_docs_df['description'] + ' ' + \
bibtex_docs_df['note'] + ' ' + \
bibtex_docs_df['annote']

In [40]:
tag_sets = bibtex_docs_df["tags"].values

all_tags = set()

for tag_set in tag_sets:
    for tag in tag_set.split(','):
        all_tags.add(tag)

In [41]:
len(all_tags)

5815

In [42]:
min_nb_docs = int(len(bibtex_docs_df)* LABELS_MIN_DOC_FRACTION)
min_nb_docs

22

In [43]:
bibtex_docs_df["tags_split"] = bibtex_docs_df["tags"].map(lambda tagstring: tagstring.split(","))
labels = bibtex_docs_df["tags_split"].values

In [44]:
truncated_labels = truncate_labels(labels,min_nb_docs)

In [45]:
mlb = MultiLabelBinarizer()
binary_label_data = mlb.fit_transform(truncated_labels)

In [46]:
binary_label_data.shape

(22846, 534)

In [47]:
data = bibtex_docs_df["contents"].values

In [48]:
data[334][:1000]

'Cognitive support for ontology modeling Knowledge engineering tools are becoming ever more complex, and therefore increased cognitive support will be necessary to leverage the potential of those tools. Our paper motivates this claim by examining some previous work in this domain and explaining the nature of cognitive support. We discuss some of the problem areas we have encountered in our research. Through user questionnaires and observations carried out at the National Cancer Institute (NCI) and the University of Washington Foundational Model of Anatomy (FMA) Project, we have begun to gain an understanding of the cognitive barriers experienced by the users of knowledge engineering tools. We present some proposed solutions that could address the problems we identified, and in addition, discuss how our own tool, called Jambalaya, could be applied to provide cognitive support. We analyse the support Jambalaya provides using some non-functional design criteria and illustrate some trade-o

In [49]:
pipeline = Pipeline([
    ('vect', CountVectorizer(preprocessor=clean_text_bibtex, max_features=MAX_NB_WORDS, strip_accents='unicode')),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(),n_jobs=-1)),
])

In [50]:
# use later
# parameters = {
#     'vect__preprocessor': clean_text,
#     "vect__max_features": [MAX_NB_WORDS]
# }

In [51]:
scores = cross_val_score(pipeline, data, binary_label_data, cv=5,scoring='f1_micro')

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


In [52]:
scores.mean()

0.24819728525587056

## bookmark

In [56]:
bookmark_docs_df.fillna('',inplace=True)

In [58]:
bookmark_docs_df

Unnamed: 0,content_id,url_hash,url,description,extended_description,date,tags
0,8,7edfc1f9560521e83bcf5a5768889c6c,http://jo.irisson.free.fr/bstdatabase/,LaTeX Bibliography Styles Database :: Search,,2005-12-13 08:42:37,"latex,bibtex,database,style"
1,11,e636edf2736cfc61897bf21039ffea1b,http://acmqueue.com/modules.php?name=Content&p...,Social Bookmarking in the Enterprise,,2005-12-07 09:08:51,"tagging,enterprise,ibm,folksonomy"
2,12,2f87c060c8ada01d4500e8a27749dee8,http://www.cs.stir.ac.uk/~kjt/software/latex/s...,BibTeX Style Examples,,2005-12-06 13:57:37,"latex,bibtex,example"
3,13,bfb258bc024470f88f8d38c2c4d820ab,http://virtual.cvut.cz:8080/ksmsaWeb/browser/t...,The KSMSA Project - Ontology Browser,,2005-12-02 09:46:10,"ontology,sumo,browser"
4,18,0f12bf6a77453df42bd6cf8ccb9ff10a,http://www.cs.utexas.edu/users/mfkb/related.html,KBS/Ontology Projects Worldwide,,2005-11-24 19:09:42,ontology
5,21,f7186c61437b071849077a9d74b3a1d1,http://pier.cs.berkeley.edu/,The PIER Project,,2005-11-22 09:25:15,"p2p,simulator,simulation"
6,23,1c8096ac2d56e1c521b605aa3b9b151f,http://www.last.fm/,Last.fm,,2005-11-18 11:11:35,"tagging,collaborative,folksonomy,music"
7,29,19cfec376f5b0e4ccddfdc802decdbb9,http://www.alphaworks.ibm.com/tech/uima,alphaWorks : Unstructured Information Manageme...,,2005-11-10 11:37:45,"datamining,machinelearning,framework"
8,30,1dd1689b0613587b44aa3422b78b6e16,http://www.research.ibm.com/UIMA/,UIMA,,2005-11-10 11:37:36,"datamining,machinelearning,framework"
9,32,4986c0ef7bf79413faf19f7c9b48eda5,http://www.analytictech.com/ucinet.htm,UCINET 6 Social Network Analysis Software,,2006-09-06 10:34:37,"sna,tool"


In [59]:
bookmark_docs_df['contents'] = bookmark_docs_df['url']+ ' ' + \
bookmark_docs_df['description'] +' '+ \
bookmark_docs_dfbibtex_docs_df['extended_description']

In [60]:
tag_sets = bookmark_docs_df["tags"].values

all_tags = set()

for tag_set in tag_sets:
    for tag in tag_set.split(','):
        all_tags.add(tag)

In [61]:
len(all_tags)

10663

In [64]:
min_nb_docs = int(len(bookmark_docs_df)* LABELS_MIN_DOC_FRACTION)
min_nb_docs

40

In [65]:
bookmark_docs_df["tags_split"] = bookmark_docs_df["tags"].map(lambda tagstring: tagstring.split(","))
labels = bookmark_docs_df["tags_split"].values

In [66]:
truncated_labels = truncate_labels(labels,min_nb_docs)

In [67]:
mlb = MultiLabelBinarizer()
binary_label_data = mlb.fit_transform(truncated_labels)

In [68]:
binary_label_data.shape

(40291, 622)

In [70]:
data = bookmark_docs_df["contents"].values

In [71]:
pipeline = Pipeline([
    ('vect', CountVectorizer(preprocessor=clean_text_bibtex, max_features=MAX_NB_WORDS, strip_accents='unicode')),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(),n_jobs=-1)),
])

In [72]:
scores = cross_val_score(pipeline, data, binary_label_data, cv=5,scoring='f1_micro')

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


In [73]:
scores.mean()

0.28140036541343694