In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import os
import pickle
import sklearn
import sys

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [2]:
# Make the ``src`` directory that sits next to the current working directory
# importable, so that packages located there can be imported by later cells.
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [15]:
%aimport data.load_delicious
%aimport features.build_features
%aimport helpers.files,helpers.labels

In [16]:
from data.load_delicious import load_taginfo_into_dataframe
from helpers.files import get_directory_name_from_hash
from helpers.labels import truncate_labels
from features.build_features import clean_text

In [24]:
# Root directory of the raw dataset. The original machine-specific location is
# kept as the default; set the DELICIOUS_ROOT environment variable to point the
# notebook at another copy. Joining with "" guarantees the trailing separator
# that the string concatenations built on ROOT rely on.
ROOT = os.path.join(
    os.environ.get("DELICIOUS_ROOT", "/media/felipe/SAMSUNG/delicious/delicioust140/"),
    "",
)
# Tag-info XML file; passed to load_taginfo_into_dataframe.
TAGINFO = ROOT+"taginfo.xml"
# Directory holding the pickled intermediate DataFrames.
INTERIM_DATA_ROOT = "../data/interim/"
# Passed as max_features to CountVectorizer.
MAX_NB_WORDS = 2000
# Fraction of the sampled documents used to derive min_nb_docs for label truncation.
LABELS_MIN_DOC_FRACTION = 0.01

In [6]:
# Load the document-metadata frame from the interim cache when it exists;
# otherwise parse the tag-info XML and write the cache for the next run.
docs_df_cache = INTERIM_DATA_ROOT + "docs_df.p"

if os.path.isfile(docs_df_cache):
    # Open the very path that was checked (and that the else-branch writes),
    # not a bare "docs_df.p" relative to the working directory.
    # NOTE: pickle.load can execute arbitrary code -- only load caches you
    # created yourself.
    with open(docs_df_cache, "rb") as cache_file:
        docs_df = pickle.load(cache_file)
else:
    docs_df = load_taginfo_into_dataframe(TAGINFO)
    # The context manager ensures the file is flushed and closed.
    with open(docs_df_cache, "wb") as cache_file:
        pickle.dump(docs_df, cache_file)

In [8]:
# Number of rows in docs_df (one row per document); shown as the cell output.
num_documents = len(docs_df)
num_documents

143716

In [9]:
# Preview the first 10 rows of the document-metadata frame.
docs_df.head(10)

Unnamed: 0,filename,filetype,hash,tags,url,num_users,num_tags
0,66fa11b33b6ac183314892703c20fa47.html,html,66fa11b33b6ac183314892703c20fa47,"python,programming,standards,style,coding,refe...",http://lists.osafoundation.org/pipermail/dev/2...,27,8
1,21899d001299ceadc852ed22e1b2b725.html,html,21899d001299ceadc852ed22e1b2b725,"compiler,programming,llvm,vm,development,compi...",http://llvm.org/,830,25
2,bd7c9734cd1a5c8a55328a1a9ce4d4d2.html,html,bd7c9734cd1a5c8a55328a1a9ce4d4d2,"linux,software,opensource,ubuntu,windows,alter...",http://linuxappfinder.com/alternatives,47,19
3,ff186471d34e1440845b80d61733f8ef.html,html,ff186471d34e1440845b80d61733f8ef,"tomboy,gnome,linux,software,wiki,notes",http://live.gnome.org/Tomboy,7,6
4,cc7afd7b1b9e0c29ba72978b5edd8ed5.html,html,cc7afd7b1b9e0c29ba72978b5edd8ed5,"blogs,teaching,web2.0",http://lisahistory.net/wordpress/,7,3
5,e9466d13558200b7b084fae5d0d81b57.html,html,e9466d13558200b7b084fae5d0d81b57,"books,literature,search,recommendations,refere...",http://literature-map.com/,373,25
6,2e14957206bad2bdf5fb29564c6c863c.html,html,2e14957206bad2bdf5fb29564c6c863c,"deviantart,photography,inspiration",http://littlemewhatever.deviantart.com/,15,3
7,e84764f03cc25dcf50207bfa534a9e84.html,html,e84764f03cc25dcf50207bfa534a9e84,"phone,mobile,cellphone,sprint,cell,apps,treo,t...",http://livemobile.blogspot.com/,55,13
8,a6f8b9ccfe1219ab79be4001c08b31ab.html,html,a6f8b9ccfe1219ab79be4001c08b31ab,"video,videos,bizarre,weird,movies,lists,cool,s...",http://listverse.com/bizarre/top-10-most-bizar...,64,15
9,72c4c07a25937957f599c7524f94cd75.html,html,72c4c07a25937957f599c7524f94cd75,"movies,film,cool,lists",http://listverse.com/entertainment/top-15-indi...,8,4


In [10]:
# TODO optimize this because currently this does one I/O OP per loop
def load_contents(hash):
    """Return the text of the HTML document identified by ``hash``.

    The file is looked up under ``ROOT + "fdocuments/"``, inside the
    subdirectory named by ``get_directory_name_from_hash(hash)``, and is
    called ``<hash>.html``. It is decoded as UTF-8 with ``errors='ignore'``,
    so bytes that cannot be decoded are dropped rather than raising.
    """
    directory = get_directory_name_from_hash(hash)
    document_path = "".join([ROOT, "fdocuments/", directory, "/", hash, ".html"])

    with open(document_path, "r", encoding='utf-8', errors='ignore') as document:
        return document.read()

In [11]:
%%time

# Build (or reload from the interim cache) a random sample of 1/50th of
# docs_df, with the raw HTML of every sampled document in a "contents" column.
sample_df_cache = INTERIM_DATA_ROOT + "sample_df.p"

if os.path.isfile(sample_df_cache):
    # Open the very path that was checked (and that the else-branch writes),
    # not a bare "sample_df.p" relative to the working directory.
    # NOTE: pickle.load can execute arbitrary code -- only load caches you
    # created yourself.
    with open(sample_df_cache, "rb") as cache_file:
        sample_df = pickle.load(cache_file)
else:
    # Size the sample from docs_df itself rather than from the num_documents
    # global, which a later cell rebinds to len(sample_df).
    nb_sampled_docs = len(docs_df) // 50
    # NOTE(review): the draw is unseeded, so a freshly built sample differs
    # between runs until the cache file exists.
    random_indices = np.random.choice(docs_df.index.values, nb_sampled_docs, replace=False)
    sample_df = docs_df.loc[random_indices].reset_index(drop=True)
    # One file read per row; see the TODO on load_contents.
    sample_df['contents'] = sample_df['hash'].map(load_contents)
    with open(sample_df_cache, "wb") as cache_file:
        pickle.dump(sample_df, cache_file)

CPU times: user 1.86 s, sys: 860 ms, total: 2.72 s
Wall time: 2min 48s


In [12]:
# Number of rows in the sampled frame; shown as the cell output.
# NOTE: this rebinds num_documents, which an earlier cell set to len(docs_df).
num_documents = len(sample_df)
num_documents

2874

In [13]:
# Preview the first 10 sampled rows, including the loaded "contents" column.
sample_df.head(10)

Unnamed: 0,filename,filetype,hash,tags,url,num_users,num_tags,contents
0,37296e86ac599d7fe9aaff17a3037400.html,html,37296e86ac599d7fe9aaff17a3037400,"flash,air,adobe,blog,flex,design,webdesign,act...",http://www.webkitchen.be/,65,23,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
1,0b294ed63a75b9d264af303f5c5eeea9.html,html,0b294ed63a75b9d264af303f5c5eeea9,"french,language,franÃ§ais,resources,education,...",http://www.uni.edu/becker/french31.html,123,21,<HTML>\n<HEAD>\n\n<TITLE>Best French Websites<...
2,5d7ec003095ae0f3123b2e3da4b3fb95.html,html,5d7ec003095ae0f3123b2e3da4b3fb95,"music,mp3,history,free,audio,downloads,songs,d...",http://www.foldedspace.org/weblog/2006/06/in_t...,566,25,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
3,8ca4efc75f6399dfe7e4bffbfc0c08d4.html,html,8ca4efc75f6399dfe7e4bffbfc0c08d4,"ssh,sftp,security,linux,database,backup,howto,...",http://ask-leo.com/how_can_i_automate_an_sftp_...,21,10,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
4,ce9a31b508d0505b0e4cb2bbf4fe3d55.html,html,ce9a31b508d0505b0e4cb2bbf4fe3d55,"ajax,rails,tutorial",http://railsonedge.blogspot.com/2008/03/tutori...,7,3,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S..."
5,1bcbe8e8838244aca41bfbe69a958f24.html,html,1bcbe8e8838244aca41bfbe69a958f24,"silverlight,charts,opensource,chart,.net,free,...",http://www.visifire.com/silverlight_charts_gal...,47,15,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
6,8384c9cae634c9dff115a741926e0fc9.html,html,8384c9cae634c9dff115a741926e0fc9,"parenting,education,baby,children,learning,lan...","http://www.time.com/time/health/article/0,8599...",121,25,<!--[if IE 5]> Vignette StoryServer 5.0 Fri Ma...
7,2ad1b840589b35378c8757739a461b1d.html,html,2ad1b840589b35378c8757739a461b1d,"language,dictionary,swear,fun,reference,funny,...",http://www.youswear.com/index.asp,18,14,
8,78f632abdab2116c806b8ab81029fce7.html,html,78f632abdab2116c806b8ab81029fce7,"history,image,compression,lenna,fun,internet,i...",http://www.cs.cmu.edu/~chuck/lennapg/,58,25,<html>\n\n<head>\n<title>The Rest of the Lenna...
9,8c1fd30e1547407eab96254252d433d4.html,html,8c1fd30e1547407eab96254252d433d4,"science,space,astronomy",http://dvice.com/archives/2008/06/astronomers_...,4,3,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."


In [14]:
# Collect every distinct tag that occurs anywhere in the sample.
tag_sets = sample_df["tags"].values

all_tags = set()

# Each entry is one comma-separated tag string; fold its pieces into the set.
for tag_string in tag_sets:
    all_tags.update(tag_string.split(','))

In [17]:
# Number of distinct tags found in the sample.
len(all_tags)

7201

In [25]:
# Document-count threshold handed to truncate_labels: LABELS_MIN_DOC_FRACTION
# of the sample size, truncated toward zero by int().
min_nb_docs = int(len(sample_df)* LABELS_MIN_DOC_FRACTION)
min_nb_docs

28

In [26]:
# Split each comma-separated tag string into a list of tags and store it in a
# new "tags_split" column (this adds a column to sample_df in place).
sample_df["tags_split"] = sample_df["tags"].map(lambda tagstring: tagstring.split(","))
# Array with one tag list per document.
labels = sample_df["tags_split"].values

In [27]:
# NOTE(review): presumably drops tags that occur in fewer than min_nb_docs
# documents -- confirm against helpers.labels.truncate_labels.
truncated_labels = truncate_labels(labels,min_nb_docs)

In [28]:
# Encode the per-document tag lists as a binary indicator matrix with one
# column per distinct label; mlb keeps the label <-> column mapping.
mlb = MultiLabelBinarizer()
binary_label_data = mlb.fit_transform(truncated_labels)

In [29]:
# (number of documents, number of labels kept after truncation)
binary_label_data.shape

(2874, 242)

In [30]:
# Raw HTML of each sampled document, as an array aligned with binary_label_data rows.
data = sample_df["contents"].values

In [31]:
# First 1000 characters of the first document's raw HTML.
data[0][:1000]

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head profile="http://gmpg.org/xfn/11">\n<title>Serge Jespers</title>\n<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon" />\n<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" />\n<meta http-equiv="pragma" content="no-cache" />\n<meta http-equiv="cache-control" content="no-cache" />\n<link rel="stylesheet" type="text/css" href="http://www.webkitchen.be/wp-content/themes/smashingtheme/style.css" />\n<script type="text/javascript" src="http://www.webkitchen.be/wp-content/themes/smashingtheme/javascript/imghover.js"> </script>\n<link rel="alternate" type="application/rss+xml" title="RSS 2.0" href="http://www.webkitchen.be/feed/" />\n<link rel="alternate" type="text/xml" title="RSS .92" href="http://www.webkitchen.be/feed/rss/" />\n<link rel="alternate" type="application/atom+xml" title

In [32]:
# Point CLASSPATH at a local Stanford POS tagger directory before calling clean_text.
# NOTE(review): presumably clean_text needs this to locate the tagger -- confirm
# in features.build_features. The path is absolute and machine-specific.
os.environ["CLASSPATH"]="/home/felipe/auto-tagger/data/stanford-postagger/stanford-postagger-2017-06-09"
# First 1000 characters of the first document after clean_text.
clean_text(data[0])[:1000]

" Serge Jespers window.onload = function() {var options ={assetURL:'',loadingImage:'http://www.webkitchen.be/wp-content/plugins/pb-embedflash/css/images/loading.gif',flvPlayer:'http://www.webkitchen.be/wp-content/plugins/pb-embedflash/swf/mediaplayer.swf',animate:true,animSequence:'wh',overlayColor:'#000',overlayOpacity:0.85,overlayBgImage:'http://www.webkitchen.be/wp-content/plugins/pb-embedflash/css/images/overlay-85.png',listenOverlay:true,autoplayMovies:true,showMovieControls:true,resizeDuration:0.35,fadeDuration:0.35,displayNav:true,continuous:false,displayCounter:true,counterType:'default',viewportPadding:20,handleLgImages:'resize',initialHeight:160,initialWidth:320,enableKeys:true,keysClose:['c', 'q', 27],keysPrev:['p', 37],keysNext:['n', 39],handleUnsupported:'',text: {cancel:'Cancel',loading: 'loading',close:'Close',next:'Next',prev:'Previous',errors:{single: 'You must install the {1} browser plugin to view this content.',shared: 'You must install both the {1} and {3} browser 

In [33]:
# Text-classification pipeline: bag-of-words counts -> TF-IDF weighting ->
# one linear SVM per label.
# The vectorizer runs clean_text as its preprocessor on every document and is
# capped at MAX_NB_WORDS features.
vectorizer = CountVectorizer(preprocessor=clean_text, max_features=MAX_NB_WORDS)
tfidf_weighting = TfidfTransformer()
# n_jobs=-1 lets the one-vs-rest wrapper fit the per-label classifiers in parallel.
one_vs_rest_svm = OneVsRestClassifier(LinearSVC(), n_jobs=-1)

pipeline = Pipeline(steps=[
    ('vect', vectorizer),
    ('tfidf', tfidf_weighting),
    ('clf', one_vs_rest_svm),
])

In [34]:
# use later
# Candidate settings for the steps of `pipeline`, keyed as "<step>__<param>".
# NOTE(review): presumably meant as a parameter grid for a hyper-parameter
# search (e.g. GridSearchCV), which requires every value to be a sequence of
# candidates -- so the single callable is wrapped in a list, matching the
# already list-wrapped max_features entry.
parameters = {
    'vect__preprocessor': [clean_text],
    "vect__max_features": [MAX_NB_WORDS]
}

In [36]:
# 5-fold cross-validation of the whole pipeline on the raw HTML documents
# against the binarized labels, scored with micro-averaged F1; the cell output
# is one score per fold.
cross_val_score(pipeline, data, binary_label_data, cv=5,scoring='f1_micro')

array([ 0.21303116,  0.21857345,  0.22498147,  0.22186322,  0.21688475])