In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import os
import pickle
import sklearn
import sys

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [6]:
# Make the project's `src` directory (two levels above the working directory)
# importable from this notebook. The path is normalised so it contains no `..`
# components, and it is only appended once so that re-running this cell does
# not keep growing sys.path with duplicates.
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir, 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [7]:
%aimport data.delicious_t140
%aimport features.build_features
%aimport helpers.files,helpers.labels

In [10]:
from data.delicious_t140 import load_or_get_from_cache
from helpers.labels import truncate_labels
from features.delicious_t140 import clean_text_delicious

In [29]:
# Root directory of the raw dataset dump. The default is a machine-specific
# absolute path; set the DELICIOUS_T140_ROOT environment variable to point the
# notebook at another location without editing this cell.
ROOT = os.environ.get("DELICIOUS_T140_ROOT", "/media/felipe/SAMSUNG/delicious/delicioust140")
# XML file that the document index (docs_df) is built from.
TAGINFO = ROOT+"/taginfo.xml"
# Directory holding the pickled intermediate dataframes (absolute, no trailing slash).
INTERIM_DATA_ROOT = os.path.abspath("../../data/interim/delicious-t140/")
# Vocabulary cap, referenced as `vect__max_features` in the `parameters` dict.
MAX_NB_WORDS = 2000
# Threshold handed to truncate_labels.
# NOTE(review): presumably the minimum number of documents a tag must appear in -- confirm.
MIN_LABEL_DF = 2

In [20]:
# Load the document index from the pickle cache when present; otherwise build
# it from the taginfo XML and cache it for the next run.
#
# The cache path is computed once and used for both the existence check and the
# write. Previously the write used INTERIM_DATA_ROOT+"docs_df.p" (no separator),
# so the file landed next to the directory and the cache was never hit.
docs_df_cache = os.path.join(INTERIM_DATA_ROOT, "docs_df.p")

if os.path.isfile(docs_df_cache):
    # NOTE: unpickling can execute arbitrary code -- only load caches you produced yourself.
    with open(docs_df_cache, "rb") as cache_file:
        docs_df = pickle.load(cache_file)
else:
    # NOTE(review): load_taginfo_into_dataframe is not imported in the visible
    # cells -- confirm it is in scope before relying on this branch.
    docs_df = load_taginfo_into_dataframe(TAGINFO)
    # Make sure the cache directory exists before writing into it.
    os.makedirs(INTERIM_DATA_ROOT, exist_ok=True)
    with open(docs_df_cache, "wb") as cache_file:
        pickle.dump(docs_df, cache_file)

In [21]:
# Number of rows in the full document index.
num_documents = docs_df.shape[0]
num_documents

143713

In [22]:
# Preview the first ten rows of the document index.
docs_df.head(n=10)

Unnamed: 0,filename,filetype,hash,unique_tags,url,num_users,num_unique_tags
0,66fa11b33b6ac183314892703c20fa47.html,html,66fa11b33b6ac183314892703c20fa47,"reference,conventions,standards,style,coding,g...",http://lists.osafoundation.org/pipermail/dev/2...,27,8
1,21899d001299ceadc852ed22e1b2b725.html,html,21899d001299ceadc852ed22e1b2b725,"llvm,software,language,framework,code,programm...",http://llvm.org/,830,24
2,bd7c9734cd1a5c8a55328a1a9ce4d4d2.html,html,bd7c9734cd1a5c8a55328a1a9ce4d4d2,"operating,freeware,application,education,apps,...",http://linuxappfinder.com/alternatives,47,17
3,ff186471d34e1440845b80d61733f8ef.html,html,ff186471d34e1440845b80d61733f8ef,"gnome,tomboy,software,linux,wiki,notes",http://live.gnome.org/Tomboy,7,6
4,cc7afd7b1b9e0c29ba72978b5edd8ed5.html,html,cc7afd7b1b9e0c29ba72978b5edd8ed5,"web20,blogs,teaching",http://lisahistory.net/wordpress/,7,3
5,e9466d13558200b7b084fae5d0d81b57.html,html,e9466d13558200b7b084fae5d0d81b57,"web20,library,author,writing,web,literatura,vi...",http://literature-map.com/,373,25
6,2e14957206bad2bdf5fb29564c6c863c.html,html,2e14957206bad2bdf5fb29564c6c863c,"deviantart,photography,inspiration",http://littlemewhatever.deviantart.com/,15,3
7,e84764f03cc25dcf50207bfa534a9e84.html,html,e84764f03cc25dcf50207bfa534a9e84,"free,cell,pda,phone,technology,cellphone,mobil...",http://livemobile.blogspot.com/,55,13
8,a6f8b9ccfe1219ab79be4001c08b31ab.html,html,a6f8b9ccfe1219ab79be4001c08b31ab,"list,cool,videos,youtube,video,bizarre,strange...",http://listverse.com/bizarre/top-10-most-bizar...,64,15
9,72c4c07a25937957f599c7524f94cd75.html,html,72c4c07a25937957f599c7524f94cd75,"film,lists,cool,movies",http://listverse.com/entertainment/top-15-indi...,8,4


In [23]:
# TODO optimize this because currently this does one I/O OP per loop
def load_contents(hash):
    """Return the text of the HTML document identified by ``hash``.

    The file is looked up at ``ROOT/fdocuments/<dir>/<hash>.html``, where
    ``<dir>`` is whatever ``get_directory_name_from_hash(hash)`` returns.

    Parameters
    ----------
    hash : str
        Document hash, as stored in the ``hash`` column of the index.

    Returns
    -------
    str
        File contents decoded as UTF-8; undecodable bytes are silently
        dropped (``errors='ignore'``).

    Raises
    ------
    OSError
        If the file does not exist or cannot be read.
    """
    # Build the path with os.path.join: ROOT has no trailing slash, so the
    # previous ROOT+"fdocuments/" concatenation pointed at a non-existent
    # ".../delicioust140fdocuments/" directory.
    # NOTE(review): get_directory_name_from_hash is not imported in the visible
    # cells -- confirm it is in scope.
    file_path = os.path.join(
        ROOT, "fdocuments", get_directory_name_from_hash(hash), hash + ".html"
    )

    with open(file_path, "r", encoding='utf-8', errors='ignore') as f:
        return f.read()

In [24]:
%%time

# Load the 1-in-50 document sample (with page contents) from the pickle cache,
# or draw it from docs_df, read the HTML for each sampled row and cache it.
sample_df_cache = os.path.join(INTERIM_DATA_ROOT, "sample_df.p")
# Fixed seed so that rebuilding the cache yields the same sample.
SAMPLE_SEED = 42

if os.path.isfile(sample_df_cache):
    # NOTE: unpickling can execute arbitrary code -- only load caches you produced yourself.
    with open(sample_df_cache, "rb") as cache_file:
        sample_df = pickle.load(cache_file)
else:
    # Size the sample from docs_df itself: `num_documents` is rebound to the
    # sample size by a later cell, so using it here would shrink the sample
    # whenever this cell is re-run.
    sample_size = len(docs_df) // 50
    random_indices = np.random.RandomState(SAMPLE_SEED).choice(
        docs_df.index.values, sample_size, replace=False
    )
    # Discard the original index labels so the sample is numbered 0..n-1.
    sample_df = docs_df.loc[random_indices].reset_index(drop=True)
    # One file read per sampled document (see load_contents).
    sample_df['contents'] = sample_df['hash'].map(load_contents)
    with open(sample_df_cache, "wb") as cache_file:
        pickle.dump(sample_df, cache_file)

CPU times: user 296 ms, sys: 288 ms, total: 584 ms
Wall time: 2.18 s


In [25]:
# From here on `num_documents` refers to the size of the sample, not of the
# full document index.
num_documents = sample_df.shape[0]
num_documents

2874

In [26]:
# Preview the first ten sampled documents, including their loaded contents.
sample_df.head(n=10)

Unnamed: 0,filename,filetype,hash,unique_tags,url,num_users,num_unique_tags,contents
0,a6a80f81c0d3a4f8849e5cc449d65150.html,html,a6a80f81c0d3a4f8849e5cc449d65150,"quality,programming,process,testing,productivi...",http://www.developer.com/java/other/article.ph...,179,24,\n<HTML>\n<head>\n<TITLE>Effective Code Review...
1,3114064706973e8342a5eeae9f84d6c4.html,html,3114064706973e8342a5eeae9f84d6c4,"mysql,class,php",http://www.webmaster-talk.com/blog/17/php-mysq...,4,3,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
2,bf356f62f1888390bba62c71d7aaeb9b.html,html,bf356f62f1888390bba62c71d7aaeb9b,"dmca,canada,law,censorship,riaa,copyright",http://www.boingboing.net/2008/06/11/canadian-...,10,6,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<!...
3,0be0a4647229c10b8c93ffaadb60fece.html,html,0be0a4647229c10b8c93ffaadb60fece,"vegan,thai,recipes,tofu,peanut,vegetarian,vege...",http://blog.fatfreevegan.com/2008/06/tofu-and-...,11,7,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S..."
4,6bbba01a6e0adc99fbb0550119fde611.html,html,6bbba01a6e0adc99fbb0550119fde611,"scripting,digital,design,studio,architect,firm...",http://www.evandouglis.com/,30,14,"<?xml version=""1.0"" encoding=""iso-8859-1""?>\n<..."
5,b4fcffe47445a7e2dd27db4222d79268.html,html,b4fcffe47445a7e2dd27db4222d79268,"technology,informatics,health",http://www.hhs.gov/healthinformationtechnology/,6,3,<html>\n<head>\n<title>Office of the National ...
6,fa36867df24f9c8fe38c6b60a36de404.html,html,fa36867df24f9c8fe38c6b60a36de404,"scripting,programming,framework,unittest,libra...",http://twill.idyll.org/,596,25,"<?xml version=""1.0"" encoding=""utf-8"" ?>\n<!DOC..."
7,4bd1223d7fd689473c225cc1a612e70c.html,html,4bd1223d7fd689473c225cc1a612e70c,"packaging_design,embalagem,marketing,advertisi...",http://dzineblog.com/2008/04/packaging-design-...,198,22,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
8,e8d7218045dd6f75a34936ab72666690.html,html,e8d7218045dd6f75a34936ab72666690,"networking,blogging,plugins,widgets",http://docs.widgetbox.com/help/whats-a-widget/,5,4,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
9,161dfd01eedb4373e0ac31128820ef8b.html,html,161dfd01eedb4373e0ac31128820ef8b,"howto,pc,computers,shopping,store,mce,hardware...",http://www.pcalchemy.com/,250,25,"<html>\n<head>\n<meta name=""author"" CONTENT=""p..."


In [27]:
# Collect every distinct tag used by the sampled documents. Each entry of
# `unique_tags` is a single comma-separated string.
tag_sets = sample_df["unique_tags"].values

all_tags = {
    tag
    for tag_set in tag_sets
    for tag in tag_set.split(',')
}

In [28]:
# Number of distinct tags across the sampled documents.
len(all_tags)

6881

In [30]:
def _split_tags(tagstring):
    """Turn a comma-separated tag string into a list of tags."""
    return tagstring.split(",")


# Adds a `tags_split` column to sample_df (in place) holding one list of tags
# per document, and exposes those lists as an array for the label pipeline.
sample_df["tags_split"] = sample_df["unique_tags"].map(_split_tags)
labels = sample_df["tags_split"].values

In [31]:
# Filter the per-document tag lists using the MIN_LABEL_DF threshold.
# NOTE(review): the exact rule lives in helpers.labels.truncate_labels, which is
# not shown here -- presumably tags occurring in fewer than MIN_LABEL_DF
# documents are dropped; confirm against that helper.
truncated_labels = truncate_labels(labels,MIN_LABEL_DF)

In [32]:
# Encode the tag lists as a binary indicator matrix: one row per document and
# one column per tag the binarizer has seen. `mlb` is kept so the columns can
# be mapped back to tag names.
mlb = MultiLabelBinarizer()
binary_label_data = mlb.fit_transform(truncated_labels)

In [33]:
# (number of documents, number of label columns produced by the binarizer)
binary_label_data.shape

(2874, 2728)

In [34]:
# Page contents of the sampled documents; this array is what gets passed to the
# classifier pipeline as input.
data = sample_df["contents"].values

In [35]:
# Peek at the first 1000 characters of the first document.
data[0][:1000]

'\n<HTML>\n<head>\n<TITLE>Effective Code Reviews Without the Pain</TITLE>\n<META NAME="description" CONTENT="Code reviews don\'t have to be painful. Find out how to make this process productive and pain-free for everyone.">\n<META NAME="keywords" CONTENT="">\n<META NAME="authors" CONTENT=" Robert Bogue">\n<META NAME="date" CONTENT="20060125">\n<META NAME="channel" CONTENT="ewsoftware">\n<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\n<LINK href="/css/text.css" type=text/css rel=stylesheet>\n<script language="JavaScript">\n<!--\nfunction MM_preloadImages() { //v3.0\n  var d=document; if(d.images){ if(!d.MM_p) d.MM_p=new Array();\n    var i,j=d.MM_p.length,a=MM_preloadImages.arguments; for(i=0; i<a.length; i++)\n    if (a[i].indexOf("#")!=0){ d.MM_p[j]=new Image; d.MM_p[j++].src=a[i];}}\n}\n\nfunction MM_swapImgRestore() { //v3.0\n  var i,x,a=document.MM_sr; for(i=0;a&&i<a.length&&(x=a[i])&&x.oSrc;i++) x.src=x.oSrc;\n}\n\nfunction MM_findObj(n, d) { //v3.0\n  va

In [37]:
# Point CLASSPATH at a local Stanford POS tagger install before calling the
# cleaning helper. The path is machine-specific.
# NOTE(review): presumably clean_text_delicious needs the tagger on the
# CLASSPATH -- confirm in features.delicious_t140.
os.environ.update(
    CLASSPATH="/home/felipe/auto-tagger/data/stanford-postagger/stanford-postagger-2017-06-09"
)
# Show the first 1000 characters of the cleaned first document.
clean_text_delicious(data[0])[:1000]

' Effective Code Reviews Without the Pain 0&&parent.frames.length) { d=parent.frames[n.substring(p+1)].document; n=n.substring(0,p);} if(!(x=d[n])&&d.all) x=d.all[n]; for (i=0;!x&&i img { border: 0px;} #navitoolbarcontainer a { line-height: 16px; } IT internet.com/IT internet.com/CIO internet.com/Security internet.com/Networking internet.com/Storage bITa Planet CIO Update Database Journal Datamation Enterprise IT Planet Enterprise Networking Planet Enterprise Storage Forum eSecurity Planet Hardware Central Intranet Journal ISP Planet ITSMwatch IT Channel Planet Linux Planet Open Networks Today ServerWatch VoIP Planet WebVideoUniverse Wi-Fi Planet WinDrivers.com Network Map Developer internet.com/Developer 15 Seconds 4GuysFromRolla.com ASP101 CodeGuru Developer.com DevX FlashKit.com Gamelan JARS JavaScript.com JavaScriptSource PHPBuilder.com ScriptSearch VB Forums VB Wire WebDeveloper.com Webreference Network Map News Internetnews.com Linux Today Network Map Small Business Ecommerce Gui

In [38]:
# Text classification pipeline: token counts -> TF-IDF weighting -> one linear
# SVM per label, with the per-label fits spread over all available cores.
# All steps use their default settings here.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(estimator=LinearSVC(), n_jobs=-1)),
])

In [40]:
# Pipeline settings kept for later use; nothing in the visible cells applies
# them to `pipeline`. Keys follow the `<step>__<param>` naming of the pipeline
# steps defined above.
parameters = dict(
    vect__preprocessor=clean_text_delicious,
    vect__max_features=MAX_NB_WORDS,
)

In [43]:
# 5-fold cross-validation of the pipeline on the sampled documents, scored with
# micro-averaged F1 over all labels.
# NOTE(review): the repeated `str(classes[c]))` lines in this cell's output look
# like truncated scikit-learn warnings -- presumably OneVsRestClassifier
# reporting labels that are constant within a training fold; confirm.
scores = cross_val_score(
    estimator=pipeline,
    X=data,
    y=binary_label_data,
    cv=5,
    scoring='f1_micro',
    verbose=0,
)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


In [44]:
# Per-fold micro-averaged F1 scores from the 5-fold cross-validation.
scores

array([ 0.17236631,  0.14499143,  0.15507355,  0.16070994,  0.16326531])