In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import os
import pickle
import sklearn
import sys

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, Normalizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier,NearestNeighbors
from sklearn.model_selection import GridSearchCV

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Make the sibling ``src`` directory of the current working directory importable
# (presumably where the data/features/helpers packages imported below live).
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
# Guard the append so re-running this cell does not pile up duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
%aimport data.load_delicious
%aimport features.build_features
%aimport helpers.files,helpers.labels

In [8]:
from data.load_delicious import load_taginfo_into_dataframe
from helpers.files import get_directory_name_from_hash
from helpers.labels import truncate_labels
from features.preprocess import clean_text_delicious

In [9]:
# Location of the raw delicious-t140 files. Defaults to the original mount
# point; set the DELICIOUS_T140_ROOT environment variable to point the notebook
# at another copy without editing this cell.
# os.path.join(..., "") guarantees the trailing separator that the string
# concatenations built on ROOT rely on.
ROOT = os.path.join(
    os.environ.get("DELICIOUS_T140_ROOT", "/media/felipe/SAMSUNG/delicious/delicioust140/"),
    "",
)
TAGINFO = ROOT+"taginfo.xml"
# Directory holding the pickled intermediate frames written by this notebook.
INTERIM_DATA_ROOT = "../data/interim/delicious-t140/"
# Passed as max_features to the CountVectorizer step of the pipeline.
MAX_NB_WORDS = 2000
# Multiplied by the number of sampled documents to obtain min_nb_docs.
LABELS_MIN_DOC_FRACTION = 0.01

In [11]:
# Load the tag metadata frame from the pickle cache when it exists; otherwise
# build it from TAGINFO and write the cache for the next run.
docs_df_cache = INTERIM_DATA_ROOT+"docs_df.p"

if os.path.isfile(docs_df_cache):
    # NOTE: pickle.load can execute arbitrary code; only load caches that this
    # notebook wrote itself.
    with open(docs_df_cache, "rb") as cache_file:
        docs_df = pickle.load(cache_file)
else:
    docs_df = load_taginfo_into_dataframe(TAGINFO)
    # Context managers make sure the cache file is flushed and closed.
    with open(docs_df_cache, "wb") as cache_file:
        pickle.dump(docs_df, cache_file)

In [12]:
# Number of rows in the full metadata frame (displayed as the cell output).
num_documents = len(docs_df)
num_documents

143716

In [None]:
# Preview the first ten rows of the metadata frame.
docs_df.head(10)

In [13]:
# TODO: optimize this; it currently performs one file read (I/O operation) per call.
def load_contents(hash):
    """Return the text of the stored HTML document identified by ``hash``.

    The file is looked up under ``ROOT + "fdocuments/"``, inside the
    sub-directory returned by ``get_directory_name_from_hash(hash)``, and is
    named ``<hash>.html``. It is decoded as UTF-8; bytes that cannot be
    decoded are ignored instead of raising.
    """
    document_dir = ROOT + "fdocuments/" + get_directory_name_from_hash(hash)
    html_path = document_dir + "/" + hash + ".html"

    with open(html_path, "r", encoding='utf-8', errors='ignore') as html_file:
        return html_file.read()

In [14]:
%%time

# Work on a 1-in-50 random sample of docs_df. The sample, including the page
# contents read from disk, is cached so the per-file reads happen only once.
sample_df_cache = INTERIM_DATA_ROOT+"sample_df.p"

if os.path.isfile(sample_df_cache):
    # NOTE: pickle.load can execute arbitrary code; only load caches that this
    # notebook wrote itself.
    with open(sample_df_cache, "rb") as cache_file:
        sample_df = pickle.load(cache_file)
else:
    # Size the sample from docs_df itself: num_documents is rebound to the
    # sample size in a later cell, so relying on it here breaks on re-runs.
    sample_size = len(docs_df) // 50
    # Fixed (arbitrary) seed so rebuilding the cache reproduces the same sample.
    sample_rng = np.random.RandomState(42)
    random_indices = sample_rng.choice(docs_df.index.values, sample_size, replace=False)
    # drop=True discards the old index instead of materialising an 'index' column.
    sample_df = docs_df.loc[random_indices].reset_index(drop=True)
    sample_df['contents'] = sample_df['hash'].map(load_contents)
    with open(sample_df_cache, "wb") as cache_file:
        pickle.dump(sample_df, cache_file)

CPU times: user 84 ms, sys: 64 ms, total: 148 ms
Wall time: 145 ms


In [15]:
# Rebinds num_documents (previously the size of docs_df) to the number of
# rows in the sample; the value is displayed as the cell output.
num_documents = len(sample_df)
num_documents

2874

In [16]:
# Preview the first ten sampled rows, including the loaded 'contents' column.
sample_df.head(10)

Unnamed: 0,filename,filetype,hash,tags,url,num_users,num_tags,contents
0,37296e86ac599d7fe9aaff17a3037400.html,html,37296e86ac599d7fe9aaff17a3037400,"flash,air,adobe,blog,flex,design,webdesign,act...",http://www.webkitchen.be/,65,23,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
1,0b294ed63a75b9d264af303f5c5eeea9.html,html,0b294ed63a75b9d264af303f5c5eeea9,"french,language,franÃ§ais,resources,education,...",http://www.uni.edu/becker/french31.html,123,21,<HTML>\n<HEAD>\n\n<TITLE>Best French Websites<...
2,5d7ec003095ae0f3123b2e3da4b3fb95.html,html,5d7ec003095ae0f3123b2e3da4b3fb95,"music,mp3,history,free,audio,downloads,songs,d...",http://www.foldedspace.org/weblog/2006/06/in_t...,566,25,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
3,8ca4efc75f6399dfe7e4bffbfc0c08d4.html,html,8ca4efc75f6399dfe7e4bffbfc0c08d4,"ssh,sftp,security,linux,database,backup,howto,...",http://ask-leo.com/how_can_i_automate_an_sftp_...,21,10,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
4,ce9a31b508d0505b0e4cb2bbf4fe3d55.html,html,ce9a31b508d0505b0e4cb2bbf4fe3d55,"ajax,rails,tutorial",http://railsonedge.blogspot.com/2008/03/tutori...,7,3,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S..."
5,1bcbe8e8838244aca41bfbe69a958f24.html,html,1bcbe8e8838244aca41bfbe69a958f24,"silverlight,charts,opensource,chart,.net,free,...",http://www.visifire.com/silverlight_charts_gal...,47,15,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
6,8384c9cae634c9dff115a741926e0fc9.html,html,8384c9cae634c9dff115a741926e0fc9,"parenting,education,baby,children,learning,lan...","http://www.time.com/time/health/article/0,8599...",121,25,<!--[if IE 5]> Vignette StoryServer 5.0 Fri Ma...
7,2ad1b840589b35378c8757739a461b1d.html,html,2ad1b840589b35378c8757739a461b1d,"language,dictionary,swear,fun,reference,funny,...",http://www.youswear.com/index.asp,18,14,
8,78f632abdab2116c806b8ab81029fce7.html,html,78f632abdab2116c806b8ab81029fce7,"history,image,compression,lenna,fun,internet,i...",http://www.cs.cmu.edu/~chuck/lennapg/,58,25,<html>\n\n<head>\n<title>The Rest of the Lenna...
9,8c1fd30e1547407eab96254252d433d4.html,html,8c1fd30e1547407eab96254252d433d4,"science,space,astronomy",http://dvice.com/archives/2008/06/astronomers_...,4,3,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."


In [17]:
# Each row stores its tags as one comma-separated string; collect every
# distinct tag that occurs anywhere in the sample.
tag_sets = sample_df["tags"].values

all_tags = {tag for tag_set in tag_sets for tag in tag_set.split(',')}

In [18]:
# Number of distinct tags found in the sample.
len(all_tags)

7201

In [19]:
# Document-count threshold derived from LABELS_MIN_DOC_FRACTION of the sample
# size; int() truncates the product towards zero.
min_nb_docs = int(len(sample_df)* LABELS_MIN_DOC_FRACTION)
min_nb_docs

28

In [20]:
def _split_tags(tagstring):
    """Split a comma-separated tag string into a list of tags."""
    return tagstring.split(",")


# Store the per-document tag lists as a new column and also expose them as an array.
sample_df["tags_split"] = sample_df["tags"].map(_split_tags)
labels = sample_df["tags_split"].values

In [21]:
# NOTE(review): truncate_labels comes from helpers.labels and is not shown here;
# presumably it drops tags that occur in fewer than min_nb_docs documents — confirm.
truncated_labels = truncate_labels(labels,min_nb_docs)

In [22]:
# Encode each document's label collection as one row of a binary indicator
# matrix; the fitted binarizer is kept in `mlb`.
mlb = MultiLabelBinarizer()
binary_label_data = mlb.fit_transform(truncated_labels)

In [23]:
# Shape of the encoded label matrix.
binary_label_data.shape

(2874, 242)

In [24]:
# The 'contents' column as an array; this is what is later split into the
# training and validation inputs.
data = sample_df["contents"].values

In [25]:
# Peek at the first 1000 characters of the first document.
data[0][:1000]

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"\n"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml">\n<head profile="http://gmpg.org/xfn/11">\n<title>Serge Jespers</title>\n<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon" />\n<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" />\n<meta http-equiv="pragma" content="no-cache" />\n<meta http-equiv="cache-control" content="no-cache" />\n<link rel="stylesheet" type="text/css" href="http://www.webkitchen.be/wp-content/themes/smashingtheme/style.css" />\n<script type="text/javascript" src="http://www.webkitchen.be/wp-content/themes/smashingtheme/javascript/imghover.js"> </script>\n<link rel="alternate" type="application/rss+xml" title="RSS 2.0" href="http://www.webkitchen.be/feed/" />\n<link rel="alternate" type="text/xml" title="RSS .92" href="http://www.webkitchen.be/feed/rss/" />\n<link rel="alternate" type="application/atom+xml" title

In [42]:
# Step order: token counts -> tf/idf transformer -> LDA -> Normalizer ->
# one k-nearest-neighbours classifier per label (fitted in parallel, n_jobs=-1).
knn_per_label = OneVsRestClassifier(KNeighborsClassifier(), n_jobs=-1)

pipeline_steps = [
    ('vect', CountVectorizer(preprocessor=clean_text_delicious, max_features=MAX_NB_WORDS)),
    ('tf', TfidfTransformer()),
    ('lda', LatentDirichletAllocation()),
    ('norm', Normalizer()),
    ('clf', knn_per_label),
]

pipeline = Pipeline(pipeline_steps)

In [47]:
# Search space for the manual grid search. Keys use the "<step>__<param>"
# form; "clf__estimator__*" addresses the classifier wrapped inside the 'clf' step.
parameters = dict(
    tf__use_idf=[False],
    tf__norm=['l1', 'l2', None],
    lda__n_components=[2, 5, 10, 20],
    lda__max_iter=[5, 10],
    lda__learning_method=["online"],
    norm__norm=["l2"],
    clf__estimator__p=[1, 2],
    clf__estimator__n_neighbors=[2, 3, 4, 5],
    clf__estimator__weights=['distance'],
)

In [48]:
%time

# start with minus infinity as your
# current best_score
best_score = float("-inf")

for g in ParameterGrid(parameters):
    pipeline.set_params(**g)
    
    X_train, X_validation, y_train, y_validation = train_test_split(data, binary_label_data, test_size = 0.25)
    
    # here you call fit with whatever data you want
    pipeline.fit(X_train,y_train)
    
    # again, choose the validation data 
    # yourself
    y_pred_train = pipeline.predict(X_train)    
    y_pred_validation = pipeline.predict(X_validation)
    
    # I've used f1-score as an example, but you can use
    # any metric you want.
    train_score = f1_score(y_train,y_pred_train, average='micro')
    val_score = f1_score(y_validation,y_pred_validation, average='micro')
    
    current_score = val_score
    
    # show results
    print("training F1: {}".format(train_score))
    print("validation F1: {}".format(val_score))
    print("grid: {}".format(g))
    print("")
    
    # update the best_score if needed
    if current_score > best_score:
        best_score = current_score
        best_grid = g

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.58 µs
training F1: 0.12061489948758375
validation F1: 0.0142602495543672
grid: {'lda__n_components': 2, 'tf__use_idf': False, 'lda__learning_method': 'online', 'tf__norm': 'l1', 'norm__norm': 'l2', 'clf__estimator__weights': 'uniform', 'clf__estimator__n_neighbors': 2, 'lda__max_iter': 5, 'clf__estimator__p': 1}

training F1: 0.13228761037678935
validation F1: 0.012896657050738164
grid: {'lda__n_components': 2, 'tf__use_idf': False, 'lda__learning_method': 'online', 'tf__norm': 'l2', 'norm__norm': 'l2', 'clf__estimator__weights': 'uniform', 'clf__estimator__n_neighbors': 2, 'lda__max_iter': 5, 'clf__estimator__p': 1}

training F1: 0.12325243261380159
validation F1: 0.016190318850156944
grid: {'lda__n_components': 2, 'tf__use_idf': False, 'lda__learning_method': 'online', 'tf__norm': None, 'norm__norm': 'l2', 'clf__estimator__weights': 'uniform', 'clf__estimator__n_neighbors': 2, 'lda__max_iter': 5, 'clf__estimator__p': 1}

train

training F1: 0.9965402016737145
validation F1: 0.07671377161414904
grid: {'lda__n_components': 5, 'tf__use_idf': False, 'lda__learning_method': 'online', 'tf__norm': 'l1', 'norm__norm': 'l2', 'clf__estimator__weights': 'distance', 'clf__estimator__n_neighbors': 2, 'lda__max_iter': 5, 'clf__estimator__p': 1}

training F1: 0.9964337292581567
validation F1: 0.07005972011765754
grid: {'lda__n_components': 5, 'tf__use_idf': False, 'lda__learning_method': 'online', 'tf__norm': 'l2', 'norm__norm': 'l2', 'clf__estimator__weights': 'distance', 'clf__estimator__n_neighbors': 2, 'lda__max_iter': 5, 'clf__estimator__p': 1}

training F1: 0.9973061957497755
validation F1: 0.07944793417676724
grid: {'lda__n_components': 5, 'tf__use_idf': False, 'lda__learning_method': 'online', 'tf__norm': None, 'norm__norm': 'l2', 'clf__estimator__weights': 'distance', 'clf__estimator__n_neighbors': 2, 'lda__max_iter': 5, 'clf__estimator__p': 1}

training F1: 0.9961314129270324
validation F1: 0.07506372505933023
gri

training F1: 0.1331720459668994
validation F1: 0.018483655656340924
grid: {'lda__n_components': 10, 'tf__use_idf': False, 'lda__learning_method': 'online', 'tf__norm': 'l1', 'norm__norm': 'l2', 'clf__estimator__weights': 'uniform', 'clf__estimator__n_neighbors': 2, 'lda__max_iter': 5, 'clf__estimator__p': 2}

training F1: 0.13348933088194326
validation F1: 0.021093623235437287
grid: {'lda__n_components': 10, 'tf__use_idf': False, 'lda__learning_method': 'online', 'tf__norm': 'l2', 'norm__norm': 'l2', 'clf__estimator__weights': 'uniform', 'clf__estimator__n_neighbors': 2, 'lda__max_iter': 5, 'clf__estimator__p': 2}

training F1: 0.2032233963069698
validation F1: 0.04321382842509604
grid: {'lda__n_components': 10, 'tf__use_idf': False, 'lda__learning_method': 'online', 'tf__norm': None, 'norm__norm': 'l2', 'clf__estimator__weights': 'uniform', 'clf__estimator__n_neighbors': 2, 'lda__max_iter': 5, 'clf__estimator__p': 2}

training F1: 0.1216071930317505
validation F1: 0.014504697544090984

KeyboardInterrupt: 