In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier,NearestNeighbors
from sklearn.svm import LinearSVC

from tqdm import *

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension; mode 1 reloads only the modules that are
# registered with %aimport before each cell is executed.
%load_ext autoreload
%autoreload 1

In [2]:
# Make the project's `src` directory importable from this notebook.
# The path is resolved relative to the current working directory
# (two levels up, then `src`).
src_dir = os.path.join(os.getcwd(), os.pardir, '../src')

# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
# Register the project modules used below with the autoreload extension so
# that edits to them are picked up without restarting the kernel.
%aimport data.delicious_t140
%aimport features.delicious_t140
%aimport helpers.labels,helpers.neighbours, helpers.delicious_t140

In [4]:
from data.delicious_t140 import load_or_get_from_cache,make_sample_with_contents_or_get_from_cache
from features.delicious_t140 import clean_text_delicious
from helpers.labels import truncate_labels
from helpers.neighbours import get_predicted_labels_from_neighbours

from helpers.delicious_t140 import make_path_to_file

In [5]:
# --- Paths -----------------------------------------------------------------
# Directory for intermediate artefacts, resolved relative to the working dir.
INTERIM_DATA_ROOT = os.path.abspath("../../data/interim/delicious-t140/")

# Root of the raw Delicious-T140 dump. The default is the original,
# machine-specific location; set DELICIOUS_T140_DATA_ROOT to point elsewhere
# without editing the notebook. Joining with "" guarantees a trailing
# separator, so code that concatenates onto DATA_ROOT keeps working.
DATA_ROOT = os.path.join(
    os.environ.get(
        "DELICIOUS_T140_DATA_ROOT",
        "/media/felipe/SAMSUNG/delicious/delicioust140/",
    ),
    "",
)
TAGINFO = os.path.join(DATA_ROOT, "taginfo.xml")

# --- CONFIGS ---------------------------------------------------------------
MAX_NB_WORDS = 4000          # vocabulary size cap passed to the vectorizer
NB_NEIGHBOURS = 3            # number of neighbours queried per test document
DISTANCE_METRIC = 'cosine'   # metric used by the nearest-neighbour index
WEIGHTS = 'distance'
STOP_WORDS = None
NORM = 'l2'                  # normalisation applied by the TF-IDF transformer
# Text preprocessor handed to the vectorizer. (The earlier `PREPROC=None`
# assignment was dead code: it was unconditionally overwritten by this one.)
PREPROC = clean_text_delicious

In [6]:
# Build the document-metadata frame from the tag-info XML; the interim data
# directory is passed along as well (presumably the cache location - see helper).
docs_df = load_or_get_from_cache(TAGINFO, INTERIM_DATA_ROOT)

In [7]:
# Derive the working sample (metadata plus page contents) from docs_df.
sample_df = make_sample_with_contents_or_get_from_cache(
    docs_df,
    INTERIM_DATA_ROOT,
    DATA_ROOT,
)

In [8]:
# Peek at the first rows of the sample (head() shows 5 rows by default).
sample_df.head()

Unnamed: 0,filename,filetype,hash,unique_tags,url,num_users,num_unique_tags,contents
0,a6a80f81c0d3a4f8849e5cc449d65150.html,html,a6a80f81c0d3a4f8849e5cc449d65150,"quality,programming,process,testing,productivi...",http://www.developer.com/java/other/article.ph...,179,24,\n<HTML>\n<head>\n<TITLE>Effective Code Review...
1,3114064706973e8342a5eeae9f84d6c4.html,html,3114064706973e8342a5eeae9f84d6c4,"mysql,class,php",http://www.webmaster-talk.com/blog/17/php-mysq...,4,3,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T..."
2,bf356f62f1888390bba62c71d7aaeb9b.html,html,bf356f62f1888390bba62c71d7aaeb9b,"dmca,canada,law,censorship,riaa,copyright",http://www.boingboing.net/2008/06/11/canadian-...,10,6,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<!...
3,0be0a4647229c10b8c93ffaadb60fece.html,html,0be0a4647229c10b8c93ffaadb60fece,"vegan,thai,recipes,tofu,peanut,vegetarian,vege...",http://blog.fatfreevegan.com/2008/06/tofu-and-...,11,7,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S..."
4,6bbba01a6e0adc99fbb0550119fde611.html,html,6bbba01a6e0adc99fbb0550119fde611,"scripting,digital,design,studio,architect,firm...",http://www.evandouglis.com/,30,14,"<?xml version=""1.0"" encoding=""iso-8859-1""?>\n<..."


In [9]:
# Model input: the raw contents of every sampled document.
data = sample_df["contents"].values

# Targets: each comma-separated tag string becomes a list of individual tags.
labelsets = (
    sample_df["unique_tags"]
    .map(lambda tags: tags.split(","))
    .values
)

In [10]:
# Learn the full tag vocabulary from every label set in the sample, so the
# train and test label matrices built from it share the same columns.
mlb = MultiLabelBinarizer()
mlb.fit(labelsets)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [11]:
# These stages are wired together by hand rather than through a Pipeline:
# NearestNeighbors is not a regular classifier, so (I think) the Pipeline
# object would have to be customised before that class's methods could be called.
vect = CountVectorizer(
    preprocessor=PREPROC,
    max_features=MAX_NB_WORDS,
    stop_words=STOP_WORDS,
)
tfidf = TfidfTransformer(norm=NORM)
nbrs = NearestNeighbors(
    n_neighbors=NB_NEIGHBOURS,
    metric=DISTANCE_METRIC,
)

In [12]:
# Hold out 25% of the documents for evaluation. A fixed random_state makes
# the split - and every score computed from it - reproducible across
# "Restart & Run All"; without it each run evaluates on a different test set.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labelsets,
    test_size=0.25,
    random_state=42,
)

In [13]:
# Encode both label partitions as indicator rows using the fitted binarizer
# (train first, then test - same order as before).
y_train, y_test = (mlb.transform(split) for split in (y_train, y_test))

In [14]:
# train: term counts -> TF-IDF weights, both fitted on the training texts,
# then index the resulting vectors for nearest-neighbour lookup.
X_train = tfidf.fit_transform(vect.fit_transform(X_train))
nbrs.fit(X_train)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=3, p=2, radius=1.0)

In [15]:
# test: push the held-out texts through the already-fitted vectorizer and
# TF-IDF transformer (no re-fitting here).
X_test = tfidf.transform(vect.transform(X_test))

In [16]:
# Feature-matrix shapes for the train and test partitions.
(X_train.shape, X_test.shape)

((2155, 4000), (719, 4000))

In [17]:
# Label-matrix shapes for the train and test partitions.
(y_train.shape, y_test.shape)

((2155, 6881), (719, 6881))

In [18]:
# Accumulators that the prediction loop in the following cell appends to.
y_preds, y_trues = [], []

# For every test document, fetch its nearest training neighbours.
distances_matrix, indices_matrix = nbrs.kneighbors(X_test)

# Index the training label matrix with the neighbour indices, giving one
# block of neighbour label rows per test document.
neighbour_labels_tensor = y_train[indices_matrix]

tuple(
    arr.shape
    for arr in (distances_matrix, indices_matrix, neighbour_labels_tensor)
)

((719, 3), (719, 3), (719, 3, 6881))

In [19]:
# Predict a label vector for every test document from its neighbours' labels.
#
# The accumulators are created here, inside the cell that fills them. The
# last two statements rebind both names to ndarrays, which have no
# `.append`; relying on lists created in an earlier cell therefore made this
# cell fail when it was executed a second time.
y_preds = []
y_trues = []

for i in tqdm(range(distances_matrix.shape[0])):
    # Distances to, and label rows of, the neighbours of test document i.
    distances = distances_matrix[i].ravel()
    neighbour_labels = neighbour_labels_tensor[i]

    y_pred = get_predicted_labels_from_neighbours(neighbour_labels, distances)

    y_preds.append(y_pred)
    y_trues.append(y_test[i])

# Stack the per-document rows into 2-D arrays for the metric functions.
y_preds = np.array(y_preds)
y_trues = np.array(y_trues)

100%|██████████| 719/719 [00:18<00:00, 38.05it/s]


In [20]:
# Micro-averaged F1 over all (document, label) decisions.
f1_score(y_true=y_trues, y_pred=y_preds, average="micro")

0.12032729022942402