## ovr-svm

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import nltk
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.externals import joblib
from sklearn.metrics import f1_score, precision_score, recall_score,average_precision_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression,SGDClassifier

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [2]:
# Make the project's `src` directory importable from this notebook.
# The path is collapsed to an absolute, normalised form and only appended
# when missing, so re-running this cell does not stack duplicate entries
# onto sys.path.
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, '../../src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
%aimport data.delicious_t140
%aimport helpers.labels
%aimport utils.dataframes, utils.clusters, utils.metrics


In [4]:
from features.delicious_t140 import clean_text_delicious
from data.delicious_t140 import get_sample_from_cache
from helpers.labels import truncate_labels
from utils.metrics import ranking

In [5]:
# --- Filesystem locations ----------------------------------------------------
# The relative locations are resolved against the working directory at the
# time this cell runs; DATA_ROOT is a machine-specific absolute path.
MODELS_ROOT = os.path.abspath("../../../models/ranking/")
INTERIM_DATA_ROOT = os.path.abspath("../../../data/interim/delicious-t140/")
DATA_ROOT = "/media/felipe/SAMSUNG/delicious/delicioust140"
TAGINFO = "{}/taginfo.xml".format(DATA_ROOT)

# --- Experiment settings -----------------------------------------------------
MAX_NB_WORDS = 5000  # value tried for the vectorizer's max_features
SEED = 42            # seed handed to numpy's global RNG
SAMPLE_FRAC = 0.1    # converted to a percentage when loading the cached sample

In [6]:
# Seed numpy's global RNG; the np.random.shuffle call that builds the
# train/validation split draws from this generator.
np.random.seed(SEED)

In [7]:
# docs_df = load_or_get_from_cache_with_contents(
#     source_dataframe=None,
#     interim_data_root=INTERIM_DATA_ROOT,
#     data_root=None,
#     sample_frac=SAMPLE_FRAC)
# pickle.dump(docs_df,(open(INTERIM_DATA_ROOT+"/docs_df_with_content_SAMPLE_FRAC_20_SEED_42.p","wb")))

In [8]:
# Load the cached document sample. The sample size is passed as a whole-number
# percentage; round() before int() avoids float truncation
# (e.g. int(0.29 * 100) == 28) while SAMPLE_FRAC = 0.1 still yields 10.
docs_df = get_sample_from_cache(INTERIM_DATA_ROOT, int(round(SAMPLE_FRAC * 100)))

In [9]:
# The "tags" column holds one comma-separated string per document;
# split each string into a list of individual tags.
labels = docs_df["tags"].apply(lambda tags: tags.split(","))

In [10]:
# Encode the per-document tag lists as a binary indicator matrix.
mlb = MultiLabelBinarizer()

binary_labels = mlb.fit_transform(labels)

print("total number of unique tags: {} ".format(len(mlb.classes_)))

# Shuffle documents and labels with one shared permutation so rows stay aligned.
data = docs_df['contents'].values
indices = np.arange(len(data))
np.random.shuffle(indices)

data = [data[i] for i in indices]
targets = binary_labels[indices]

# Hold out the last 15% of the shuffled documents for validation.
num_validation_samples = int(0.15 * len(data))

# Slice on an explicit boundary index rather than a negative offset: when the
# hold-out size is 0, `[:-0]` is empty and `[-0:]` is the whole dataset, which
# would leave the training set empty and put everything into validation.
split_at = len(data) - num_validation_samples

X_train = data[:split_at]
Y_train = targets[:split_at]
X_val = data[split_at:]
Y_val = targets[split_at:]

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

total number of unique tags: 8189 
total number of train documents: 12216
total number of validation documents: 2155


In [11]:
# https://github.com/scikit-learn/scikit-learn/issues/6614
class MyTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose ``fit_transform`` output has its indices sorted.

    Workaround for scikit-learn issue #6614 (linked in the comment above this
    class): the matrix returned by the parent ``fit_transform`` is sorted
    in place via ``sort_indices()`` before it is handed on.
    """

    def fit_transform(self, X, y=None):
        """Fit the vectorizer on ``X`` and return the transformed matrix.

        Parameters
        ----------
        X : iterable
            Raw documents, forwarded unchanged to ``TfidfVectorizer.fit_transform``.
        y : optional
            Forwarded unchanged to the parent. Defaults to ``None`` so the
            method can be called with documents only, as on the parent class.

        Returns
        -------
        The matrix produced by the parent ``fit_transform``, after
        ``sort_indices()`` has been called on it.
        """
        result = super(MyTfidfVectorizer, self).fit_transform(X, y)
        # Sorting happens in place on the returned matrix.
        result.sort_indices()
        return result

In [12]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Swap the standard-library function for the no-op above. From here on every
# call that goes through ``warnings.warn`` is silently dropped, process-wide.
warnings.warn = warn

# TF-IDF features feeding a one-vs-rest linear-kernel SVC.
pipeline = Pipeline([
    ('vect', MyTfidfVectorizer()),
    # https://stackoverflow.com/a/39712590/436721
    ('clf', OneVsRestClassifier(
        # probability=True is what makes predict_proba available. The
        # probability estimates rely on randomly shuffled data, so the seed is
        # pinned explicitly: with n_jobs=-1 the fits run in worker processes
        # and seeding numpy's global RNG in the notebook is not enough to make
        # them repeatable.
        SVC(kernel='linear', probability=True, random_state=SEED),
        n_jobs=-1)),
])

# Grid of pipeline parameters to try; currently a single configuration that
# caps the vectorizer's vocabulary at MAX_NB_WORDS features.
parameters = [
    {
        "vect__max_features": [MAX_NB_WORDS]
    }
]

In [13]:
%%time

# Fit the pipeline for every parameter combination in the grid and keep the
# predicted probabilities for both splits. Each pass overwrites the previous
# predictions, so after the loop they belong to the last combination.
for grid_point in ParameterGrid(parameters):
    # set_params returns the estimator itself, so the fit can be chained.
    pipeline.set_params(**grid_point).fit(X_train, Y_train)

    Y_pred_train = pipeline.predict_proba(X_train)
    Y_pred_val = pipeline.predict_proba(X_val)

CPU times: user 10h 48min 23s, sys: 2min 16s, total: 10h 50min 40s
Wall time: 20h 9min 19s


In [14]:
# Persist the fitted pipeline. The context manager guarantees the file handle
# is flushed and closed as soon as the dump finishes (or fails), instead of
# leaving an open handle behind in the kernel.
with open(MODELS_ROOT+"/delicious-ovr-svc.p", "wb") as model_file:
    joblib.dump(pipeline, model_file)

In [17]:
# Print ranking.micro_f1_at_k for k = 1..10, first on the training
# predictions and then on the validation predictions.
ks = list(range(1, 11))

for k in ks:
    train_f1 = ranking.micro_f1_at_k(Y_train, Y_pred_train, k=k)
    print("train micro-F1 @{}: {}".format(k, train_f1))
    val_f1 = ranking.micro_f1_at_k(Y_val, Y_pred_val, k=k)
    print("validation micro-F1 @{}: {}".format(k, val_f1))

train micro-F1 @1: 0.07201284876554738
validation micro-F1 @1: 0.055613090306545154
train micro-F1 @2: 0.1398427649632871
validation micro-F1 @2: 0.09897267172497147
train micro-F1 @3: 0.20237738548743128
validation micro-F1 @3: 0.1323920804933463
train micro-F1 @4: 0.2569980680457169
validation micro-F1 @4: 0.1582041020510255
train micro-F1 @5: 0.3033711839824543
validation micro-F1 @5: 0.17774840008412704
train micro-F1 @6: 0.34184233988093526
validation micro-F1 @6: 0.19320214669051877
train micro-F1 @7: 0.3738374240338448
validation micro-F1 @7: 0.20416805048156758
train micro-F1 @8: 0.40025319069171045
validation micro-F1 @8: 0.21198609231095894
train micro-F1 @9: 0.4214683672287228
validation micro-F1 @9: 0.21735699192994068
train micro-F1 @10: 0.4386893471210916
validation micro-F1 @10: 0.22096261910579038
