## ovr-svm

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import gc
import os
import re
import pickle
import sklearn
import sys
import string


from datetime import datetime
from sklearn.externals import joblib
from sklearn.metrics import f1_score, precision_score, recall_score,average_precision_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression,SGDClassifier

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [2]:
# Put the directory three levels above the current working directory
# (os.pardir + '../../') on the import path; the `src.*` imports in the
# following cells presumably rely on it.
_src_dir_parts = (os.getcwd(), os.pardir, '../../')
src_dir = os.path.join(*_src_dir_parts)
sys.path.append(src_dir)

In [3]:
%aimport src.data.delicious_t140
%aimport src.helpers.labels
%aimport src.utils.dataframes, src.utils.clusters, src.utils.metrics


In [4]:
from src.features.delicious_t140 import clean_text_delicious
from src.data.delicious_t140 import load_or_get_from_cache_with_contents
from src.helpers.labels import truncate_labels
from src.utils.metrics import ranking
from src.utils.dataframes import sample_rows

In [5]:
# --- filesystem locations ---------------------------------------------------
MODELS_ROOT = os.path.abspath(
    "../../../../models/ranking/delicious-t140-ovr-linear-svc-calibrated"
)
DATA_ROOT = "/media/felipe/SAMSUNG/delicious/delicioust140"
INTERIM_DATA_ROOT = os.path.abspath("/data/interim/delicious-t140/")

# The timestamp in the name gives each run (to the second) its own report file.
OUTPUT_FILE = 'output-linear-svc-{}.txt'.format(
    datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
)

# --- experiment settings ----------------------------------------------------
# Vocabulary cap, passed to the vectorizer as `max_features`.
MAX_NB_WORDS = 5000
# Seed for NumPy's global random number generator.
SEED = 123
# Fraction of the rows kept when sub-sampling the dataset.
SAMPLING_FACTOR = 0.5

# Rows are sampled before tags are truncated, so keep the minimum tag
# document frequency at 10, the same threshold used for the full dataset.
MIN_TAG_DF = 10

In [6]:
# Seed NumPy's global RNG so that the np.random.shuffle call used to build the
# train/validation split yields the same permutation on every run.
np.random.seed(SEED)

In [7]:
# Only the interim-data location is supplied; no source dataframe and no raw
# data root are passed. Presumably this reads a previously cached dataframe
# from INTERIM_DATA_ROOT -- confirm in src.data.delicious_t140.
docs_df = load_or_get_from_cache_with_contents(
    interim_data_root=INTERIM_DATA_ROOT,
    source_dataframe=None,
    data_root=None,
)

In [8]:
# must sample or we'll run out of memory
num_rows = len(docs_df)
num_rows_sample = int(num_rows*SAMPLING_FACTOR)

docs_df = sample_rows(docs_df,num_rows_sample)

In [9]:
# Split every comma-separated tag string into a list of tags, then hand the
# lists to truncate_labels together with MIN_TAG_DF (presumably dropping tags
# that occur in fewer than MIN_TAG_DF documents -- confirm in
# src.helpers.labels).
labels = truncate_labels(
    docs_df["tags"].map(lambda raw_tags: raw_tags.split(",")),
    MIN_TAG_DF,
)

In [10]:
# Encode the per-document tag lists as a binary indicator matrix with one
# column per distinct tag.
mlb = MultiLabelBinarizer()

binary_labels = mlb.fit_transform(labels)

print("total number of unique tags: {} ".format(len(mlb.classes_)))

# Shuffle documents and labels with one shared permutation so that row i of
# `targets` still belongs to document i of `data`.
data = docs_df['contents'].values
indices = np.arange(len(data))
np.random.shuffle(indices)

data = [data[i] for i in indices]
targets = binary_labels[indices]

# Hold out the last 15% of the shuffled documents for validation.
num_validation_samples = int(0.15 * len(data))

# Slice on an explicit, non-negative split index. Negative slicing
# (data[:-n] / data[-n:]) silently inverts the split when n == 0, which
# happens for fewer than 7 documents: the training set would come out empty
# and the validation set would contain everything.
num_train_samples = len(data) - num_validation_samples

X_train = data[:num_train_samples]
Y_train = targets[:num_train_samples]
X_val = data[num_train_samples:]
Y_val = targets[num_train_samples:]

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

total number of unique tags: 5966 
total number of train documents: 61080
total number of validation documents: 10778


In [11]:
# https://github.com/scikit-learn/scikit-learn/issues/6614
class MyTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose ``fit_transform`` returns a matrix with sorted indices.

    Workaround for the scikit-learn issue linked above: the sparse matrix
    produced by the parent class is index-sorted in place before it is handed
    on, presumably so that downstream estimators do not have to sort (and
    therefore write to) the matrix themselves.
    """

    def fit_transform(self, X, y=None):
        """Learn vocabulary and idf, return the index-sorted document-term matrix.

        Parameters
        ----------
        X : iterable of raw documents, forwarded unchanged to the parent class.
        y : ignored by the parent class. It defaults to ``None`` so the method
            keeps the same calling convention as
            ``TfidfVectorizer.fit_transform`` and can be called without labels.

        Returns
        -------
        The sparse matrix returned by ``TfidfVectorizer.fit_transform`` with
        its indices sorted in place.
        """
        result = super(MyTfidfVectorizer, self).fit_transform(X, y)
        result.sort_indices()
        return result

In [12]:
# Silence warnings through the warnings filter instead of replacing
# `warnings.warn` with a no-op. The monkeypatch altered the standard library
# for the whole process and had no effect on code that had already done
# `from warnings import warn` or that warns from C; an "ignore" filter covers
# those cases as well and leaves `warnings.warn` intact.
import warnings
warnings.filterwarnings("ignore")

# LinearSVC has no predict_proba of its own; wrapping it in
# CalibratedClassifierCV supplies one (see
# https://stackoverflow.com/a/39712590/436721). One calibrated SVM is trained
# per tag by OneVsRestClassifier, using all available cores.
_calibrated_svc = CalibratedClassifierCV(LinearSVC(), cv=2)
_one_vs_rest = OneVsRestClassifier(_calibrated_svc, n_jobs=-1)

pipeline = Pipeline(steps=[
    ('vect', MyTfidfVectorizer()),
    ('clf', _one_vs_rest),
])

# Grid of pipeline parameters to try; currently a single configuration.
parameters = [{"vect__max_features": [MAX_NB_WORDS]}]

In [None]:
%%time

# Fit the pipeline once per configuration in the grid and score both splits.
# Predictions are overwritten on every iteration, so only those of the last
# configuration remain available after the loop.
for grid_params in ParameterGrid(parameters):
    pipeline.set_params(**grid_params)
    pipeline.fit(X_train, Y_train)

    Y_pred_train = pipeline.predict_proba(X_train)
    Y_pred_val = pipeline.predict_proba(X_val)

In [None]:
# Sanity check: display the shapes of both prediction matrices.
(Y_pred_train.shape, Y_pred_val.shape)

In [None]:
# joblib.dump(pipeline,open(MODELS_ROOT+"/delicious-ovr-svc.p","wb"))

In [None]:
%%time

# Append the normalized micro-F1 scores for k = 1..10 to the run's report file.
with open(OUTPUT_FILE,'a+') as f:

    # Cut-off values forwarded as `k` to ranking.micro_f1_at_k.
    ks = list(range(1, 11))

    # The header needs its own line terminator; without it the first metric
    # line was glued onto the header
    # ("NORMALIZED MICRO-F1:train micro-F1 @1: ...").
    f.write('NORMALIZED MICRO-F1:\n')
    for k in ks:
        train_score = ranking.micro_f1_at_k(Y_train, Y_pred_train, k=k, normalize=True)
        f.write("train micro-F1 @{}: {}\n".format(k, train_score))

        val_score = ranking.micro_f1_at_k(Y_val, Y_pred_val, k=k, normalize=True)
        f.write("validation micro-F1 @{}: {}\n".format(k, val_score))

    f.write("\n")