## ovr-svm

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import gc
import os
import re
import pickle
import sklearn
import sys
import string


from datetime import datetime
from sklearn.externals import joblib
from sklearn.metrics import f1_score, precision_score, recall_score,average_precision_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression,SGDClassifier

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Three directory levels above the current working directory (os.pardir plus
# '../../'); appended to sys.path so the `src.*` modules can be imported.
src_dir = os.path.join(*[os.getcwd(), os.pardir, '../../'])
sys.path.extend([src_dir])

In [6]:
%aimport src.data.delicious_t140
%aimport src.helpers.labels
%aimport src.utils.dataframes, src.utils.clusters, src.utils.metrics,src.utils.plotting

In [7]:
from src.features.delicious_t140 import clean_text_delicious
from src.data.delicious_t140 import get_sample_from_cache
from src.helpers.labels import truncate_labels
from src.utils.metrics import ranking
from src.utils.dataframes import sample_rows
from src.utils.plotting import plot_micro_f1_at_k

In [8]:
# --- Filesystem locations ---------------------------------------------------
# Relative locations are resolved against the current working directory at the
# moment this cell runs.
MODELS_ROOT = os.path.abspath(
    "../../../models/ranking/delicious-ovr-linear-svc-calibrated"
)
# NOTE(review): machine-specific absolute path; no visible cell reads it.
DATA_ROOT = "/media/felipe/SAMSUNG/delicious/delicioust140"
INTERIM_DATA_ROOT = os.path.abspath(
    "../../../data/interim/delicious-t140/"
)
# File name stamped with the time (to the second) at which this cell is run.
OUTPUT_FILE = ''.join((
    'output-linear-svc-',
    datetime.now().strftime('%Y-%m-%d-%H-%M-%S'),
    '.txt',
))

# --- Experiment settings ----------------------------------------------------
MAX_NB_WORDS = 500  # value used for the vectorizer's max_features
SEED = 42           # seed for NumPy's global RNG
MIN_TAG_DF = 10     # threshold handed to truncate_labels (presumably min tag frequency; confirm)
SAMPLE_FRAC = 20    # sample selector handed to get_sample_from_cache; also part of the model file name

In [9]:
# Seed NumPy's global random number generator so NumPy-based randomness in
# later cells is repeatable across runs.
np.random.seed(SEED)

In [10]:
# Load the documents through the project helper, pointing it at the interim
# data directory and passing SAMPLE_FRAC.
# NOTE(review): presumably returns a cached sample of the Delicious T-140
# corpus as a DataFrame -- confirm in src.data.delicious_t140.
docs_df = get_sample_from_cache(INTERIM_DATA_ROOT,SAMPLE_FRAC)

In [11]:
# Show the first row to check which columns are available.
docs_df.head(1)

Unnamed: 0,filename,filetype,hash,tags,url,num_users,num_tags,contents
0,26313806abe42032de4f612017738426.html,html,26313806abe42032de4f612017738426,"tutoriel,multimedia,english,screencasts,videos...",http://screencasts.ubuntu.com/,228,23,"Ubuntu Screencasts @import ""/misc/drupal.css""..."


In [12]:
# Number of rows loaded into docs_df.
len(docs_df)

28743

In [13]:
# The `tags` column holds comma-separated strings: split each one into a list
# of tags, then pass the lists through truncate_labels with MIN_TAG_DF.
# NOTE(review): truncate_labels presumably drops tags seen in fewer than
# MIN_TAG_DF documents -- confirm in src.helpers.labels.
labels = truncate_labels(
    docs_df["tags"].map(lambda raw_tags: raw_tags.split(",")),
    MIN_TAG_DF,
)

In [14]:
# Re-seed so the shuffle below is reproducible even if this cell is re-run
# on its own.
np.random.seed(SEED)

# Encode the per-document tag lists as a binary indicator matrix with one
# column per entry of mlb.classes_.
mlb = MultiLabelBinarizer()

binary_labels = mlb.fit_transform(labels)

print("total number of unique tags: {} ".format(len(mlb.classes_)))

# Shuffle documents and label rows with the same permutation so they stay
# aligned.
data = docs_df['contents'].values
indices = np.arange(len(data))
np.random.shuffle(indices)

data = [data[i] for i in indices]
targets = binary_labels[indices]

# Hold out the last 15% of the shuffled documents for validation.
num_validation_samples = int(0.15 * len(data))

# Split on an explicit index rather than on `-num_validation_samples`: when the
# count is 0, `seq[:-0]` is empty and `seq[-0:]` is the whole sequence, which
# would put every document in validation and none in training.
split_at = len(data) - num_validation_samples

X_train = data[:split_at]
Y_train = targets[:split_at]
X_val = data[split_at:]
Y_val = targets[split_at:]

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

total number of unique tags: 3305 
total number of train documents: 24432
total number of validation documents: 4311


In [16]:
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (no-op stand-in for ``warnings.warn``)."""


# Rebind warnings.warn to the no-op above, so code that looks up
# `warnings.warn` at call time emits nothing for the rest of the session.
warnings.warn = warn

# TF-IDF features feeding one binary classifier per tag. LinearSVC is wrapped
# in CalibratedClassifierCV so that the pipeline exposes predict_proba.
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    # https://stackoverflow.com/a/39712590/436721
    (
        'clf',
        OneVsRestClassifier(
            CalibratedClassifierCV(LinearSVC(), cv=2),
            n_jobs=-1,
        ),
    ),
])

# Grid with a single setting: the vocabulary size cap for the vectorizer.
parameters = [{"vect__max_features": [MAX_NB_WORDS]}]

In [17]:
%%time

# Cutoffs k at which micro-F1@k is reported. They do not depend on the grid
# point, so they are defined once outside the loop.
ks = [1,2,3,4,5,6,7,8,9,10]

for g in ParameterGrid(parameters):
    print(g)
    pipeline.set_params(**g)

    pipeline.fit(X_train,Y_train)

    # Score the validation set only: nothing in this notebook reads
    # train-set probabilities, and producing them for every training document
    # and every tag is expensive in both time and memory.
    Y_pred_val = pipeline.predict_proba(X_val)

    # k is forwarded to ranking.micro_f1_at_k as its `k` argument.
    for k in ks:
        print("validation micro-F1 @{}: {}".format(k,ranking.micro_f1_at_k(Y_val,Y_pred_val,k=k,normalize=True)))

{'vect__max_features': 500}
validation micro-F1 @1: 0.7574578469520103
validation micro-F1 @2: 0.7055025898956535
validation micro-F1 @3: 0.6647739004749122
validation micro-F1 @4: 0.6334350358600468
validation micro-F1 @5: 0.6067481093659104
validation micro-F1 @6: 0.5827237609928495
validation micro-F1 @7: 0.5605666722316297
validation micro-F1 @8: 0.5414869322506978
validation micro-F1 @9: 0.5231805724725944
validation micro-F1 @10: 0.506676827684154
CPU times: user 7min 46s, sys: 2.79 s, total: 7min 49s
Wall time: 8min 17s


In [18]:
# Persist the fitted pipeline under MODELS_ROOT. The `with` block guarantees
# the file handle is flushed and closed, even if pickling raises.
with open(MODELS_ROOT+'/model-sample-{}.p'.format(SAMPLE_FRAC),'wb') as model_file:
    pickle.dump(pipeline,model_file)

### What about without calibration?

In [19]:
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (no-op stand-in for ``warnings.warn``)."""


# Rebind warnings.warn to the no-op above, so code that looks up
# `warnings.warn` at call time emits nothing for the rest of the session.
warnings.warn = warn

# Uncalibrated variant: LinearSVC goes straight into OneVsRestClassifier,
# with no CalibratedClassifierCV wrapper around it.
pipeline2 = Pipeline([
    ('vect', TfidfVectorizer()),
    # https://stackoverflow.com/a/39712590/436721
    (
        'clf',
        OneVsRestClassifier(LinearSVC(), n_jobs=-1),
    ),
])

# Grid with a single setting: the vocabulary size cap for the vectorizer.
parameters = [{"vect__max_features": [MAX_NB_WORDS]}]

In [20]:
%%time

# Fit the uncalibrated pipeline once per grid point, echoing the settings
# before each fit.
for g in ParameterGrid(parameters):
    print(g)
    pipeline2.set_params(**g)
    pipeline2.fit(X_train, Y_train)

{'vect__max_features': 500}
CPU times: user 5min 40s, sys: 1.56 s, total: 5min 42s
Wall time: 5min 46s


In [22]:
# Decision-function scores from the uncalibrated pipeline for the validation
# documents. This rebinds Y_pred_val, which previously held the calibrated
# pipeline's predict_proba output.
Y_pred_val = pipeline2.decision_function(X_val)

In [25]:
# Raise NumPy's summarization edge-item count so long arrays are printed with
# far more items. This setting is global and stays in effect for later cells.
np.set_printoptions(edgeitems=1000)
# Show the decision scores for the first validation document.
Y_pred_val[0]

array([-0.95304567, -1.12416228, -1.27864622, -0.85497278, -1.31861831,
       -1.03088412, -1.16110081, -1.2490947 , -1.41852517, -1.8885244 ,
       -1.06295711, -1.19435278, -1.1262626 , -1.52296861, -1.16339093,
       -1.21965673, -0.73542541, -1.34699391, -1.48132038, -1.34406803,
       -1.20993766, -1.18292769, -1.57105899, -1.35470617, -1.41016004,
       -1.25377624, -1.11116244, -1.52937452, -1.50462086, -1.32454724,
       -1.27149891, -0.99929117, -1.15050754, -1.23261674, -1.5252132 ,
       -1.79204699, -1.52471997, -1.1062714 , -1.17997948, -1.33805049,
       -1.50880209, -1.40580951, -1.21997362, -1.39104758, -1.21341711,
       -1.54479662, -1.40777569, -1.06913252, -1.50931203, -1.29165048,
       -0.94869557, -1.33223996, -1.3979408 , -1.13489393, -1.42016154,
       -1.39133982, -1.66083136, -1.78306138, -1.4061084 , -1.1750573 ,
       -1.60023845, -1.29892362, -1.35646622, -1.05491478, -1.62205528,
       -1.5265862 , -1.15539357, -0.89994505, -1.22005458, -1.27

In [29]:
# Absolute values of the first validation document's scores.
# NOTE(review): presumably an exploratory check of score magnitudes, since the
# scores displayed for this document are negative.
np.abs(Y_pred_val[0])

array([0.95304567, 1.12416228, 1.27864622, 0.85497278, 1.31861831,
       1.03088412, 1.16110081, 1.2490947 , 1.41852517, 1.8885244 ,
       1.06295711, 1.19435278, 1.1262626 , 1.52296861, 1.16339093,
       1.21965673, 0.73542541, 1.34699391, 1.48132038, 1.34406803,
       1.20993766, 1.18292769, 1.57105899, 1.35470617, 1.41016004,
       1.25377624, 1.11116244, 1.52937452, 1.50462086, 1.32454724,
       1.27149891, 0.99929117, 1.15050754, 1.23261674, 1.5252132 ,
       1.79204699, 1.52471997, 1.1062714 , 1.17997948, 1.33805049,
       1.50880209, 1.40580951, 1.21997362, 1.39104758, 1.21341711,
       1.54479662, 1.40777569, 1.06913252, 1.50931203, 1.29165048,
       0.94869557, 1.33223996, 1.3979408 , 1.13489393, 1.42016154,
       1.39133982, 1.66083136, 1.78306138, 1.4061084 , 1.1750573 ,
       1.60023845, 1.29892362, 1.35646622, 1.05491478, 1.62205528,
       1.5265862 , 1.15539357, 0.89994505, 1.22005458, 1.27076736,
       0.99979918, 1.49436085, 1.01628351, 1.63454155, 0.98644

In [31]:
# Decision-function scores of the uncalibrated pipeline on the validation set.
Y_pred_val = pipeline2.decision_function(X_val)

# Cutoffs 1..10; each k is forwarded to ranking.micro_f1_at_k as its `k` argument.
ks = list(range(1, 11))

for k in ks:
    print("validation micro-F1 @{}: {}".format(
        k,
        ranking.micro_f1_at_k(Y_val, Y_pred_val, k=k, normalize=True),
    ))

validation micro-F1 @1: 0.7565618690510528
validation micro-F1 @2: 0.7081210668264909
validation micro-F1 @3: 0.6716992759205053
validation micro-F1 @4: 0.6415094339622641
validation micro-F1 @5: 0.6134251069441318
validation micro-F1 @6: 0.5896016794350991
validation micro-F1 @7: 0.5687929807920322
validation micro-F1 @8: 0.5494616419919246
validation micro-F1 @9: 0.531175468483816
validation micro-F1 @10: 0.5144471820672306


In [None]:
# Start from a clean current figure and grab its axes for the helper to draw on.
plt.clf()
img = plt.gcf()
ax = plt.gca()
# Validation micro-F1 @1..@10 of the calibrated pipeline, copied from the
# printed output of the fit/evaluate cell. The @8 entry is set to the printed
# value (0.5414869322506978).
validation_scores = [
    0.7574578469520103,0.7055025898956535,
    0.6647739004749122,0.6334350358600468,
    0.6067481093659104,0.5827237609928495,
    0.5605666722316297,0.5414869322506978,
    0.5231805724725944,0.506676827684154
]
plot_micro_f1_at_k(validation_scores,ax=ax)
plt.gcf().set_size_inches(7,5)
# Remove the legend if there is one; get_legend() returns None when the axes
# has no legend, so this no longer raises in that case.
legend = plt.gca().get_legend()
if legend is not None:
    legend.remove()
plt.show()