## ovr-svm: one-vs-rest calibrated LinearSVC on TF-IDF synopsis features (MovieLens 20M + IMDB tags)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import nltk
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.metrics import f1_score, precision_score, recall_score,average_precision_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression,SGDClassifier

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [5]:
# Make the directory three levels above the notebook's working directory
# importable, so that the `src.*` modules used below can be resolved.
# abspath() collapses the `..` components into a clean absolute path.
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir))

# Re-running this cell must not keep appending the same entry to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [6]:
%aimport src.data.movielens_20m_imdb
%aimport src.helpers.labels,src.helpers.neighbours, src.helpers.segments
%aimport src.utils.dataframes, src.utils.clusters, src.utils.metrics

In [7]:
from src.data.movielens_20m_imdb import load_df_or_get_from_cache
from src.helpers.labels import truncate_labels
from src.helpers.neighbours import get_predicted_labels_from_neighbours
from src.helpers.segments import make_distance_matrix_for_segments,vectorize_segments

from src.utils.dataframes import sample_rows
from src.utils.metrics import ranking

In [9]:
# --- Tunable settings ---
SEED = 42            # passed to np.random.seed for a repeatable shuffle
MAX_NB_WORDS = 5000  # used as the TF-IDF `max_features` value in the parameter grid

# --- Filesystem locations (resolved against the notebook's working directory) ---
PATH_TO_PROCESSED_FILE = os.path.abspath('../../../data/processed/movielens-20m-imdb-tags-and-synopses-2017-12-20.csv')
INTERIM_DATA_ROOT = os.path.abspath("../../../data/interim/movielens-ml20m-imdb/")
MODELS_ROOT = os.path.abspath("../../../models/ranking/movielens-ovr-linear-svc-calibrated/")

In [10]:
# Seed NumPy's global RNG so the np.random.shuffle-based train/validation
# split performed later in this notebook is repeatable.
np.random.seed(SEED)

In [11]:
# Load the processed tags-and-synopses data; later cells read its "tags" and
# "synopsis" columns. Judging by the helper's name it may reuse a cached copy
# under INTERIM_DATA_ROOT — TODO confirm in src.data.movielens_20m_imdb.
docs_df = load_df_or_get_from_cache(PATH_TO_PROCESSED_FILE,INTERIM_DATA_ROOT)

In [12]:
# Each row stores its tags as one comma-separated string; split every string
# into a list of individual tag names.
tag_strings = docs_df["tags"]
labels = tag_strings.map(lambda joined_tags: joined_tags.split(","))

In [13]:
mlb = MultiLabelBinarizer()

# Binary indicator matrix: one row per document, one column per distinct tag.
binary_labels = mlb.fit_transform(labels)

print("total number of unique tags: {} ".format(len(mlb.classes_)))

data = docs_df['synopsis'].values
indices = np.arange(len(data))

# Shuffle positions with NumPy's global RNG, then apply the same permutation
# to both the synopses and the label rows so they stay aligned.
np.random.shuffle(indices)

data = [data[i] for i in indices]
targets = binary_labels[indices]

# Hold out the last 15% of the shuffled documents for validation.
num_validation_samples = int(0.15 * len(data))

# Slice on an explicit boundary rather than a negative offset: when
# num_validation_samples is 0, `data[:-0]` is empty and `data[-0:]` is the
# whole list, which would silently swap the two splits.
split_at = len(data) - num_validation_samples

X_train = data[:split_at]
Y_train = targets[:split_at]
X_val = data[split_at:]
Y_val = targets[split_at:]

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

total number of unique tags: 2138 
total number of train documents: 5704
total number of validation documents: 1006


In [14]:
# Workaround for https://github.com/scikit-learn/scikit-learn/issues/6614
class MyTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose ``fit_transform`` result has its indices sorted.

    Only ``fit_transform`` is overridden; every other method is inherited
    unchanged from ``TfidfVectorizer``.
    """

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the transformed matrix with sorted indices.

        Parameters
        ----------
        X : raw documents, forwarded unchanged to ``TfidfVectorizer.fit_transform``.
        y : forwarded unchanged to the parent. Defaults to ``None`` so the
            method can be called without targets, like the parent's.

        Returns
        -------
        The matrix produced by the parent's ``fit_transform`` after
        ``sort_indices()`` has been called on it.
        """
        result = super(MyTfidfVectorizer, self).fit_transform(X, y)
        # Sorts the sparse matrix's indices in place (see the issue linked above).
        result.sort_indices()
        return result

In [15]:
import warnings

# Silence warnings through the warnings filter instead of replacing
# `warnings.warn` with a no-op: the filter is the supported mechanism and does
# not monkeypatch a standard-library function for every other module.
warnings.filterwarnings("ignore")

# TF-IDF features feeding a one-vs-rest classifier (one binary model per tag).
# NOTE(review): the plain TfidfVectorizer is used here, not the
# MyTfidfVectorizer workaround class defined in this notebook — confirm intended.
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    # LinearSVC is wrapped in CalibratedClassifierCV so the pipeline can offer
    # predict_proba, which the scoring cells call.
    # https://stackoverflow.com/a/39712590/436721
    ('clf', OneVsRestClassifier(CalibratedClassifierCV(LinearSVC(),cv=2),n_jobs=-1)),
])

# Grid of pipeline parameters to iterate over; only the vocabulary size is set.
parameters = [
    {
        "vect__max_features": [MAX_NB_WORDS]
    }
]

In [16]:
%%time

# Fit the pipeline once per parameter combination in the grid and score both
# splits. Predictions are overwritten on every pass, so if the grid holds more
# than one combination only the last one's outputs are kept.
for param_combo in ParameterGrid(parameters):
    pipeline.set_params(**param_combo)
    pipeline.fit(X_train, Y_train)

    # Per-tag scores from predict_proba, train first and then validation.
    Y_pred_train, Y_pred_val = (
        pipeline.predict_proba(documents) for documents in (X_train, X_val)
    )

In [18]:
%%time

# Cutoffs 1..10 at which the ranking metric is reported.
ks = list(range(1, 11))

# For every cutoff print the train score first, then the validation score.
for k in ks:
    train_score = ranking.micro_f1_at_k(Y_train, Y_pred_train, k=k)
    print("train micro-F1 @{}: {}".format(k, train_score))

    val_score = ranking.micro_f1_at_k(Y_val, Y_pred_val, k=k)
    print("validation micro-F1 @{}: {}".format(k, val_score))

train micro-F1 @1: 0.07259265563167525
validation micro-F1 @1: 0.022328423691632716
train micro-F1 @2: 0.13231321879089972
validation micro-F1 @2: 0.034673144876325085
train micro-F1 @3: 0.18003823676967093
validation micro-F1 @3: 0.043800679187746897
train micro-F1 @4: 0.21717615866223897
validation micro-F1 @4: 0.04930773249738767
train micro-F1 @5: 0.24564378361278078
validation micro-F1 @5: 0.055778863397439864
train micro-F1 @6: 0.2675698045229923
validation micro-F1 @6: 0.05923365362081087
train micro-F1 @7: 0.28395595661118384
validation micro-F1 @7: 0.06198875966835457
train micro-F1 @8: 0.2954068991804841
validation micro-F1 @8: 0.06414194915254237
train micro-F1 @9: 0.3032911991057225
validation micro-F1 @9: 0.06588187743141515
train micro-F1 @10: 0.3087001340068034
validation micro-F1 @10: 0.06772619680208686
CPU times: user 11.1 s, sys: 396 ms, total: 11.5 s
Wall time: 11.5 s


In [19]:
%%time

# Cutoffs 1..10 at which the ranking metric is reported.
ks = list(range(1, 11))

# Same report as the un-normalized one, but with normalize=True passed to the
# metric (its exact meaning is defined in src.utils.metrics.ranking).
for k in ks:
    train_score = ranking.micro_f1_at_k(Y_train, Y_pred_train, k=k, normalize=True)
    print("train micro-F1 @{}: {}".format(k, train_score))

    val_score = ranking.micro_f1_at_k(Y_val, Y_pred_val, k=k, normalize=True)
    print("validation micro-F1 @{}: {}".format(k, val_score))

train micro-F1 @1: 0.89726507713885
validation micro-F1 @1: 0.28330019880715707
train micro-F1 @2: 0.8330119214586256
validation micro-F1 @2: 0.23409542743538767
train micro-F1 @3: 0.7759467040673211
validation micro-F1 @3: 0.2094102054340623
train micro-F1 @4: 0.7251928471248247
validation micro-F1 @4: 0.18762425447316103
train micro-F1 @5: 0.6806451612903226
validation micro-F1 @5: 0.17932405566600398
train micro-F1 @6: 0.6423270219728845
validation micro-F1 @6: 0.16749502982107356
train micro-F1 @7: 0.6084201562813063
validation micro-F1 @7: 0.15819369497301902
train micro-F1 @8: 0.5774456521739131
validation micro-F1 @8: 0.15047216699801194
train micro-F1 @9: 0.5496532647654667
validation micro-F1 @9: 0.1440247404462116
train micro-F1 @10: 0.5250175315568022
validation micro-F1 @10: 0.1393638170974155
CPU times: user 11 s, sys: 472 ms, total: 11.4 s
Wall time: 11.4 s
