## ovr-svm: one-vs-rest calibrated LinearSVC on TF-IDF features of MovieLens-20M / IMDb synopses

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import nltk
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.metrics import f1_score, precision_score, recall_score,average_precision_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression,SGDClassifier

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [2]:
# Absolute, normalized path of the directory three levels above the current
# working directory; putting it on sys.path lets the `src.*` project packages
# imported in the following cells resolve.
src_dir = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
)

# Guarded so that re-running this cell does not append duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
# Register the project modules with the autoreload extension. The notebook runs
# `%autoreload 1`, a mode in which only modules listed via %aimport are
# reloaded before each cell executes, so edits to these files are picked up
# without restarting the kernel.
%aimport src.data.movielens_20m_imdb
%aimport src.helpers.labels,src.helpers.neighbours, src.helpers.segments
%aimport src.utils.dataframes, src.utils.clusters, src.utils.metrics

In [4]:
from src.data.movielens_20m_imdb import load_df_or_get_from_cache
from src.helpers.labels import truncate_labels
from src.helpers.neighbours import get_predicted_labels_from_neighbours
from src.helpers.segments import make_distance_matrix_for_segments,vectorize_segments

from src.utils.dataframes import sample_rows
from src.utils.metrics import ranking

In [5]:
# Filesystem locations. os.path.abspath resolves each relative path against the
# working directory the notebook is executed from.
# MODELS_ROOT is not referenced by any of the cells visible in this notebook.
MODELS_ROOT = os.path.abspath("../../../models/ranking/movielens-ovr-linear-svc-calibrated/")
# Passed as the second argument to load_df_or_get_from_cache
# (presumably the cache directory -- TODO confirm against the helper).
INTERIM_DATA_ROOT = os.path.abspath("../../../data/interim/movielens-ml20m-imdb/")
# CSV file handed to load_df_or_get_from_cache as the data source.
PATH_TO_PROCESSED_FILE = os.path.abspath('../../../data/processed/movielens-20m-imdb-tags-and-synopses-2017-12-20.csv')

# CONFIGS

# Seed given to np.random.seed before the documents are shuffled.
SEED= 42

# Used as the TF-IDF vectorizer's `max_features` value in the parameter grid.
MAX_NB_WORDS = 5000

In [6]:
# Seed NumPy's global RNG so the np.random.shuffle call that builds the
# train/validation split gives the same ordering on a fresh top-to-bottom run.
np.random.seed(SEED)

In [7]:
# Load the processed documents table. Later cells read its "tags" and
# "synopsis" columns. The helper's name suggests the parsed frame is cached
# under INTERIM_DATA_ROOT -- TODO confirm in src.data.movielens_20m_imdb.
docs_df = load_df_or_get_from_cache(PATH_TO_PROCESSED_FILE,INTERIM_DATA_ROOT)

In [8]:
# One list of tag strings per document: each "tags" cell is split on commas.
labels = docs_df["tags"].apply(lambda tag_csv: tag_csv.split(","))

In [9]:
# Fraction of the shuffled documents that is held out for validation.
VALIDATION_FRACTION = 0.15

# Binarize the per-document tag lists: one row per document, one column per
# unique tag (column order is given by mlb.classes_).
mlb = MultiLabelBinarizer()

binary_labels = mlb.fit_transform(labels)

print("total number of unique tags: {} ".format(len(mlb.classes_)))

# Keep the raw column under its own name so that `data` is only ever bound to
# the shuffled list (it used to be an ndarray first and a list afterwards).
synopses = docs_df['synopsis'].values
indices = np.arange(len(synopses))

# Draws from NumPy's global RNG, so the resulting order depends on the seed
# set earlier in the notebook.
np.random.shuffle(indices)

data = [synopses[i] for i in indices]
targets = binary_labels[indices]
num_validation_samples = int(VALIDATION_FRACTION * len(data))

# Slice at an explicit boundary index instead of using negative offsets:
# when num_validation_samples is 0 (fewer than 7 documents), `seq[:-0]` is
# empty and `seq[-0:]` is the whole sequence, which would silently put every
# document in the validation set and none in the training set.
split_at = len(data) - num_validation_samples

X_train = data[:split_at]
Y_train = targets[:split_at]
X_val = data[split_at:]
Y_val = targets[split_at:]

# The two parts must partition the shuffled documents exactly.
assert len(X_train) + len(X_val) == len(data)

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

total number of unique tags: 2138 
total number of train documents: 5704
total number of validation documents: 1006


In [10]:
# https://github.com/scikit-learn/scikit-learn/issues/6614
class MyTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose ``fit_transform`` output has sorted indices.

    Workaround for the scikit-learn issue linked above: the matrix produced
    by the parent class is sorted in place with ``sort_indices()`` before it
    is returned. Only ``fit_transform`` is overridden; ``transform`` is
    inherited unchanged.
    """

    def fit_transform(self, X, y=None):
        """Fit the vectorizer on ``X`` and return the document-term matrix.

        Parameters
        ----------
        X : iterable of raw documents, forwarded to the parent class.
        y : ignored by the parent implementation. It defaults to ``None`` so
            the signature matches ``TfidfVectorizer.fit_transform`` and the
            method can be called without targets.

        Returns
        -------
        The parent's ``fit_transform`` result after ``sort_indices()`` has
        been applied to it in place.
        """
        result = super().fit_transform(X, y)
        result.sort_indices()
        return result

In [11]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

# LinearSVC wrapped in CalibratedClassifierCV (2-fold) so that the fitted
# model offers predict_proba, which the fitting cell calls.
# https://stackoverflow.com/a/39712590/436721
calibrated_svc = CalibratedClassifierCV(LinearSVC(), cv=2)

# TF-IDF features feeding one calibrated classifier per tag; n_jobs=-1 asks
# the one-vs-rest wrapper to use all available cores.
pipeline = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(calibrated_svc, n_jobs=-1)),
])

# Parameter grid expanded with ParameterGrid in the fitting cell; it currently
# holds a single point that caps the vocabulary size.
parameters = [{"vect__max_features": [MAX_NB_WORDS]}]

In [12]:
%%time

# Fit the pipeline once for every point of the parameter grid.
for grid_point in ParameterGrid(parameters):
    # set_params returns the pipeline itself, so configuring and fitting chain.
    pipeline.set_params(**grid_point).fit(X_train, Y_train)

    # Predicted scores for both splits. They are overwritten on every
    # iteration, so only the last grid point's predictions remain afterwards.
    Y_pred_train = pipeline.predict_proba(X_train)
    Y_pred_val = pipeline.predict_proba(X_val)

CPU times: user 2min 4s, sys: 808 ms, total: 2min 5s
Wall time: 2min 36s


In [13]:
%%time

# Cut-offs 1..10 at which the ranking metric is reported.
ks = list(range(1, 11))

for k in ks:
    # Train score is computed and printed before the validation score.
    train_f1 = ranking.micro_f1_at_k(Y_train, Y_pred_train, k=k)
    print("train micro-F1 @{}: {}".format(k, train_f1))

    val_f1 = ranking.micro_f1_at_k(Y_val, Y_pred_val, k=k)
    print("validation micro-F1 @{}: {}".format(k, val_f1))

train micro-F1 @1: 0.1353592256119332
validation micro-F1 @1: 0.043681508161544945
train micro-F1 @2: 0.23370427297878882
validation micro-F1 @2: 0.06702241195304162
train micro-F1 @3: 0.3051396660883154
validation micro-F1 @3: 0.08392537016134387
train micro-F1 @4: 0.3568524689161356
validation micro-F1 @4: 0.09398145266695712
train micro-F1 @5: 0.39440454300720257
validation micro-F1 @5: 0.10566391378199497
train micro-F1 @6: 0.4221776245666932
validation micro-F1 @6: 0.11184246916311742
train micro-F1 @7: 0.44231416996677136
validation micro-F1 @7: 0.11674089599161645
train micro-F1 @8: 0.45608356627924085
validation micro-F1 @8: 0.12055149071723657
train micro-F1 @9: 0.4654235359125135
validation micro-F1 @9: 0.12361947196283832
train micro-F1 @10: 0.4717660310183763
validation micro-F1 @10: 0.12686060715739944
CPU times: user 13.8 s, sys: 2.57 s, total: 16.4 s
Wall time: 16.4 s


In [14]:
%%time

# Same cut-offs (1..10) as the previous cell, this time with normalize=True
# forwarded to the project metric. Its exact meaning is defined in
# src.utils.metrics.ranking -- TODO confirm there.
ks = list(range(1, 11))

for k in ks:
    # Train score is computed and printed before the validation score.
    train_f1 = ranking.micro_f1_at_k(Y_train, Y_pred_train, k=k, normalize=True)
    print("train micro-F1 @{}: {}".format(k, train_f1))

    val_f1 = ranking.micro_f1_at_k(Y_val, Y_pred_val, k=k, normalize=True)
    print("validation micro-F1 @{}: {}".format(k, val_f1))

train micro-F1 @1: 0.9458510441692848
validation micro-F1 @1: 0.4415182029434547
train micro-F1 @2: 0.908899622208407
validation micro-F1 @2: 0.3793797825211438
train micro-F1 @3: 0.8738400789733465
validation micro-F1 @3: 0.3463013698630137
train micro-F1 @4: 0.8407093135511406
validation micro-F1 @4: 0.3159656831973216
train micro-F1 @5: 0.8099808061420346
validation micro-F1 @5: 0.30411328388401887
train micro-F1 @6: 0.7822157382532425
validation micro-F1 @6: 0.28693060876968923
train micro-F1 @7: 0.7565438096572772
validation micro-F1 @7: 0.2731731240804316
train micro-F1 @8: 0.7321274763135228
validation micro-F1 @8: 0.26158332433308135
train micro-F1 @9: 0.7093887094138499
validation micro-F1 @9: 0.2517860590847654
train micro-F1 @10: 0.6885396668467703
validation micro-F1 @10: 0.2446344442505671
CPU times: user 13.8 s, sys: 2.6 s, total: 16.4 s
Wall time: 16.4 s
