## ovr-svm: one-vs-rest linear SVM for tag ranking (MovieLens-20M + IMDb synopses)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import nltk
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.externals import joblib
from sklearn.metrics import f1_score, precision_score, recall_score,average_precision_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression,SGDClassifier

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Make the project root (three levels above the notebook's working directory)
# importable so that the `src.*` packages used below can be resolved.
# The path is normalised so it reads cleanly in tracebacks, and the guard keeps
# re-running this cell from piling duplicate entries onto sys.path.
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir))
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
%aimport src.data.movielens_20m_imdb
%aimport src.helpers.labels,src.helpers.neighbours, src.helpers.segments
%aimport src.utils.dataframes, src.utils.clusters, src.utils.metrics

In [4]:
from src.data.movielens_20m_imdb import load_df_or_get_from_cache
from src.helpers.labels import truncate_labels
from src.helpers.neighbours import get_predicted_labels_from_neighbours
from src.helpers.segments import make_distance_matrix_for_segments,vectorize_segments

from src.utils.dataframes import sample_rows
from src.utils.metrics import ranking

In [5]:
# Filesystem locations. The literals are relative, so os.path.abspath resolves
# them against the kernel's current working directory at the time this cell runs.
# MODELS_ROOT holds the cached pipeline and cached predictions (model.p, y-pred-*.p).
MODELS_ROOT = os.path.abspath("../../../models/ranking/movielens-ovr-svc/")
# Second argument to load_df_or_get_from_cache — presumably its cache directory; TODO confirm.
INTERIM_DATA_ROOT = os.path.abspath("../../../data/interim/movielens-ml20m-imdb/")
# Processed CSV handed to load_df_or_get_from_cache as the data source.
PATH_TO_PROCESSED_FILE = os.path.abspath('../../../data/processed/movielens-20m-imdb-tags-and-synopses-2017-12-20.csv')

# CONFIGS

# Seed for numpy's global RNG (drives the shuffle behind the train/validation split).
SEED= 42

# Used as the TF-IDF vectorizer's `max_features` (vocabulary size cap).
MAX_NB_WORDS = 5000

# for sampling
# Passed to truncate_labels; presumably the minimum number of documents a tag
# must appear in to be kept — verify against src.helpers.labels.
MIN_TAG_DF = 10

In [6]:
# Seed numpy's global RNG so the np.random.shuffle used for the train/validation
# split below is reproducible on a fresh kernel.
np.random.seed(SEED)

In [7]:
# Load the movie/synopsis/tags table via the project helper.
# NOTE(review): presumably parses PATH_TO_PROCESSED_FILE once and caches the
# result under INTERIM_DATA_ROOT — confirm in src.data.movielens_20m_imdb.
docs_df = load_df_or_get_from_cache(PATH_TO_PROCESSED_FILE,INTERIM_DATA_ROOT)

In [8]:
# Sanity check: preview the first rows (synopsis text and comma-separated tags).
docs_df.head()

Unnamed: 0,movie_id,title,synopsis,tags,num_tags
0,1,Toy Story (1995),A boy called Andy Davis (voice: John Morris) u...,"buy,want-to-see-again,unlikely-friendships,inn...",59
1,2,Jumanji (1995),The film begins in 1869 in the town of Brantfo...,"childish,robin-williams,time,not-for-kids,adap...",19
2,6,Heat (1995),An inbound Blue Line train pulls in to Firesto...,"rviolence,soundtrack,suspense,dialogue,bibliot...",57
3,7,Sabrina (1995),"Sabrina Fairchild (Julia Ormond), is the Larra...","based-on-a-play,clv,remake,relationships,chick...",13
4,8,Tom and Huck (1995),The film opens with Injun Joe (Eric Schweig) a...,"based-on-a-book,adapted-frombook,seen",3


In [9]:
# Summary statistics of the numeric columns (document count, tags per movie).
docs_df.describe()

Unnamed: 0,movie_id,num_tags
count,6710.0,6710.0
mean,41263.124888,12.214605
std,39409.134389,14.369509
min,1.0,1.0
25%,4106.25,3.0
50%,31251.0,7.0
75%,74531.5,16.0
max,131082.0,189.0


In [10]:
# Each row's "tags" cell is one comma-separated string; split it into a list of
# tag names so the labels can be truncated and binarized downstream.
tags_column = docs_df["tags"]
labels = tags_column.map(lambda tags_csv: tags_csv.split(","))

In [11]:
# Build the multi-label target matrix and a shuffled 85/15 train/validation split.
mlb = MultiLabelBinarizer()

# Drop tags according to MIN_TAG_DF before binarizing
# (exact rule lives in src.helpers.labels.truncate_labels).
truncated_labels = truncate_labels(labels,MIN_TAG_DF)

# One indicator column per tag that survived truncation.
binary_labels = mlb.fit_transform(truncated_labels)

print("total number of unique tags: {} ".format(len(mlb.classes_)))

# Shuffle documents and targets with the same permutation so rows stay aligned.
# The permutation comes from numpy's global RNG, which is seeded earlier in the notebook.
data = docs_df['synopsis'].values
indices = np.arange(len(data))
np.random.shuffle(indices)

data = [data[i] for i in indices]
targets = binary_labels[indices]
num_validation_samples = int(0.15 * len(data))

# Split on an explicit index instead of negative slicing: with zero validation
# samples, `data[:-0]` would yield an EMPTY training set and `data[-0:]` would
# put every document into validation.
split_at = len(data) - num_validation_samples

X_train = data[:split_at]
Y_train = targets[:split_at]
X_val = data[split_at:]
Y_val = targets[split_at:]

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

total number of unique tags: 2138 
total number of train documents: 5704
total number of validation documents: 1006


In [12]:
# https://github.com/scikit-learn/scikit-learn/issues/6614
class MyTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose ``fit_transform`` output has sorted sparse indices.

    Workaround for the scikit-learn issue linked above this class: the matrix
    returned by the parent's ``fit_transform`` is sorted in place via
    ``sort_indices()`` before it is handed to the downstream estimator.
    """

    def fit_transform(self, X, y=None):
        """Fit the vectorizer on ``X`` and return the TF-IDF matrix with sorted indices.

        Parameters
        ----------
        X : iterable of raw documents, forwarded unchanged to the parent class.
        y : ignored by the vectorizer; optional, as in the parent's signature,
            so the class also works when called directly outside a Pipeline.

        Returns
        -------
        The parent's ``fit_transform`` result after an in-place ``sort_indices()``.
        """
        result = super(MyTfidfVectorizer, self).fit_transform(X, y)
        # In-place on the returned sparse matrix; no copy is made here.
        result.sort_indices()
        return result

In [13]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Rebind the module attribute so every later `warnings.warn(...)` call in this
# kernel is silently dropped. This is process-wide and hides ALL warnings that
# go through the attribute, not just the noisy ones.
warnings.warn = warn

In [14]:
%%time

if os.path.isfile(MODELS_ROOT+"/model.p"):
    pipeline=joblib.load(open(MODELS_ROOT+"/model.p","rb"))
        
else:
    pipeline = Pipeline([
        ('vect', MyTfidfVectorizer()),
        # https://stackoverflow.com/a/26496300/436721
        ('clf', OneVsRestClassifier(SVC(kernel='linear',probability=True),n_jobs=-1)),
    ])

    parameters = [
        { 
            "vect__max_features": [MAX_NB_WORDS]
        }
    ]
    
    for g in ParameterGrid(parameters):
        pipeline.set_params(**g)
    
        pipeline.fit(X_train,Y_train)
    
        Y_pred_train = pipeline.predict_proba(X_train)    
        Y_pred_val = pipeline.predict_proba(X_val)
        
    joblib.dump(pipeline,open(MODELS_ROOT+"/model.p","wb"))    

CPU times: user 10.9 s, sys: 4.09 s, total: 15 s
Wall time: 33 s


In [15]:
%%time

if os.path.isfile(MODELS_ROOT+"/y-pred-train.p"):
    Y_pred_train=joblib.load(open(MODELS_ROOT+"/y-pred-train.p","rb"))
else:
    Y_pred_train = pipeline.predict_proba(X_train)    

    joblib.dump(Y_pred_train,open(MODELS_ROOT+"/y-pred-train.p","wb"))

CPU times: user 48 ms, sys: 112 ms, total: 160 ms
Wall time: 1.44 s


In [16]:
%%time

if os.path.isfile(MODELS_ROOT+"/y-pred-val.p"):
    Y_pred_val=joblib.load(open(MODELS_ROOT+"/y-pred-val.p","rb"))
else:
    Y_pred_val = pipeline.predict_proba(X_val)    

    joblib.dump(Y_pred_val,open(MODELS_ROOT+"/y-pred-val.p","wb"))

CPU times: user 0 ns, sys: 36 ms, total: 36 ms
Wall time: 453 ms


In [17]:
%%time

ks = [1,2,3,4,5,6,7,8,9,10]

for k in ks:
    print("train micro-F1 @{}: {}".format(k,ranking.micro_f1_at_k(Y_train,Y_pred_train,k=k)))
    print("validation micro-F1 @{}: {}".format(k,ranking.micro_f1_at_k(Y_val,Y_pred_val,k=k)))

train micro-F1 @1: 0.07550631471157128
validation micro-F1 @1: 0.022648902821316614
train micro-F1 @2: 0.1378104232249038
validation micro-F1 @2: 0.03490170090567705
train micro-F1 @3: 0.1876876467056062
validation micro-F1 @3: 0.04474195338512764
train micro-F1 @4: 0.22747127653941865
validation micro-F1 @4: 0.05123004449097095
train micro-F1 @5: 0.2579004536451345
validation micro-F1 @5: 0.05525681438902281
train micro-F1 @6: 0.28052184391407525
validation micro-F1 @6: 0.05948195030473512
train micro-F1 @7: 0.2975913434845782
validation micro-F1 @7: 0.061752433936022255
train micro-F1 @8: 0.30981205649640664
validation micro-F1 @8: 0.06431106637707262
train micro-F1 @9: 0.31793874046216386
validation micro-F1 @9: 0.06625897098958859
train micro-F1 @10: 0.3231303151575788
validation micro-F1 @10: 0.06782936373737862
CPU times: user 10.9 s, sys: 420 ms, total: 11.3 s
Wall time: 11.3 s


In [18]:
%%time

ks = [1,2,3,4,5,6,7,8,9,10]

for k in ks:
    print("train micro-F1 @{}: {}".format(k,ranking.micro_f1_at_k(Y_train,Y_pred_train,k=k,normalize=True)))
    print("validation micro-F1 @{}: {}".format(k,ranking.micro_f1_at_k(Y_val,Y_pred_val,k=k,normalize=True)))

train micro-F1 @1: 0.9307503506311361
validation micro-F1 @1: 0.28727634194831014
train micro-F1 @2: 0.863429172510519
validation micro-F1 @2: 0.2355864811133201
train micro-F1 @3: 0.8037050023375409
validation micro-F1 @3: 0.21371769383697814
train micro-F1 @4: 0.7531995091164095
validation micro-F1 @4: 0.19458250497017893
train micro-F1 @5: 0.7076437587657785
validation micro-F1 @5: 0.17773359840954275
train micro-F1 @6: 0.6666082281439926
validation micro-F1 @6: 0.16815772034459908
train micro-F1 @7: 0.6309356842316169
validation micro-F1 @7: 0.15762567452428286
train micro-F1 @8: 0.5989437237026648
validation micro-F1 @8: 0.15084493041749503
train micro-F1 @9: 0.5697950755804894
validation micro-F1 @9: 0.1447978793903247
train micro-F1 @10: 0.5435659186535764
validation micro-F1 @10: 0.13956262425447316
CPU times: user 10.9 s, sys: 392 ms, total: 11.3 s
Wall time: 11.3 s
