## ovr-svm

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import nltk
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.externals import joblib
from sklearn.metrics import f1_score, precision_score, recall_score,average_precision_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression,SGDClassifier

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Locate the project's `src` directory (three levels above the current working
# directory) and normalize it to an absolute path without `..` components.
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, '../../src'))
# Only append once, so re-running this cell does not keep growing sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
%aimport data.movielens_20m_imdb
%aimport helpers.labels,helpers.neighbours, helpers.segments
%aimport utils.dataframes, utils.clusters, utils.metrics

In [4]:
from data.movielens_20m_imdb import load_df_or_get_from_cache
from helpers.labels import truncate_labels
from helpers.neighbours import get_predicted_labels_from_neighbours
from helpers.segments import make_distance_matrix_for_segments,vectorize_segments

from utils.dataframes import sample_rows
from utils.metrics import ranking

In [17]:
# Filesystem locations; the relative parts are resolved against the current
# working directory by os.path.abspath.
# Directory the fitted pipeline is written to later in the notebook.
MODELS_ROOT = os.path.abspath("../../../models/ranking/")
# Passed to load_df_or_get_from_cache next to the processed file path
# (presumably the cache location -- confirm in data.movielens_20m_imdb).
INTERIM_DATA_ROOT = os.path.abspath("../../../data/interim/movielens-ml20m-imdb/")
# Processed CSV holding the movie synopses and tags.
PATH_TO_PROCESSED_FILE = os.path.abspath('../../../data/processed/movielens-20m-imdb-tags-and-synopses-2017-12-20.csv')

# CONFIGS

# Seed for NumPy's global random number generator.
SEED= 42

# Used as the TF-IDF vectorizer's `max_features` (vocabulary size cap).
MAX_NB_WORDS = 5000

# Threshold handed to truncate_labels (presumably the minimum number of
# documents a tag must occur in to be kept -- confirm in helpers.labels).
MIN_TAG_DF = 10

In [7]:
# Seed NumPy's global RNG so the np.random.shuffle used for the
# train/validation split below gives the same permutation on every run.
np.random.seed(SEED)

In [8]:
# Load the processed movies table (synopsis and tags per movie) through the project
# helper, which receives the processed CSV path and the interim-data directory.
docs_df = load_df_or_get_from_cache(PATH_TO_PROCESSED_FILE,INTERIM_DATA_ROOT)

In [9]:
# Preview the first rows to check the columns that were loaded.
docs_df.head()

Unnamed: 0,movie_id,title,synopsis,tags,num_tags
0,1,Toy Story (1995),A boy called Andy Davis (voice: John Morris) u...,"warm,very-good,3d,heroic-mission,kids-and-fami...",59
1,2,Jumanji (1995),The film begins in 1869 in the town of Brantfo...,"time,dynamic-cgi-action,saturn-award-best-spec...",19
2,6,Heat (1995),An inbound Blue Line train pulls in to Firesto...,"slick,relationships,who-cares-dvds,cliched,gun...",57
3,7,Sabrina (1995),"Sabrina Fairchild (Julia Ormond), is the Larra...","romance,chick-flick,great-cast,greg-kinnear,no...",13
4,8,Tom and Huck (1995),The film opens with Injun Joe (Eric Schweig) a...,"adapted-frombook,based-on-a-book,seen",3


In [10]:
# Summary statistics of the numeric columns (e.g. tags per movie).
docs_df.describe()

Unnamed: 0,movie_id,num_tags
count,6710.0,6710.0
mean,41263.124888,12.214605
std,39409.134389,14.369509
min,1.0,1.0
25%,4106.25,3.0
50%,31251.0,7.0
75%,74531.5,16.0
max,131082.0,189.0


In [11]:
# Turn each comma-separated tag string into a list of individual tags.
labels = docs_df["tags"].map(lambda tagstring: tagstring.split(","))

In [12]:
# Build the multi-label indicator matrix from the tag lists, after passing them
# through truncate_labels with the MIN_TAG_DF threshold.
mlb = MultiLabelBinarizer()

truncated_labels = truncate_labels(labels,MIN_TAG_DF)

binary_labels = mlb.fit_transform(truncated_labels)

print("total number of unique tags: {} ".format(len(mlb.classes_)))

# Shuffle documents and label rows with one shared permutation so they stay aligned.
data = docs_df['synopsis'].values
indices = np.arange(len(data))
np.random.shuffle(indices)

data = [data[i] for i in indices]
targets = binary_labels[indices]

# Hold out the last 15% of the shuffled documents for validation.
num_validation_samples = int(0.15 * len(data))
# Slice with an explicit split position instead of a negative offset: when the
# validation count is 0, `data[:-0]` would be empty and `data[-0:]` would be the
# whole dataset, i.e. train and validation would be swapped.
split_index = len(data) - num_validation_samples

X_train = data[:split_index]
Y_train = targets[:split_index]
X_val = data[split_index:]
Y_val = targets[split_index:]

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

total number of unique tags: 2138 
total number of train documents: 5704
total number of validation documents: 1006


In [13]:
# https://github.com/scikit-learn/scikit-learn/issues/6614
class MyTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose ``fit_transform`` result has its indices sorted.

    Workaround for the scikit-learn issue linked above this class. Only
    ``fit_transform`` is overridden; ``transform`` is inherited unchanged.
    """

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the TF-IDF matrix with sorted indices.

        ``y`` is optional, as in the parent class, so the vectorizer also works
        when called directly as ``fit_transform(X)`` and not only from a
        Pipeline (which passes ``y`` explicitly).
        """
        result = super(MyTfidfVectorizer, self).fit_transform(X, y)
        # Sort in place so downstream estimators receive already-sorted indices.
        result.sort_indices()
        return result

In [14]:
def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accepts any arguments, does nothing."""
    pass
import warnings
# NOTE(review): this rebinds warnings.warn for the whole kernel, so any later
# call to warnings.warn (from any library, not just scikit-learn) is dropped.
warnings.warn = warn

# TF-IDF features feeding a one-vs-rest classifier (one linear-kernel SVC per tag).
pipeline = Pipeline([
    ('vect', MyTfidfVectorizer()),
    # https://stackoverflow.com/a/26496300/436721
    # probability=True is what makes predict_proba available on SVC; the
    # training cell relies on it to obtain per-tag scores.
    ('clf', OneVsRestClassifier(SVC(kernel='linear',probability=True),n_jobs=-1)),
])

# Grid expanded with ParameterGrid in the training cell; it currently holds a
# single configuration (vocabulary capped at MAX_NB_WORDS).
parameters = [
    { 
        "vect__max_features": [MAX_NB_WORDS]
    }
]

In [15]:
%%time

# NOTE(review): best_score is initialised here but never updated in the code
# shown. The saved output of this cell is `NameError: name 'val_score' is not
# defined`, although `val_score` does not appear in the cell -- presumably the
# cell was edited after its last run; re-run on a fresh kernel to confirm.
best_score = float("-inf")

# Fit the pipeline once per parameter combination. Y_pred_train / Y_pred_val are
# overwritten on each iteration, so only the last combination's predictions
# remain for the evaluation cell.
for g in ParameterGrid(parameters):
    pipeline.set_params(**g)
    
    pipeline.fit(X_train,Y_train)
    
    # Per-tag probability estimates for both splits.
    Y_pred_train = pipeline.predict_proba(X_train)    
    Y_pred_val = pipeline.predict_proba(X_val)
       


NameError: name 'val_score' is not defined

In [21]:
# Persist the fitted pipeline. The context manager makes sure the file handle is
# flushed and closed even if serialization fails (the handle was previously
# opened inline and never closed).
with open(os.path.join(MODELS_ROOT, "movielens-ovr-svc.p"), "wb") as model_file:
    joblib.dump(pipeline, model_file)

In [16]:
   
# Cut-off values k at which the ranking metric is reported.
ks = list(range(1, 11))

# Print the micro-F1 @k returned by ranking.micro_f1_at_k for the training
# and validation predictions, one pair of lines per k.
for k in ks:
    train_f1 = ranking.micro_f1_at_k(Y_train, Y_pred_train, k=k)
    print("train micro-F1 @{}: {}".format(k, train_f1))
    val_f1 = ranking.micro_f1_at_k(Y_val, Y_pred_val, k=k)
    print("validation micro-F1 @{}: {}".format(k, val_f1))

train micro-F1 @1: 0.07568990042674253
validation micro-F1 @1: 0.02296958294136093
train micro-F1 @2: 0.13827226156817737
validation micro-F1 @2: 0.035206599396037416
train micro-F1 @3: 0.18838501768328486
validation micro-F1 @3: 0.04561232990835879
train micro-F1 @4: 0.22830046227002398
validation micro-F1 @4: 0.05143642431778025
train micro-F1 @5: 0.2589944619949608
validation micro-F1 @5: 0.05584415584415584
train micro-F1 @6: 0.2819562755428628
validation micro-F1 @6: 0.060041043682204635
train micro-F1 @7: 0.2989723637997706
validation micro-F1 @7: 0.06240258294366511
train micro-F1 @8: 0.31138777919012167
validation micro-F1 @8: 0.06515745944226486
train micro-F1 @9: 0.31981800568188
validation micro-F1 @9: 0.06658240647118302
train micro-F1 @10: 0.32489772917014526
validation micro-F1 @10: 0.06819060506475932


In [None]:
# NOTE(review): this cell is an exact, never-executed copy of the preceding
# evaluation cell (micro-F1 @k for k = 1..10); consider removing it.
ks = [1,2,3,4,5,6,7,8,9,10]

for k in ks:
    print("train micro-F1 @{}: {}".format(k,ranking.micro_f1_at_k(Y_train,Y_pred_train,k=k)))
    print("validation micro-F1 @{}: {}".format(k,ranking.micro_f1_at_k(Y_val,Y_pred_val,k=k)))