## One-vs-rest (OvR) SVM with IDF-weighted bag-of-embeddings

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import nltk
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.metrics import f1_score, precision_score, recall_score,average_precision_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression,SGDClassifier

from joblib import Parallel, delayed

from tqdm import *

%load_ext autoreload
%autoreload 1

In [2]:
# Extend the import path to the directory three levels above the current
# working directory (os.pardir followed by '../../').
# NOTE(review): presumably this is the repository root that holds the `src`
# package imported further down — confirm against the project layout.
src_dir = os.path.join(
    os.getcwd(),
    os.pardir,
    '../../',
)
sys.path.append(src_dir)

In [3]:
%aimport src.data.movielens_20m_imdb
%aimport src.helpers.labels,src.helpers.neighbours, src.helpers.segments,src.helpers.embeddings
%aimport src.utils.dataframes, src.utils.clusters, src.utils.metrics,src.utils.plotting

In [4]:
from src.data.movielens_20m_imdb import load_df_or_get_from_cache
from src.helpers.labels import truncate_labels
from src.helpers.neighbours import get_predicted_labels_from_neighbours
from src.helpers.segments import make_distance_matrix_for_segments,vectorize_segments
from src.helpers.embeddings import read_glove_wiki_weighted


from src.utils.dataframes import sample_rows
from src.utils.metrics import ranking
from src.utils.plotting import plot_micro_f1_at_k

In [5]:
# --- Filesystem locations, resolved against the current working directory ---
MODELS_ROOT = os.path.abspath(
    "../../../models/ranking/movielens-ovr-linear-svc-calibrated/"
)
INTERIM_DATA_ROOT = os.path.abspath(
    "../../../data/interim/movielens-ml20m-imdb/"
)
PATH_TO_PROCESSED_FILE = os.path.abspath(
    '../../../data/processed/movielens-20m-imdb-tags-and-synopses-2017-12-20.csv'
)

# --- Experiment configuration ---
# Seed for NumPy's global random number generator.
SEED = 42

# Upper bound on the TF-IDF vocabulary size (used as `max_features`).
MAX_NB_WORDS = 1000

In [6]:
# Seed NumPy's global RNG so the shuffle used for the train/validation split
# is reproducible across runs.
np.random.seed(SEED)

In [7]:
# Load the processed MovieLens/IMDb documents as a DataFrame.
# NOTE(review): judging by the helper's name, INTERIM_DATA_ROOT is used as a
# cache location — confirm in src.data.movielens_20m_imdb.
docs_df = load_df_or_get_from_cache(PATH_TO_PROCESSED_FILE,INTERIM_DATA_ROOT)

In [8]:
# Split each row's comma-separated `tags` string into a list of tag names.
labels = docs_df["tags"].map(lambda tagstring: tagstring.split(","))

In [9]:
# Encode the per-document tag lists as a binary indicator matrix
# (one row per document, one column per distinct tag).
mlb = MultiLabelBinarizer()

binary_labels = mlb.fit_transform(labels)

print("total number of unique tags: {} ".format(len(mlb.classes_)))

data = docs_df['synopsis'].values
indices = np.arange(len(data))

# Shuffle a single index permutation and apply it to both the documents and
# the label matrix so that they stay aligned.
np.random.shuffle(indices)

data = [data[i] for i in indices]
targets = binary_labels[indices]

# Hold out the last 15% of the shuffled documents for validation.
num_validation_samples = int(0.15 * len(data))

# Slice with an explicit boundary instead of negative indices: when the
# validation size rounds down to 0, `data[:-0]` is empty and `data[-0:]` is
# the whole dataset, which would silently invert the split.
split_at = len(data) - num_validation_samples

X_train = data[:split_at]
Y_train = targets[:split_at]
X_val = data[split_at:]
Y_val = targets[split_at:]

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

total number of unique tags: 2138 
total number of train documents: 5704
total number of validation documents: 1006


In [10]:
# Fit on the training documents only: the learned IDF scores are later used
# to weight the word embeddings, so validation text must not influence them.
vect = TfidfVectorizer(max_features=MAX_NB_WORDS)
vect.fit(X_train)

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides. The result
# is materialised as a list and reused instead of querying the vectorizer twice.
_get_feature_names = getattr(vect, "get_feature_names_out", None) or vect.get_feature_names
feature_names = list(_get_feature_names())
idf = vect.idf_

# term -> IDF weight, for every term in the fitted vocabulary
idf_index = dict(zip(feature_names, idf))

## tokenize

In [11]:
# The vectorizer's analyzer is a callable that applies its preprocessing and
# tokenization to a raw document and returns the resulting list of tokens.
tokenize_func = vect.build_analyzer()

In [12]:
def tokenize(string):
    """Return the tokens that the fitted vectorizer's analyzer yields for `string`.

    Thin module-level wrapper around `tokenize_func`.
    NOTE(review): presumably this exists so the call can be dispatched through
    joblib's `delayed` — confirm. The parameter name shadows the imported
    `string` module inside this function.
    """
    tokens = tokenize_func(string)
    return tokens

In [13]:
# Tokenize the training split, then the validation split, each with two
# joblib workers.
X_train_tok, X_val_tok = (
    Parallel(n_jobs=2)(delayed(tokenize)(doc) for doc in split)
    for split in (X_train, X_val)
)

## transform into embeddings

In [14]:
# Load the GloVe embeddings, passing the term -> IDF mapping as weights.
# The recorded output of this cell reports that 1000 of 1000 weights were
# applied and that 400000 embeddings are available in total.
# NOTE(review): `d=100` is presumably the embedding dimensionality — confirm
# in src.helpers.embeddings.
embeddings_index = read_glove_wiki_weighted(
    d=100,
    weight_index=idf_index)

overall, 1000 out of 1000 embeddings were weighted. Total available embeddings: 400000


In [15]:
def build_bag_of_weighted_embeddings(document_tokens, embeddings=None):
    """Average the embedding vectors of the known tokens in a document.

    Parameters
    ----------
    document_tokens : iterable of str
        Tokens of a single document. Tokens without an embedding are skipped.
    embeddings : mapping of str -> array-like, optional
        Token-to-vector lookup. Defaults to the module-level
        `embeddings_index`.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the matched vectors. When no token has an
        embedding, a zero vector with the same shape and dtype as a regular
        result is returned; averaging an empty list would otherwise give a
        scalar NaN and break stacking the documents into one feature matrix.

    Raises
    ------
    ValueError
        If no token matches and `embeddings` is empty, because the vector
        shape cannot be determined.
    """
    if embeddings is None:
        embeddings = embeddings_index

    vectors = [embeddings[token] for token in document_tokens if token in embeddings]

    if vectors:
        return np.mean(np.array(vectors), axis=0)

    reference = next(iter(embeddings.values()), None)
    if reference is None:
        raise ValueError("cannot build a document vector from an empty embedding index")

    # Run one vector through the same mean so the fallback matches the
    # shape and dtype of an ordinary result exactly.
    template = np.mean(np.array([reference]), axis=0)
    return np.zeros_like(template)

In [16]:
# Turn every tokenized document into one averaged embedding vector and stack
# the vectors into a 2-D array, first for the training split and then for
# the validation split (two joblib workers each).
X_train_boe, X_val_boe = (
    np.array(
        Parallel(n_jobs=2)(
            delayed(build_bag_of_weighted_embeddings)(tokenized_doc)
            for tokenized_doc in tokenized_split
        )
    )
    for tokenized_split in (X_train_tok, X_val_tok)
)

In [17]:
# Single-step pipeline: one SVC per tag (one-vs-rest), each wrapped in a
# 2-fold CalibratedClassifierCV; `predict_proba` is called on this pipeline
# later in the notebook. n_jobs=-1 trains the per-tag models on all cores.
pipeline = Pipeline(steps=[
    (
        'clf',
        OneVsRestClassifier(
            estimator=CalibratedClassifierCV(SVC(), cv=2),
            n_jobs=-1,
        ),
    ),
])

In [37]:
# Candidate values for the SVC penalty parameter C: one per power of ten from
# 10**start to 10**stop inclusive (the same range the grid search cell uses).
start = -4
stop = 2

# Number of integer exponents in [start, stop]. Unlike |start| + |stop| + 1,
# this stays correct when both bounds have the same sign.
num = stop - start + 1

# The original assignment was left unfinished (`C = `), which is a syntax error.
C = np.logspace(start, stop, num)

In [42]:
# Display the number of log-spaced C candidates.
num

7

In [None]:
# Full hyper-parameter search space for the calibrated SVC inside the
# one-vs-rest wrapper. `degree` only affects the polynomial kernel, so it is
# searched in a grid of its own; crossing it with 'rbf' and 'linear' would
# refit identical models once per degree value (84 combinations instead of 42).
grid_search_parameters = [
    {
        "clf__estimator__base_estimator__kernel": ['rbf', 'linear'],
        "clf__estimator__base_estimator__C": np.logspace(-4, 2, 7),
    },
    {
        "clf__estimator__base_estimator__kernel": ['poly'],
        "clf__estimator__base_estimator__C": np.logspace(-4, 2, 7),
        "clf__estimator__base_estimator__degree": [2, 3, 4, 5],
    },
]

In [39]:
# Reduced grid that the training loop actually iterates over: a single RBF
# configuration with C = 1.0.
# NOTE(review): the output recorded for the training cell lists C values of
# 0.0001 and 0.001, so it was produced with a different grid than this one.
parameters = [{
    "clf__estimator__base_estimator__kernel": ['rbf'],
    "clf__estimator__base_estimator__C": [1.0],
}]

In [41]:
%%time

# Fit and score the pipeline once per hyper-parameter combination. The
# predictions and scores of the last combination stay bound after the loop.
for g in ParameterGrid(parameters):
    print(g)

    # set_params() returns the estimator itself, so configuring and fitting
    # can be chained.
    pipeline.set_params(**g).fit(X_train_boe, Y_train)

    Y_pred_train = pipeline.predict_proba(X_train_boe)
    Y_pred_val = pipeline.predict_proba(X_val_boe)

    # cut-offs k = 1 .. 10
    ks = list(range(1, 11))

    train_scores = [
        ranking.micro_f1_at_k(Y_train, Y_pred_train, k=k, normalize=True)
        for k in ks
    ]
    validation_scores = [
        ranking.micro_f1_at_k(Y_val, Y_pred_val, k=k, normalize=True)
        for k in ks
    ]

    # first line printed: training scores; second line: validation scores
    print(train_scores)
    print(validation_scores)
    

{'clf__estimator__base_estimator__kernel': 'rbf', 'clf__estimator__base_estimator__C': 0.0001}


  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])
  E = np.exp(AB[0] * F + AB[1])


[0.27072911929665, 0.2535211267605634, 0.2319574314201581, 0.21993368441583772, 0.2120803686173714, 0.2048780487804878, 0.19795098614433362, 0.19207606973058636, 0.1872561309345151, 0.18199203187250995]
[0.2655172413793103, 0.23353819139596138, 0.2226148409893993, 0.20876919652793233, 0.2029296177206145, 0.19686333084391336, 0.18878600823045266, 0.18329571106094808, 0.17753623188405798, 0.17190623296383792]
{'clf__estimator__base_estimator__kernel': 'rbf', 'clf__estimator__base_estimator__C': 0.001}
[0.3392051244722667, 0.30238095238095236, 0.2769106837176518, 0.2594499752069268, 0.24600246002460024, 0.23619450099208905, 0.2290821191759253, 0.22139070782662926, 0.21467944147872506, 0.2086552352239181]
[0.30782169890664424, 0.2677572105036591, 0.25246091488129707, 0.23315038419319428, 0.2229288111641053, 0.2123500666370502, 0.20448750637429883, 0.19854504756575267, 0.1924535835496107, 0.18804034582132564]
{'clf__estimator__base_estimator__kernel': 'rbf', 'clf__estimator__base_estimator_

In [None]:
# Recompute the score curves for cut-offs k = 1 .. 10 from whatever
# predictions are currently bound to Y_pred_train / Y_pred_val.
ks = list(range(1, 11))

train_scores = [
    ranking.micro_f1_at_k(Y_train, Y_pred_train, k=k, normalize=True)
    for k in ks
]
validation_scores = [
    ranking.micro_f1_at_k(Y_val, Y_pred_val, k=k, normalize=True)
    for k in ks
]

In [None]:
# Plot the validation and training score curves on a cleared figure.
plt.clf()
# NOTE(review): `img` is not used again in the visible code.
img = plt.gcf()
ax = plt.gca()
# The project helper draws onto `ax`; validation scores are passed first,
# training scores last.
plot_micro_f1_at_k(validation_scores,ax,train_scores)
# Resize whichever figure is current after the helper has run (7 x 5 inches).
plt.gcf().set_size_inches(7,5)
plt.show()