## ovr-svm

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import nltk
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.metrics import f1_score, precision_score, recall_score,average_precision_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression,SGDClassifier

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [2]:
# Make the project's src/ directory importable from this notebook. The path
# is built relative to the current working directory (three levels up, then
# src/), so it depends on where the notebook kernel was started.
src_dir = os.path.join(os.getcwd(), os.pardir, '../../src')
# Re-running this cell must not stack duplicate entries on sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [5]:
%aimport data.delicious_t140
%aimport features.delicious_t140
%aimport helpers.files,helpers.labels

ImportError: No module named 'helpers.files'

In [None]:
from data.movielens_20m_imdb import load_df_or_get_from_cache
from helpers.labels import truncate_labels
from helpers.neighbours import get_predicted_labels_from_neighbours
from helpers.segments import make_distance_matrix_for_segments,vectorize_segments

from utils.dataframes import sample_rows
from utils.metrics import ranking

In [None]:
# PATHS
# Both are resolved against the current working directory at the time this
# cell runs.
INTERIM_DATA_ROOT = os.path.abspath(
    "../../../data/interim/movielens-ml20m-imdb/"
)
PATH_TO_PROCESSED_FILE = os.path.abspath(
    '../../../data/processed/movielens-20m-imdb-tags-and-synopses-2017-12-20.csv'
)

# CONFIGS

# Seed handed to np.random.seed.
SEED = 42

# Value tried for the vectorizer's `max_features` in the parameter grid.
MAX_NB_WORDS = 5000

# Number of rows requested from sample_rows (sampling only).
NB_DOCS = 3000

# Threshold handed to truncate_labels.
MIN_TAG_DF = 10

In [None]:
# Seed NumPy's global RNG so that np.random-based steps (such as the index
# shuffle done before the train/validation split) are repeatable.
np.random.seed(SEED)

In [None]:
# Load the processed documents through the project helper.
# NOTE(review): judging by its name the helper caches its result under
# INTERIM_DATA_ROOT -- confirm in data.movielens_20m_imdb.
docs_df = load_df_or_get_from_cache(PATH_TO_PROCESSED_FILE,INTERIM_DATA_ROOT)

In [None]:
# Bare expression: the notebook renders the loaded object as the cell output.
docs_df

In [None]:
# Replace the full data with the result of sample_rows(docs_df, NB_DOCS);
# presumably a subsample of NB_DOCS rows -- confirm in utils.dataframes.
# remove this for production
docs_df = sample_rows(docs_df,NB_DOCS)

In [None]:
# Quick look at the first rows (notebook cell output).
docs_df.head()

In [None]:
# Summary statistics of the frame (notebook cell output).
docs_df.describe()

In [None]:
# Turn each row's comma-separated tag string into a list of tag names.
labels = docs_df["tags"].map(
    lambda raw_tags: raw_tags.split(",")
)

In [None]:
# Binarise the tag lists, then shuffle the documents and hold out 15% of them
# for validation.
mlb = MultiLabelBinarizer()

# Project helper called with the MIN_TAG_DF threshold.
# NOTE(review): assumed to return one label list per document, in the original
# order, so that rows of `binary_labels` line up with docs_df['synopsis'] --
# confirm in helpers.labels.
truncated_labels = truncate_labels(labels, MIN_TAG_DF)

# One row per document, one 0/1 column per tag in mlb.classes_.
binary_labels = mlb.fit_transform(truncated_labels)

print("total number of unique tags: {} ".format(len(mlb.classes_)))

data = docs_df['synopsis'].values

# A single shared permutation keeps documents and label rows aligned.
indices = np.arange(len(data))
np.random.shuffle(indices)

data = [data[i] for i in indices]
targets = binary_labels[indices]

num_validation_samples = int(0.15 * len(data))

# Slice on an explicit split position rather than a negative offset: with
# very few documents num_validation_samples is 0, and `data[:-0]` would be
# empty while `data[-0:]` would contain everything (train/validation swapped).
split_at = len(data) - num_validation_samples

X_train = data[:split_at]
Y_train = targets[:split_at]
X_val = data[split_at:]
Y_val = targets[split_at:]

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

In [None]:
# https://github.com/scikit-learn/scikit-learn/issues/6614
class MyTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose ``fit_transform`` result has sorted indices.

    Workaround for the scikit-learn issue linked above: the sparse matrix
    produced while fitting is sorted in place before it is handed on to the
    next step. Only ``fit_transform`` is overridden; ``transform`` is
    inherited unchanged.
    """

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the document-term matrix, indices sorted.

        Args:
            X: raw documents, forwarded to ``TfidfVectorizer.fit_transform``.
            y: forwarded unchanged to the parent. Defaults to ``None`` to
                match the parent's signature, so the vectorizer can also be
                called directly without labels.

        Returns:
            The sparse matrix returned by the parent, after an in-place
            ``sort_indices()``.
        """
        result = super().fit_transform(X, y)
        result.sort_indices()
        return result

In [None]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts anything, does nothing."""


# HACK: replaces warnings.warn for the whole process, so every warning issued
# through it after this point is silently dropped.
warnings.warn = warn

# NOTE(review): the vectorizer step is a plain TfidfVectorizer, not the
# MyTfidfVectorizer subclass defined in this notebook -- confirm that is
# intended.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    # LinearSVC is wrapped in CalibratedClassifierCV so that the fitted
    # pipeline offers predict_proba, which the evaluation loop calls.
    # https://stackoverflow.com/a/39712590/436721
    ('clf', OneVsRestClassifier(
        CalibratedClassifierCV(LinearSVC()),
        n_jobs=-1,
    )),
])

# Grid of pipeline parameters to evaluate (a single combination for now).
parameters = [
    {"vect__max_features": [MAX_NB_WORDS]},
]

In [None]:
best_score = float("-inf")

# Fit the pipeline once per parameter combination and report micro-F1 at
# several cut-offs on both the training and the validation split.
for g in ParameterGrid(parameters):
    pipeline.set_params(**g)
    pipeline.fit(X_train, Y_train)

    Y_pred_train = pipeline.predict_proba(X_train)
    Y_pred_val = pipeline.predict_proba(X_val)

    for k in (1, 5, 10):
        for split_name, y_true, y_score in (
            ("train", Y_train, Y_pred_train),
            ("validation", Y_val, Y_pred_val),
        ):
            score = ranking.micro_f1_at_k(y_true, y_score, k=k)
            print("{} micro-F1 @{}: {}".format(split_name, k, score))

    print("grid: {}".format(g))
    print("")

    # TODO: model selection is not wired up yet. Once a validation score is
    # chosen as `current_score`, keep the best one in `best_score` and the
    # matching grid in `best_grid`.