## ovr-svm

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import nltk
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.metrics import f1_score, precision_score, recall_score,average_precision_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [2]:
# Make the project's `src` directory importable from this notebook; it is
# resolved two levels above the current working directory.
src_dir = os.path.join(os.getcwd(), os.pardir, '../src')
# Guard the append so that re-running this cell does not pile up duplicate
# sys.path entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
%aimport data.movielens_20m_imdb
%aimport helpers.labels,helpers.neighbours, helpers.segments
%aimport utils.dataframes, utils.clusters

In [5]:
from data.movielens_20m_imdb import load_df_or_get_from_cache
from helpers.labels import truncate_labels,filter_tag
from helpers.neighbours import get_predicted_labels_from_neighbours
from helpers.segments import make_distance_matrix_for_segments,vectorize_segments

from utils.dataframes import sample_rows

In [23]:
# Quick manual check of filter_tag on a sample string; the cell output shows
# the value it returned.
filter_tag("the king of the mor_ons-- das")

'the-king-of-the-mor_ons-das'

In [None]:
# Filesystem locations. ML_ROOT and IMDB_ROOT are machine-specific absolute
# paths; INTERIM_DATA_ROOT is resolved relative to the working directory.
INTERIM_DATA_ROOT = os.path.abspath("../../data/interim/movielens-ml20m-imdb/")
ML_ROOT = "/media/felipe/SAMSUNG/movielens/ml-20m/"
IMDB_ROOT = "/media/felipe/SAMSUNG/imdb/"

# os.path.join copes with the trailing separator already present in the roots,
# so the resulting paths no longer contain a doubled "//".
PATH_TO_MOVIES = os.path.join(ML_ROOT, "movies.csv")
PATH_TO_TAG_ASSIGNMENTS = os.path.join(ML_ROOT, "tags.csv")
PATH_TO_MOVIE_PLOTS = os.path.join(IMDB_ROOT, "plot.list")

# CONFIGS

# Vocabulary cap handed to CountVectorizer(max_features=...).
MAX_NB_WORDS = 20000
# Threshold handed to truncate_labels (presumably a minimum label document
# frequency - confirm against helpers.labels).
MIN_LABEL_DF = 20

# for sampling
NB_DOCS = 1500

In [None]:
# Build the documents dataframe with the loader imported from
# data.movielens_20m_imdb. The call must use the imported name,
# load_df_or_get_from_cache; the previous spelling was never defined and
# raised NameError.
docs_df = load_df_or_get_from_cache(
    PATH_TO_MOVIES,
    PATH_TO_TAG_ASSIGNMENTS,
    PATH_TO_MOVIE_PLOTS,
    INTERIM_DATA_ROOT,
)

In [None]:
# Development shortcut: replace docs_df with the result of sample_rows for
# NB_DOCS (presumably a subsample of NB_DOCS rows to keep experiments fast -
# confirm against utils.dataframes). Remove this for production.
docs_df = sample_rows(docs_df,NB_DOCS)

In [None]:
# Peek at the first rows to sanity-check what was loaded.
docs_df.head()

In [None]:
# Summary statistics for docs_df.
docs_df.describe()

In [None]:
# Split every comma-separated tag string into a list of tags, then hand the
# resulting array to truncate_labels together with the MIN_LABEL_DF setting.
tag_lists = docs_df["unique_tags"].map(lambda tags: tags.split(","))
truncated_labels = truncate_labels(tag_lists.values, MIN_LABEL_DF)

In [None]:
# Display the truncate_labels result for inspection.
truncated_labels

In [None]:
# Encode each document's label collection as one row of a binary indicator
# matrix (one column per distinct tag).
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(truncated_labels)
print("total number of unique tags: {} ".format(len(mlb.classes_)))

# Shuffle plots and labels with the same index permutation so that they stay
# aligned. NOTE: the shuffle is unseeded, so the split differs between runs.
data = docs_df['plot'].values
indices = np.arange(len(data))
np.random.shuffle(indices)

data = [data[i] for i in indices]
targets = binary_labels[indices]

# Hold out the last 15% of the shuffled documents for validation. Slice at an
# explicit index rather than with a negative offset: when the validation count
# rounds down to 0, `[:-0]` would leave the training set empty and `[-0:]`
# would put every document into validation.
num_validation_samples = int(0.15 * len(data))
split_index = len(data) - num_validation_samples

X_train = data[:split_index]
Y_train = targets[:split_index]
X_val = data[split_index:]
Y_val = targets[split_index:]

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

In [None]:
# Text-classification pipeline: token counts -> TF-IDF weighting -> LinearSVC.
# Only the classifier step is wrapped in one-vs-rest; the feature steps are
# shared by all labels.
feature_steps = [
    ('vect', CountVectorizer(max_features=MAX_NB_WORDS)),
    ('tfidf', TfidfTransformer()),
]
svm_step = ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=-1))
pipeline = Pipeline(feature_steps + [svm_step])

# Grid entries that are the same in every sub-grid.
shared_grid = {
    "clf__estimator__multi_class": ["crammer_singer", "ovr"],
    "clf__estimator__tol": [0.001, 0.0001],
    "vect__max_features": [MAX_NB_WORDS],
}

# Penalty-specific entries: the l2 sub-grid tries both dual settings, while
# the l1 sub-grid only lists dual=False.
penalty_specific = [
    {"clf__estimator__penalty": ["l2"], "clf__estimator__dual": [False, True]},
    {"clf__estimator__penalty": ["l1"], "clf__estimator__dual": [False]},
]

parameters = [dict(shared_grid, **specific) for specific in penalty_specific]

In [None]:
# Manual grid search: fit the pipeline once per parameter combination and keep
# the combination with the highest micro-averaged F1 on the validation split.
best_score = float("-inf")
# Initialise alongside best_score so the report cell cannot hit a NameError
# when no combination improves on -inf (e.g. every score is NaN).
best_grid = None

for g in ParameterGrid(parameters):
    # Every combination sets the same keys, so nothing leaks from the
    # previously evaluated combination.
    pipeline.set_params(**g)

    pipeline.fit(X_train,Y_train)

    Y_pred_train = pipeline.predict(X_train)
    Y_pred_val = pipeline.predict(X_val)

    # The train score is only printed (to show over-fitting); model selection
    # uses the validation score.
    train_score = f1_score(Y_train,Y_pred_train,average='micro')
    val_score = f1_score(Y_val,Y_pred_val,average='micro')

    current_score = val_score

    print("train micro-F1: {}".format(train_score))
    print("val micro-F1: {}".format(val_score))
    print("grid: {}".format(g))
    print("")

    # Strict comparison: on ties the earliest combination is kept.
    if current_score > best_score:
        best_score = current_score
        best_grid = g

In [None]:
# Report the best validation micro-F1 and the parameter combination that
# achieved it.
print(best_score,best_grid)