## ovr-svm

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import nltk
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.metrics import f1_score, precision_score, recall_score,average_precision_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [2]:
src_dir = os.path.join(os.getcwd(), os.pardir, '../src')
sys.path.append(src_dir)

In [3]:
%aimport data.movielens_20m_imdb
%aimport helpers.labels,helpers.neighbours, helpers.segments
%aimport utils.dataframes, utils.clusters

In [4]:
from data.movielens_20m_imdb import load_or_get_from_cache
from helpers.labels import truncate_labels
from helpers.neighbours import get_predicted_labels_from_neighbours
from helpers.segments import make_distance_matrix_for_segments,vectorize_segments

from utils.dataframes import sample_rows

In [16]:
INTERIM_DATA_ROOT = os.path.abspath("../../data/interim/movielens-ml20m-imdb/")
ML_ROOT = "/media/felipe/SAMSUNG/movielens/ml-20m/"
IMDB_ROOT = "/media/felipe/SAMSUNG/imdb/"

PATH_TO_MOVIES = ML_ROOT + "/movies.csv"
PATH_TO_TAG_ASSIGNMENTS = ML_ROOT + "/tags.csv"
PATH_TO_MOVIE_PLOTS = IMDB_ROOT+"/plot.list"

# CONFIGS

MAX_NB_WORDS = 20000
MIN_LABEL_DF = int(20)

# for sampling
NB_DOCS = 1500

In [17]:
docs_df = load_or_get_from_cache(PATH_TO_MOVIES,PATH_TO_TAG_ASSIGNMENTS,PATH_TO_MOVIE_PLOTS,INTERIM_DATA_ROOT)

In [18]:
# remove this for production
docs_df = sample_rows(docs_df,NB_DOCS)

In [19]:
docs_df.head()

Unnamed: 0,movie_id,title,unique_tags,num_users,num_unique_tags,plot
0,73256,Heart of Midnight (1988),"bizarre,compelling,interesting-story,betamax",2.0,4,Carol inherits a night club from her weird unc...
1,6281,Phone Booth (2002),"colin-farrell,interesting,kiefer-sutherland,di...",40.0,25,Stu Shepard is a fast talking and wise crackin...
2,6326,City of Ghosts (2002),"gerard-depardieu,gérard-depardieu,matt-dillon",2.0,3,A con man flees to Southeast Asia when an inte...
3,87876,Cars 2 (2011),"plot,not-funny,pixar,bobola,michael-caine,owen...",11.0,19,The famous race car Lightning McQueen and his ...
4,80947,Bran Nue Dae (2009),"geoffrey-rush,australia,road-trip",2.0,3,In the Summer of 1969 a young man is filled wi...


In [20]:
docs_df.describe()

Unnamed: 0,movie_id,num_users,num_unique_tags
count,1500.0,1500.0,1500.0
mean,33582.076,14.996667,11.819333
std,37208.50344,25.735207,8.27867
min,1.0,2.0,2.0
25%,3750.75,3.0,4.0
50%,7940.0,6.0,9.0
75%,63045.25,15.0,19.0
max,128981.0,317.0,25.0


In [21]:
truncated_labels = truncate_labels(docs_df["unique_tags"].map(lambda tagstring: tagstring.split(",")).values,MIN_LABEL_DF)

In [22]:
truncated_labels

[['betamax'],
 ['crime', 'new-york-city', 'serial-killer', 'bad-acting'],
 [],
 ['not-funny', 'bobola', 'sequel'],
 [],
 [],
 [],
 ['suspense',
  'nudity-rear',
  'nudity-topless',
  'silly',
  'thriller',
  "can't-remember",
  'clv'],
 ['action'],
 ['documentary'],
 ['less-than-300-ratings', 'crime', 'religion'],
 ['surreal',
  'scifi',
  'cinematography',
  'romance',
  'coming-of-age',
  'soundtrack',
  'nudity-topless'],
 ['betamax', 'reviewed'],
 [],
 ['drama'],
 ["can't-remember", 'clv'],
 ['animation', 'fantasy', 'funny', 'horror'],
 ['netflix', 'to-see'],
 ['dark-comedy'],
 ['nudity-topless',
  'suspense',
  'dvdvideo',
  'reviewed',
  "erlend's-dvds",
  "tumey's-dvds"],
 ['parody',
  'hilarious',
  'scifi',
  'comedy',
  'funny',
  'classic',
  'seen-at-the-cinema'],
 ['high-school',
  'based-on-a-book',
  'social-commentary',
  'seen-at-the-cinema',
  "erlend's-dvds",
  'bibliothek'],
 [],
 ['politics'],
 ['murder', 'action', 'tense', 'military', 'campy', 'crime'],
 ['stupid'

In [23]:
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(truncated_labels)
print("total number of unique tags: {} ".format(len(mlb.classes_)))

data = docs_df['plot'].values
indices = np.arange(len(data))
np.random.shuffle(indices)

data = [data[i] for i in indices]
targets = binary_labels[indices]
num_validation_samples = int(0.15 * len(data))

X_train = data[:-num_validation_samples]
Y_train = targets[:-num_validation_samples]
X_val = data[-num_validation_samples:]
Y_val = targets[-num_validation_samples:]

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

total number of unique tags: 110 
total number of train documents: 1275
total number of validation documents: 225


In [24]:
# good order (OVR just for the SVM, of course!)
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=MAX_NB_WORDS)),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(),n_jobs=-1)),
])

parameters = [
    { 
          "clf__estimator__penalty": ["l2"],
          "clf__estimator__dual":[False,True],
          "clf__estimator__multi_class":["crammer_singer","ovr"],
          "clf__estimator__tol": [0.001,0.0001],
          "vect__max_features": [MAX_NB_WORDS]  
    },
    { 
          "clf__estimator__penalty": ["l1"],
          "clf__estimator__dual":[False],
          "clf__estimator__multi_class":["crammer_singer","ovr"],
          "clf__estimator__tol": [0.001,0.0001],
          "vect__max_features": [MAX_NB_WORDS]  
    }    
]

In [25]:
best_score = float("-inf")

for g in ParameterGrid(parameters):
    pipeline.set_params(**g)
    
    pipeline.fit(X_train,Y_train)
    
    Y_pred_train = pipeline.predict(X_train)    
    Y_pred_val = pipeline.predict(X_val)
    
    train_score = f1_score(Y_train,Y_pred_train,average='micro')
    val_score = f1_score(Y_val,Y_pred_val,average='micro')
    
    current_score = val_score
    
    print("train micro-F1: {}".format(train_score))
    print("val micro-F1: {}".format(val_score))
    print("grid: {}".format(g))
    print("")
    
    if current_score > best_score:
        best_score = current_score
        best_grid = g

train micro-F1: 0.9987018606663781
val micro-F1: 0.028328611898016998
grid: {'clf__estimator__tol': 0.001, 'clf__estimator__dual': False, 'clf__estimator__penalty': 'l2', 'vect__max_features': 20000, 'clf__estimator__multi_class': 'crammer_singer'}

train micro-F1: 0.9987018606663781
val micro-F1: 0.028328611898016998
grid: {'clf__estimator__tol': 0.0001, 'clf__estimator__dual': False, 'clf__estimator__penalty': 'l2', 'vect__max_features': 20000, 'clf__estimator__multi_class': 'crammer_singer'}

train micro-F1: 0.9997118155619595
val micro-F1: 0.0
grid: {'clf__estimator__tol': 0.001, 'clf__estimator__dual': False, 'clf__estimator__penalty': 'l2', 'vect__max_features': 20000, 'clf__estimator__multi_class': 'ovr'}

train micro-F1: 0.9997118155619595
val micro-F1: 0.0
grid: {'clf__estimator__tol': 0.0001, 'clf__estimator__dual': False, 'clf__estimator__penalty': 'l2', 'vect__max_features': 20000, 'clf__estimator__multi_class': 'ovr'}

train micro-F1: 0.9987018606663781
val micro-F1: 0.028

In [26]:
print(best_score,best_grid)

0.028328611898 {'clf__estimator__tol': 0.001, 'clf__estimator__dual': False, 'clf__estimator__penalty': 'l2', 'vect__max_features': 20000, 'clf__estimator__multi_class': 'crammer_singer'}
