## ovr-svm: one-vs-rest linear SVM for predicting movie tags from synopses

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import nltk
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.metrics import f1_score, precision_score, recall_score,average_precision_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression,SGDClassifier

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [2]:
# Make the project's `src` directory importable from this notebook.
# The location is three levels above the current working directory, so it
# depends on where the kernel was started.
src_dir = os.path.join(os.getcwd(), os.pardir, '../../src')
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
%aimport data.movielens_20m_imdb
%aimport helpers.labels,helpers.neighbours, helpers.segments
%aimport utils.dataframes, utils.clusters, utils.metrics

In [4]:
from data.movielens_20m_imdb import load_df_or_get_from_cache
from helpers.labels import truncate_labels
from helpers.neighbours import get_predicted_labels_from_neighbours
from helpers.segments import make_distance_matrix_for_segments,vectorize_segments

from utils.dataframes import sample_rows
from utils.metrics import ranking

In [5]:
# --- Data locations (resolved against the current working directory) ---
PATH_TO_PROCESSED_FILE = os.path.abspath(
    "../../../data/processed/movielens-20m-imdb-tags-and-synopses-2017-12-20.csv"
)
INTERIM_DATA_ROOT = os.path.abspath(
    "../../../data/interim/movielens-ml20m-imdb/"
)

# --- Experiment settings ---
SEED = 42            # passed to np.random.seed
MAX_NB_WORDS = 5000  # value tried for the vectorizer's max_features
NB_DOCS = 3000       # size argument given to sample_rows
MIN_TAG_DF = 10      # threshold argument given to truncate_labels

In [6]:
# Seed NumPy's global RNG so that np.random-based steps later in the notebook
# (e.g. the index shuffle used for the train/validation split) are repeatable.
np.random.seed(SEED)

In [7]:
# Load the processed tags-and-synopses table. Judging by the helper's name it
# may reuse a cached copy under INTERIM_DATA_ROOT -- TODO confirm in
# data.movielens_20m_imdb.
docs_df = load_df_or_get_from_cache(PATH_TO_PROCESSED_FILE,INTERIM_DATA_ROOT)

In [8]:
# Inspect the loaded frame (columns: movie_id, title, synopsis, tags, num_tags).
docs_df

Unnamed: 0,movie_id,title,synopsis,tags,num_tags
0,1,Toy Story (1995),A boy called Andy Davis (voice: John Morris) u...,"disney,unlikely-friendships,fanciful,light,cgi...",62
1,2,Jumanji (1995),The film begins in 1869 in the town of Brantfo...,"scary,not-for-kids,dynamic-cgi-action,childish...",25
2,6,Heat (1995),An inbound Blue Line train pulls in to Firesto...,"great-acting,al-pacino,need-to-own,dumbed-down...",66
3,7,Sabrina (1995),"Sabrina Fairchild (Julia Ormond), is the Larra...","chick-flick,based-on-a-play,clv,drama,paris,no...",13
4,8,Tom and Huck (1995),The film opens with Injun Joe (Eric Schweig) a...,"based-on-a-book,adapted-frombook,seen,library-...",4
5,10,GoldenEye (1995),"The story opens in 1986, in the Cold War Sovie...","sequel,bobola,007,good-dialogue,assassin,bond,...",35
6,11,"American President, The (1995)","Michael Douglas is President Andrew Shepherd, ...","annette-bening,girlie-movie,great-story,white-...",26
7,12,Dracula: Dead and Loving It (1995),Transylvania: 1893. A coach containing Thomas ...,"bd-r,leslie-nielsen,vampire,gothic,mel-brooks,...",6
8,13,Balto (1995),"Nome, Alaska, 1925. A diphtheria epidemic is s...","ei-muista,wolves",2
9,15,Cutthroat Island (1995),"In 1668 Jamaica, Morgan Adams (Geena Davis) a ...","big-budget,sword-fight,pirates,humor,treasure,...",12


In [9]:
# Development shortcut: continue with a subset of the data (NB_DOCS is passed
# to sample_rows as the requested size). Remove this cell for production runs.
# NOTE(review): docs_df is rebound here, so re-running the cell samples from the
# already-reduced frame rather than from the full dataset.
docs_df = sample_rows(docs_df,NB_DOCS)

In [10]:
# Peek at the first rows of the sampled frame.
docs_df.head()

Unnamed: 0,movie_id,title,synopsis,tags,num_tags
0,3970,"Beyond, The (E tu vivrai nel terrore - L'aldil...","New Orleans, Louisiana, 1927. An enraged posse...","lovecraftian-mythology,lucio-fulci,ominous,bd-...",18
1,49769,Something Wild (1961),Adapted from the 1958 novel Mary Ann by Alex K...,bd-r,1
2,5464,Road to Perdition (2002),"Michael Sullivan Sr., is an enforcer to John R...","organized-crime,honor,holes00s,bad-ending,osca...",43
3,6395,"Crazies, The (a.k.a. Code Name: Trixie) (1973)","One night in the rural town of Evans City, Pen...","dvd-r,less-than-300-ratings,paranoid,confronta...",10
4,26812,Barbarians at the Gate (1993),This first appeared on www.realmoviereview.com...,"less-than-300-ratings,reviewed,business-is-the...",5


In [11]:
# Summary statistics of the numeric columns after sampling.
docs_df.describe()

Unnamed: 0,movie_id,num_tags
count,3000.0,3000.0
mean,41063.467333,13.773333
std,39526.745551,16.54436
min,13.0,1.0
25%,3966.5,3.0
50%,27855.0,8.0
75%,74685.75,18.0
max,131082.0,222.0


In [12]:
# Each row stores its tags as one comma-joined string; split it into a list.
labels = docs_df["tags"].map(
    lambda joined_tags: joined_tags.split(",")
)

In [13]:
# Fraction of the shuffled documents held out for validation.
VALIDATION_FRACTION = 0.15

mlb = MultiLabelBinarizer()

# presumably drops tags that occur in fewer than MIN_TAG_DF documents --
# TODO confirm in helpers.labels
truncated_labels = truncate_labels(labels,MIN_TAG_DF)

# Binary indicator matrix with one column per entry of mlb.classes_.
binary_labels = mlb.fit_transform(truncated_labels)

print("total number of unique tags: {} ".format(len(mlb.classes_)))

# Shuffle documents and label rows with the same permutation (the shuffle draws
# from NumPy's global RNG).
data = docs_df['synopsis'].values
indices = np.arange(len(data))
np.random.shuffle(indices)

data = [data[i] for i in indices]
targets = binary_labels[indices]
num_validation_samples = int(VALIDATION_FRACTION * len(data))

# Split on an explicit boundary index. Slicing with `-num_validation_samples`
# misbehaves when that count is 0: `data[:-0]` is empty and `data[-0:]` is the
# whole list, which would swap the train and validation sets.
split_at = len(data) - num_validation_samples

X_train = data[:split_at]
Y_train = targets[:split_at]
X_val = data[split_at:]
Y_val = targets[split_at:]

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

total number of unique tags: 931 
total number of train documents: 2550
total number of validation documents: 450


In [14]:
# https://github.com/scikit-learn/scikit-learn/issues/6614
class MyTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose ``fit_transform`` result has sorted indices.

    Workaround for the scikit-learn issue referenced above. Only
    ``fit_transform`` is overridden; every other method is inherited unchanged.
    """

    def fit_transform(self, X, y=None):
        """Fit the vectorizer on ``X`` and return the resulting matrix.

        Parameters
        ----------
        X : iterable of documents, forwarded to the parent implementation.
        y : ignored by this class and forwarded as-is. It defaults to ``None``
            so the method can be called as ``fit_transform(X)``, matching the
            parent class signature.

        Returns
        -------
        The matrix produced by ``TfidfVectorizer.fit_transform`` after
        ``sort_indices()`` has been called on it.
        """
        result = super(MyTfidfVectorizer, self).fit_transform(X, y)
        # sort_indices() works in place on the returned sparse matrix.
        result.sort_indices()
        return result

In [15]:
import warnings

# Silence warnings for the rest of the session through the supported filter
# API. Replacing `warnings.warn` with a no-op (as this cell used to do) alters
# the stdlib module for the whole process and cannot be undone with
# `warnings.resetwarnings()`.
warnings.filterwarnings("ignore")

pipeline = Pipeline([
    # TF-IDF features from the raw synopsis text.
    ('vect', MyTfidfVectorizer()),
    # https://stackoverflow.com/a/26496300/436721
    # One linear SVC per tag. probability=True enables predict_proba, which the
    # ranking metrics need; random_state pins the shuffling used for those
    # probability estimates so runs are repeatable even in n_jobs workers.
    ('clf', OneVsRestClassifier(
        SVC(kernel='linear', probability=True, random_state=SEED),
        n_jobs=-1,
    )),
])

# Hyper-parameter grid(s) expanded later with ParameterGrid.
parameters = [
    {
        "vect__max_features": [MAX_NB_WORDS]
    }
]

In [16]:
# Placeholder for model selection: initialised here but never updated, because
# the best-score / best-grid bookkeeping was left disabled in this cell.
best_score = float("-inf")

# Fit the pipeline once per parameter combination and report micro-F1@k on the
# training and validation sets.
for g in ParameterGrid(parameters):
    pipeline.set_params(**g)
    pipeline.fit(X_train, Y_train)

    # Scores from predict_proba are what micro_f1_at_k receives below.
    Y_pred_train = pipeline.predict_proba(X_train)
    Y_pred_val = pipeline.predict_proba(X_val)

    for k in (1, 5, 10):
        print("train micro-F1 @{}: {}".format(
            k, ranking.micro_f1_at_k(Y_train, Y_pred_train, k=k)))
        print("validation micro-F1 @{}: {}".format(
            k, ranking.micro_f1_at_k(Y_val, Y_pred_val, k=k)))

    print("grid: {}".format(g))
    print("")

train micro-F1 @1: 0.09473641059276872
validation micro-F1 @1: 0.024060785141409878
train micro-F1 @5: 0.2907005211065645
validation micro-F1 @5: 0.06228042159054615
train micro-F1 @10: 0.32636162085637205
validation micro-F1 @10: 0.07460164171897633
grid: {'vect__max_features': 5000}

