## mimlsvm

mi = multi-instance
ml = multi-label
svm = support vector machine

As described in Shen et al 2009: http://ieeexplore.ieee.org/document/5346261/

> Should we use SVM-struct instead? https://github.com/pystruct/pystruct


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import nltk
import os
import re
import pickle
import sklearn
import sys
import string

from hausdorff import hausdorff

from nltk import TextTilingTokenizer
from scipy.spatial.distance  import directed_hausdorff, pdist

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler,MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.svm import SVC

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [2]:
src_dir = os.path.join(os.getcwd(), os.pardir, '../src')
sys.path.append(src_dir)

In [3]:
%aimport data.movielens_20m_imdb
%aimport helpers.labels,helpers.neighbours, helpers.segments
%aimport utils.dataframes, utils.clusters

In [4]:
from data.movielens_20m_imdb import load_or_get_from_cache
from helpers.labels import truncate_labels
from helpers.neighbours import get_predicted_labels_from_neighbours
from helpers.segments import make_distance_matrix_for_segments,vectorize_segments

from utils.dataframes import sample_rows
from utils.clusters import k_medoids

In [5]:
# Locations of the interim (cached) artefacts and of the raw datasets.
INTERIM_DATA_ROOT = os.path.abspath("../../data/interim/movielens-ml20m-imdb/")
ML_ROOT = "/media/felipe/SAMSUNG/movielens/ml-20m/"
IMDB_ROOT = "/media/felipe/SAMSUNG/imdb/"

# Both roots already end with "/", so the file names are joined with
# os.path.join rather than "+" to avoid a doubled path separator.
PATH_TO_MOVIES = os.path.join(ML_ROOT, "movies.csv")
PATH_TO_TAG_ASSIGNMENTS = os.path.join(ML_ROOT, "tags.csv")
PATH_TO_MOVIE_PLOTS = os.path.join(IMDB_ROOT, "plot.list")

# CONFIGS

# Settings handed to the TfidfVectorizer further down.
MAX_NB_WORDS = 300
PREPROC = None
STOP_WORDS = 'english'
VECTORIZER_NORM = 'l2'

# for sampling
NB_DOCS = 1000

# Pseudosentence size (in words)
W = 20  # not specified in the paper, taken from TextTiling default values

#  Size (in sentences) of the block used in the block comparison method
K = 10  # not specified in the paper, taken from TextTiling default values

MIN_LABEL_DF = 5  # like in the paper

# Number of medoids, expressed as a fraction of the number of training samples.
SAMPLE_TO_NB_MEDOIDS_RATIO = 0.2  # not specified in the paper, but taken from MIMLSVM canonical implementation
SVM_KERNEL = 'poly'  # not specified in the paper, but taken from MIMLSVM canonical implementation
SVM_GAMMA = 0.2  # not specified in the paper, but taken from MIMLSVM canonical implementation
SVM_C = 1  # not specified in the paper, but taken from MIMLSVM canonical implementation
SVM_DEGREE = 4

In [6]:
docs_df = load_or_get_from_cache(PATH_TO_MOVIES,PATH_TO_TAG_ASSIGNMENTS,PATH_TO_MOVIE_PLOTS,INTERIM_DATA_ROOT)

In [7]:
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [8]:
# remove this for production
docs_df = sample_rows(docs_df,NB_DOCS)

In [9]:
docs_df['sentences'] = docs_df['plot'].map(lambda row: sentence_tokenizer.tokenize(row))

In [10]:
%%time
docs_df['num_sentences'] = docs_df['sentences'].map( lambda sents: len(sents))

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.14 ms


In [11]:
docs_df.head()

Unnamed: 0,index,movie_id,title,unique_tags,num_users,num_unique_tags,plot,sentences,num_sentences
0,5849,99178,Struck by Lightning (2012),"writers,teen,adapted-frombook,chris-colfer,reb...",3.0,10,"A high school boy, desperate to escape the idi...","[A high school boy, desperate to escape the id...",4
1,1370,3143,Hell in the Pacific (1968),"toshiro-mifune,john-boorman,toshirô-mifune,ene...",4.0,8,"During World War II, a shot-down American pilo...","[During World War II, a shot-down American pil...",6
2,3083,7478,Swimming to Cambodia (1987),"cambodia,jonathan-demme,stylemonologue,one-man...",8.0,17,Spalding Gray sits behind a desk throughout th...,[Spalding Gray sits behind a desk throughout t...,2
3,11,13,Balto (1995),"sort-of-boring,simon-wells,dogsled,ei-muista,w...",5.0,5,"A half-wolf, half-husky named Balto gets a cha...","[A half-wolf, half-husky named Balto gets a ch...",8
4,4691,60943,Frozen River (2008),"smuggling,trailer-home,financial-problems,frie...",10.0,18,Takes place in the days before Christmas near ...,[Takes place in the days before Christmas near...,12


In [12]:
docs_df.iloc[0]['sentences']

['A high school boy, desperate to escape the idiocy of the people in his hometown, tries to create a way in which he can move to New York, attend the college of his dreams and do something other than live in the footsteps of his drunken, divorced mother.',
 "Along the way he blackmails his fellow students into contributing to his literary magazine and discovers what it's like to feel accomplished.",
 'Does he get accepted into the college of his dreams?',
 'Is he going to make a difference and follow his life goal?']

In [13]:
docs_df.describe()

Unnamed: 0,index,movie_id,num_users,num_unique_tags,num_sentences
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,3196.091,34120.533,16.031,11.74,10.887
std,1815.731887,36774.641119,27.787446,8.359562,7.838474
min,0.0,1.0,2.0,2.0,1.0
25%,1605.25,3714.25,3.0,4.0,5.0
50%,3270.5,8550.0,6.0,9.0,9.0
75%,4693.25,61018.75,15.0,20.0,15.0
max,6307.0,128671.0,308.0,25.0,41.0


In [14]:
tok = TextTilingTokenizer(w=W, k=K)

In [15]:
def extract_segments(candidates):
    """
    Merge candidate sentences into segments using the notebook-level
    TextTiling tokenizer ``tok``.

    :param candidates: list of strings, each one a candidate segment boundary
        (here: the sentences of one document)
    :return: list of strings, one per segment, with the separators that were
        inserted for the tokenizer removed and surrounding whitespace stripped
    """
    paragraph_break = "\n\n"

    try:
        # the text tiling tokenizer expects candidate boundaries to be marked
        # with a blank line, so the candidates are glued together that way.
        tiles = tok.tokenize(paragraph_break.join(candidates))
    except ValueError:
        # raised when there are too few candidates for the tokenizer to find
        # any segment; fall back to the candidates themselves.
        tiles = candidates

    # undo the artificial separators before handing the segments back
    cleaned = []
    for tile in tiles:
        cleaned.append(tile.replace(paragraph_break, " ").strip())

    return cleaned

In [16]:
%%time
docs_df['segments'] = docs_df['sentences'].map(lambda candidates: extract_segments(candidates))

CPU times: user 13.7 s, sys: 0 ns, total: 13.7 s
Wall time: 13.7 s


In [17]:
docs_df['segments'][0]

["A high school boy, desperate to escape the idiocy of the people in his hometown, tries to create a way in which he can move to New York, attend the college of his dreams and do something other than live in the footsteps of his drunken, divorced mother. Along the way he blackmails his fellow students into contributing to his literary magazine and discovers what it's like to feel accomplished. Does he get accepted into the college of his dreams? Is he going to make a difference and follow his life goal?"]

In [18]:
segments = docs_df['segments'].values
documents = docs_df['plot'].values
labelsets = truncate_labels(docs_df["unique_tags"].map(lambda tagstring: tagstring.split(",")).values,MIN_LABEL_DF)

In [19]:
# I can't put this into a pipeline because NearestNeighbors is not a normal classifier, I think
# I need to customize the pipeline object to be able to call the methods for that class.

# TFIDF_VECTORIZER = COUNT_VECTORIZER + TFIDF_TRANSFORMER
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_NB_WORDS, preprocessor=PREPROC, stop_words=STOP_WORDS,norm=VECTORIZER_NORM)
# segments => k-medoids
clf = OneVsRestClassifier(SVC(kernel=SVM_KERNEL,gamma=SVM_GAMMA,C=SVM_C,degree=SVM_DEGREE),n_jobs=4)

In [20]:
# The split is seeded so that re-running the notebook yields the same
# training rows. This matters because the train distance matrix is cached on
# disk further down and must describe exactly these rows.
# NOTE(review): the rows produced by sample_rows() must be reproducible as
# well for the cache to stay valid -- confirm.
segments_train, segments_test, documents_train, documents_test, y_train, y_test = train_test_split(
    segments,
    documents,
    labelsets,
    test_size=0.25,
    random_state=42,
)

In [21]:
# the binarizer needs to be fit on all labels
mlb = MultiLabelBinarizer()
mlb.fit(labelsets)

y_train = mlb.transform(y_train)
y_test = mlb.transform(y_test)

In [22]:
# total number of individual tags
len(mlb.classes_)

433

In [23]:
y_train.shape,y_test.shape

((750, 433), (250, 433))

## train

In [24]:
# train
tfidf_vectorizer.fit(documents_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=300, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [25]:
tfidf_segments_train = vectorize_segments(segments_train, tfidf_vectorizer)

In [26]:
tfidf_segments_train.shape

(750,)

In [27]:
tfidf_segments_train[0]

<6x300 sparse matrix of type '<class 'numpy.float64'>'
	with 67 stored elements in Compressed Sparse Row format>

In [28]:
%%time

# The cache file name tells whether the stored matrix refers to a sample
# (and of which size) or to the full dataset.
if NB_DOCS is None:
    print("NB_DOCS is None")
    path_to_cache = INTERIM_DATA_ROOT.rstrip('/') + "/mimlsvm/distance-matrix-train.p"
else:
    print("NB_DOCS is: {}".format(NB_DOCS))
    path_to_cache = INTERIM_DATA_ROOT.rstrip('/') + "/mimlsvm/distance-matrix-train-sample-{}.p".format(NB_DOCS)

# NOTE(review): the file name only encodes NB_DOCS. Unless the row sampling
# and the train/test split are reproducible across runs, a matrix cached by a
# previous run may not describe the current training rows -- delete the file
# whenever the training set changes.
if os.path.isfile(path_to_cache):
    # pickle is acceptable here only because the file is written by this
    # notebook; never point this at a file from an untrusted source.
    with open(path_to_cache, "rb") as cache_file:
        dist_matrix_train = pickle.load(cache_file)
else:
    dist_matrix_train = make_distance_matrix_for_segments(tfidf_segments_train)
    # make sure the target directory exists before writing the cache
    os.makedirs(os.path.dirname(path_to_cache), exist_ok=True)
    with open(path_to_cache, "wb") as cache_file:
        pickle.dump(dist_matrix_train, cache_file)

NB_DOCS is: 1000
CPU times: user 0 ns, sys: 4 ms, total: 4 ms
Wall time: 3.27 ms


In [29]:
dist_matrix_train.shape

(750, 750)

In [30]:
NB_MEDOIDS = int(len(tfidf_segments_train) * SAMPLE_TO_NB_MEDOIDS_RATIO)
medoids_indices_train = k_medoids(dist_matrix_train,NB_MEDOIDS)[0]

In [31]:
medoids = tfidf_segments_train[medoids_indices_train]

In [32]:
medoids.shape

(150,)

In [33]:
def make_train_dataset(distance_matrix, medoid_indices):
    """
    Build the training features by keeping, for every sample, only its
    distances to the medoids.

    Element (i, j) of the result is the distance from sample i to the j-th
    medoid listed in ``medoid_indices``.

    :param distance_matrix: MxM matrix with pairwise distances
    :param medoid_indices: array of length N with the index of each medoid
    :return: MxN matrix of sample-to-medoid distances
    """
    every_row = slice(None)
    distances_to_medoids = distance_matrix[every_row, medoid_indices]

    return distances_to_medoids

In [34]:
medoids_indices_train

array([  1,  11,  14,  25,  33,  34,  35,  39,  43, 228,  46, 641,  50,
        51,  53,  55,  65,  66,  75,  79,  81,  83,  84,  85,  87,  97,
       107, 111, 116, 122, 131, 134,  37, 139, 143, 144, 145, 331, 152,
       748, 155, 156, 158, 176, 164, 182, 184, 195, 201,  63, 222, 223,
       224,  92, 226, 236, 247, 262, 265, 274, 275, 276, 280, 283, 303,
       248, 314, 316, 160, 731, 693, 342, 347, 330, 356, 359, 362, 364,
       374, 381, 193, 394, 396, 403, 411, 413, 417, 420, 438, 440, 441,
       447, 649, 186, 461, 191, 210, 474, 475, 476, 480, 481, 135, 471,
       497, 498, 515, 527, 528, 545, 549, 227, 554, 556, 404, 566, 576,
       398, 112, 591, 595, 392, 598, 601, 608, 610, 615,  16, 620, 363,
       623, 629, 632, 637, 643, 644,   9, 659,  88, 671, 675, 676, 680,
       704, 706, 318, 733, 739, 741, 631])

In [35]:
X_train = make_train_dataset(dist_matrix_train,medoids_indices_train)

## test

In [36]:
# tfidf has been fit on the training set
tfidf_segments_test = vectorize_segments(segments_test, tfidf_vectorizer)

In [37]:
def make_test_dataset(source_vectorized_segments, medoid_vectorized_segments):
    """
    Calculates the distances from every source document (represented by its segments) to every medoid
    document (also represented by its segments) using the hausdorff distance.

    Returns a matrix where element Aij contains the distance from sample i to medoid j.

    :param source_vectorized_segments: array of length M, where each element is a matrix with one row
        for every segment in a source document
    :param medoid_vectorized_segments: array of length N where each element is a matrix with one row
        for every segment in a medoid document
    :return: distances to medoids (MxN matrix)
    """

    # Densify every medoid once up front: the medoids are the same for all
    # source documents, so converting them inside the inner loop would redo
    # the same work M times.
    dense_medoids = [medoid_segments.toarray() for medoid_segments in medoid_vectorized_segments]

    num_test_samples = len(source_vectorized_segments)
    num_medoids = len(dense_medoids)

    test_dataset = np.zeros((num_test_samples, num_medoids))

    for i, source_segments in enumerate(source_vectorized_segments):
        # likewise, each source document only needs to be densified once
        dense_source = source_segments.toarray()
        for j, dense_medoid in enumerate(dense_medoids):
            test_dataset[i, j] = hausdorff(dense_source, dense_medoid)

    # test_dataset is already a fresh ndarray, no extra copy is needed
    return test_dataset
            

In [38]:
X_test = make_test_dataset(tfidf_segments_test,medoids)

## running

In [None]:
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

In [None]:
X_train_norm[350]

In [None]:
# Fit the one-vs-rest SVM on the sample-to-medoid distances.
# NOTE(review): this uses the raw X_train; the standardized X_train_norm
# computed in the scaler cell above is never passed to the classifier --
# confirm whether that is intentional.
clf.fit(X_train,y_train)

In [None]:
# Predict the binarized label matrix for the test documents.
# NOTE(review): predictions are made on the raw X_test (not X_test_norm),
# which matches the features the classifier was fit on in the cell above.
y_preds = clf.predict(X_test)
y_trues = y_test

In [None]:
y_preds[0].shape

In [None]:
np.allclose(y_preds[77],np.zeros(y_preds.shape[1]))

In [None]:
f1_score(y_trues,y_preds,average='micro')