## mimlsvm

As described in Shen et al. (2009): http://ieeexplore.ieee.org/document/5346261/

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier,NearestNeighbors
from sklearn.svm import LinearSVC

from nltk import TextTilingTokenizer

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [None]:
# Put the project's `src` directory (two levels above the notebook's working
# directory) on the import path so the `data` and `helpers` packages resolve.
# The path is normalised and only added once, so re-running this cell does not
# pile duplicate entries onto sys.path.
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir, 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
# Register the project modules with the autoreload extension so that edits to
# their source are picked up without restarting the kernel.
%aimport data.movielens_20m_imdb
%aimport helpers.labels,helpers.neighbours

In [None]:
from data.movielens_20m_imdb import load_into_dataframe
from helpers.labels import truncate_labels
from helpers.neighbours import get_predicted_labels_from_neighbours

In [None]:
# Dataset locations. ML_ROOT / IMDB_ROOT are machine-specific absolute paths
# and must be adjusted when the notebook is run elsewhere.
INTERIM_DATA_ROOT = os.path.abspath("../../data/interim/movielens-ml20m-imdb/")
ML_ROOT = "/media/felipe/SAMSUNG/movielens/ml-20m/"
IMDB_ROOT = "/media/felipe/SAMSUNG/imdb/"

# os.path.join is used instead of string concatenation so the result does not
# contain a doubled separator when the root already ends with "/".
PATH_TO_MOVIES = os.path.join(ML_ROOT, "movies.csv")
PATH_TO_TAG_ASSIGNMENTS = os.path.join(ML_ROOT, "tags.csv")
PATH_TO_MOVIE_PLOTS = os.path.join(IMDB_ROOT, "plot.list")

# CONFIGS
# Vocabulary size cap passed to CountVectorizer(max_features=...).
MAX_NB_WORDS = 4000
# Number of neighbours retrieved per test document by NearestNeighbors.
# NOTE(review): this constant was referenced but never defined; 10 is a
# starting value chosen here, not a tuned one -- adjust as needed.
NB_NEIGHBOURS = 10
# Metric handed to NearestNeighbors(metric=...).
DISTANCE_METRIC='cosine'
# Passed to CountVectorizer as preprocessor / stop_words; None keeps defaults.
PREPROC=None
STOP_WORDS=None
# Row normalisation applied by TfidfTransformer(norm=...).
NORM='l2'

In [None]:
# Build the documents dataframe from the MovieLens movie/tag files and the
# IMDB plot list; the last argument ("\n\n") is forwarded to the loader as-is.
docs_df = load_into_dataframe(
    PATH_TO_MOVIES,
    PATH_TO_TAG_ASSIGNMENTS,
    PATH_TO_MOVIE_PLOTS,
    "\n\n",
)

In [None]:
# Features: the raw plot text of every document.
plot_column = docs_df["plot"]
data = plot_column.values

# Targets: each document's comma-separated tag string split into a list of tags.
tag_column = docs_df["unique_tags"]
labelsets = tag_column.map(lambda tags: tags.split(",")).values

In [None]:
# Eyeball the first plot text as a sanity check on the loaded data.
data[0]

In [None]:
# Exploratory: run NLTK's TextTiling tokenizer over the first plot and display
# the result. `tok` is not used by the nearest-neighbour steps visible below.
tok = TextTilingTokenizer()
tok.tokenize(data[0])

In [None]:
# Learn the tag vocabulary from *all* label sets (before the train/test split)
# so that every tag has a column when the splits are binarised later on.
mlb = MultiLabelBinarizer()
mlb.fit(labelsets)

In [None]:
# The three steps are kept as separate objects rather than an sklearn Pipeline:
# NearestNeighbors is a neighbour index, not a regular classifier, so (as far
# as the author could tell) using it in a Pipeline would need a customised
# pipeline object able to call its methods.
vect = CountVectorizer(
    max_features=MAX_NB_WORDS,
    preprocessor=PREPROC,
    stop_words=STOP_WORDS,
)
tfidf = TfidfTransformer(norm=NORM)
nbrs = NearestNeighbors(
    n_neighbors=NB_NEIGHBOURS,
    metric=DISTANCE_METRIC,
)

In [None]:
# Hold out 25% of the documents for evaluation. The seed is fixed so that the
# split -- and therefore the reported F1 score -- is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data, labelsets, test_size=0.25, random_state=42
)

In [None]:
# Replace the lists of tags with indicator rows using the binarizer fitted on
# the full tag vocabulary.
y_train, y_test = mlb.transform(y_train), mlb.transform(y_test)

In [None]:
# train: learn the vocabulary and the IDF weights from the training plots only,
# then index the resulting TF-IDF vectors for neighbour search.
X_train = tfidf.fit_transform(vect.fit_transform(X_train))
nbrs.fit(X_train)

In [None]:
# test: project the held-out plots with the already-fitted vectorizer and
# TF-IDF transformer (no refitting on test data).
X_test = tfidf.transform(vect.transform(X_test))

In [None]:
# Sanity check: both feature matrices should have the same number of columns.
X_train.shape,X_test.shape

In [None]:
# Sanity check: row counts should match the feature matrices displayed above,
# and both label matrices should have the same number of columns.
y_train.shape,y_test.shape

In [None]:
# For every test document, look up its nearest training documents: one row per
# test document holding the neighbour distances and the neighbour row indices.
distances_matrix, indices_matrix = nbrs.kneighbors(X_test)

# Index the binarised training labels with the neighbour indices, giving each
# test document the label rows of its neighbours.
neighbour_labels_tensor = y_train[indices_matrix]

# Accumulators filled in by the prediction loop.
y_preds, y_trues = [], []

distances_matrix.shape, indices_matrix.shape, neighbour_labels_tensor.shape

In [None]:
# Turn each test document's neighbourhood (neighbour label rows + distances)
# into a label prediction.
#
# The results are built inside this cell instead of appending to lists created
# in an earlier cell, so the cell can be re-run on its own: after a first run
# y_preds / y_trues are ndarrays, which have no .append().
y_preds = np.array([
    get_predicted_labels_from_neighbours(neighbour_labels, distances)
    for neighbour_labels, distances in tqdm(
        zip(neighbour_labels_tensor, distances_matrix),
        total=len(distances_matrix),  # zip has no len(); tell tqdm the row count
    )
])

# Row i of y_test is the ground truth for test document i.
y_trues = np.array(y_test)

In [None]:
# Micro-averaged F1: true/false positives and false negatives are pooled over
# all labels before the score is computed.
f1_score(y_trues,y_preds,average='micro')