In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier,NearestNeighbors
from sklearn.svm import LinearSVC

from tqdm import *

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension. Mode 1 re-imports, before each cell runs, only
# the modules that have been registered with %aimport.
%load_ext autoreload
%autoreload 1

In [2]:
# Make the project's `src` tree importable from this notebook.
# The tree is located two directory levels above the current working directory.
# The path is normalised (no `..` segments left in it) and appended only when it
# is not already present, so re-running this cell does not pile up duplicate
# sys.path entries.
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir, 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
# Register the project modules with autoreload so that edits to their source
# files are picked up without restarting the kernel.
%aimport data.movielens_20m_imdb
%aimport helpers.labels,helpers.neighbours

In [4]:
from data.movielens_20m_imdb import load_or_get_from_cache
from helpers.labels import truncate_labels
from helpers.neighbours import get_predicted_labels_from_neighbours

In [5]:
# --- Data locations ---------------------------------------------------------
# Directory holding the cached, pre-processed dataset; resolved against the
# notebook's current working directory.
INTERIM_DATA_ROOT = os.path.abspath("../../data/interim/movielens-ml20m-imdb/")

# Roots of the raw datasets. The defaults are the original machine-specific
# locations; set the ML_ROOT / IMDB_ROOT environment variables to run the
# notebook on another machine without editing this cell.
ML_ROOT = os.environ.get("ML_ROOT", "/media/felipe/SAMSUNG/movielens/ml-20m/")
IMDB_ROOT = os.environ.get("IMDB_ROOT", "/media/felipe/SAMSUNG/imdb/")

# os.path.join avoids the doubled separator ("ml-20m//movies.csv") that plain
# string concatenation produced when the root already ends with a slash.
PATH_TO_MOVIES = os.path.join(ML_ROOT, "movies.csv")
PATH_TO_TAG_ASSIGNMENTS = os.path.join(ML_ROOT, "tags.csv")
PATH_TO_MOVIE_PLOTS = os.path.join(IMDB_ROOT, "plot.list")

# CONFIGS
MAX_NB_WORDS = 4000          # vocabulary cap passed to CountVectorizer(max_features=...)
NB_NEIGHBOURS = 3            # neighbours retrieved per test document
DISTANCE_METRIC = 'cosine'   # metric used by NearestNeighbors
WEIGHTS = 'distance'         # NOTE(review): not referenced by any cell visible here

In [6]:
# Load the movie documents (plots and tags) as a DataFrame.
# NOTE(review): presumably this builds the frame from the three raw files on
# first use and reuses a cached copy under INTERIM_DATA_ROOT afterwards —
# confirm in data.movielens_20m_imdb.
docs_df = load_or_get_from_cache(
    PATH_TO_MOVIES,
    PATH_TO_TAG_ASSIGNMENTS,
    PATH_TO_MOVIE_PLOTS,
    INTERIM_DATA_ROOT,
)

In [7]:
# Inputs: the plot text of every document.
data = docs_df['plot'].values

# Targets: each document's comma-separated tag string is split into a list of
# individual tags.
labelsets = (
    docs_df["unique_tags"]
    .map(lambda tags_csv: tags_csv.split(","))
    .values
)

In [8]:
# Fit the binarizer on the label sets of the whole corpus (i.e. before the
# train/test split) so that both splits are later encoded against the same
# tag vocabulary.
mlb = MultiLabelBinarizer()
mlb.fit(labelsets)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [9]:
# The three stages are kept as separate objects instead of a sklearn Pipeline:
# NearestNeighbors is not a regular classifier (it is queried via kneighbors()),
# so chaining it would probably require a customised pipeline class.
vect = CountVectorizer(
    max_features=MAX_NB_WORDS,
)
tfidf = TfidfTransformer()
nbrs = NearestNeighbors(
    metric=DISTANCE_METRIC,
    n_neighbors=NB_NEIGHBOURS,
)

In [10]:
# Hold out 25% of the documents for evaluation.
# The split is seeded: without a random_state every run shuffles the data
# differently, so the scores reported further down could not be reproduced
# after a kernel restart.
SPLIT_RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labelsets,
    test_size=0.25,
    random_state=SPLIT_RANDOM_STATE,
)

In [11]:
# Encode the tag lists of both splits as binary indicator matrices using the
# binarizer fitted earlier in the notebook.
# NOTE(review): y_train / y_test are overwritten in place, so this cell is not
# idempotent — re-run the train/test split cell before executing it again.
y_train = mlb.transform(y_train)
y_test = mlb.transform(y_test)

In [12]:
# train
X_train = vect.fit_transform(X_train)
X_train = tfidf.fit_transform(X_train)
nbrs.fit(X_train)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=3, p=2, radius=1.0)

In [13]:
# test
X_test = vect.transform(X_test)
X_test = tfidf.transform(X_test)

In [14]:
# Sanity check: both feature matrices must have the same number of columns.
X_train.shape,X_test.shape

((4734, 4000), (1578, 4000))

In [15]:
# Sanity check: row counts must match the feature matrices, and both label
# matrices must have the same number of label columns.
y_train.shape,y_test.shape

((4734, 16787), (1578, 16787))

In [16]:
# Accumulators that are filled by the prediction loop in the next cell.
y_preds = []
y_trues = []

# For every test document: the distances to, and the training-row indices of,
# its nearest neighbours (one row per test document, one column per neighbour).
distances_matrix, indices_matrix = nbrs.kneighbors(X_test)

# Gather the binary label row of each neighbour; indexing with the 2-D index
# matrix yields one (neighbours x labels) slab per test document.
neighbour_labels_tensor = y_train[indices_matrix]    

# Display the three shapes as a sanity check.
distances_matrix.shape, indices_matrix.shape, neighbour_labels_tensor.shape

((1578, 3), (1578, 3), (1578, 3, 16787))

In [17]:
# Predict a label vector for every test document from the labels of its
# nearest training neighbours.
#
# The accumulators are (re)created here so that this cell can be re-run on its
# own: at the end of the cell y_preds / y_trues are converted to ndarrays,
# which have no .append(), so a second run would otherwise fail.
y_preds = []
y_trues = []

for i in tqdm(range(distances_matrix.shape[0])):

    # Distances to, and label rows of, the neighbours of test document i.
    distances = distances_matrix[i].ravel()
    neighbour_labels = neighbour_labels_tensor[i]

    # NOTE(review): how neighbour labels and distances are combined is defined
    # in helpers.neighbours and is not visible in this notebook.
    y_pred = get_predicted_labels_from_neighbours(neighbour_labels, distances)

    y_preds.append(y_pred)
    y_trues.append(y_test[i])

# Stack the per-document vectors into 2-D arrays for the metric functions.
y_preds = np.array(y_preds)
y_trues = np.array(y_trues)

100%|██████████| 1578/1578 [01:47<00:00, 15.84it/s]


In [22]:
# Micro-averaged F1: true/false positives and false negatives are counted
# globally over all (document, label) pairs before the score is computed.
f1_score(y_trues,y_preds,average='micro')

0.044062064849811032