see http://krex.k-state.edu/dspace/bitstream/handle/2097/9785/RahulChoubey2011.pdf

This is an implementation of the **second** of the two methods described in the thesis linked above, namely topic distance.

Extract topics from the dataset using LDA; then, for each test document, find its most significant topics. Next, find the training-set documents with the most similar topic distributions (as measured by KL-divergence) and propagate the tags from the single most similar document.

> This is the same thing as doing kNN where k=1 and the distance measure between representations is the KL-divergence.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import os
import re
import pickle
import sklearn
import sys
import string

from datetime import datetime
from scipy import stats

from sklearn.externals import joblib
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier,NearestNeighbors

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [None]:
# Make the project root importable so the `src.*` modules used below resolve.
# The root sits three directory levels above the current working directory.
src_dir = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
)

# Guard the append so that re-running this cell does not pile up duplicates.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
%aimport src.data.movielens_20m_imdb
%aimport src.helpers.labels,src.helpers.neighbours, src.helpers.topics
%aimport src.utils.dataframes, src.utils.clusters, src.utils.metrics

In [None]:
from src.data.movielens_20m_imdb import load_df_or_get_from_cache
from src.helpers.labels import truncate_labels
from src.helpers.neighbours import get_predicted_labels_from_neighbours
from src.helpers.segments import make_distance_matrix_for_segments,vectorize_segments

from src.utils.dataframes import sample_rows
from src.utils.metrics import ranking

In [None]:
# Filesystem locations, resolved against the current working directory.
MODELS_ROOT = os.path.abspath("../../../models/ranking/movielens-topics/")
INTERIM_DATA_ROOT = os.path.abspath("../../../data/interim/movielens-ml20m-imdb/")
PATH_TO_PROCESSED_FILE = os.path.abspath('../../../data/processed/movielens-20m-imdb-tags-and-synopses-2017-12-13.csv')
# Results file name, timestamped at the moment this cell is executed.
OUTPUT_FILE = 'output-topic-distances-' + datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + '.txt'

# CONFIGS

SEED = 42

# upper bound on the vocabulary size of the count vectorizer
MAX_NB_WORDS = 5000

# for sampling
MIN_TAG_DF = 10

NB_NEIGHBOURS = 1
# KL-divergence between two topic distributions: with two arguments,
# scipy.stats.entropy(pk, qk) returns sum(pk * log(pk / qk)).
DISTANCE_METRIC = stats.entropy
STOP_WORDS = 'english'  # using stopwords since most people using LDA do this
# topic counts for which an LDA model is built
NB_COMPONENTS = [100, 200, 400]

In [None]:
# Seed NumPy's global RNG so that the index shuffle performed later is repeatable.
np.random.seed(SEED)

In [None]:
# Load the processed documents. Judging by the helper's name, it reuses a cached
# copy under INTERIM_DATA_ROOT when one exists — TODO confirm in
# src.data.movielens_20m_imdb.
docs_df = load_df_or_get_from_cache(PATH_TO_PROCESSED_FILE,INTERIM_DATA_ROOT)

In [None]:
# Quick look at the first rows of the loaded data.
docs_df.head()

In [None]:
# Summary statistics of the loaded data.
docs_df.describe()

In [None]:
# Raw synopsis text, one entry per document.
data = docs_df['synopsis'].values

# Every comma-separated tag string is turned into a list of individual tags.
labels = docs_df["tags"].map(lambda raw_tags: raw_tags.split(","))

In [None]:
# Presumably removes tags that occur in fewer than MIN_TAG_DF documents —
# TODO confirm in src.helpers.labels.
truncated_labels = truncate_labels(labels, MIN_TAG_DF)

# One binary indicator column per remaining tag.
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(truncated_labels)

print("total number of unique tags: {} ".format(len(mlb.classes_)))

# Shuffle documents and label rows with one shared permutation so they stay aligned.
indices = np.arange(len(data))
np.random.shuffle(indices)

data = [data[i] for i in indices]
targets = binary_labels[indices]

# The last 15% of the shuffled documents are held out for validation.
num_validation_samples = int(0.15 * len(data))

# Split on a non-negative index: slicing with `-num_validation_samples` inverts
# the split when the count rounds down to 0 (`[:-0]` is empty, `[-0:]` is everything).
split_at = len(data) - num_validation_samples

X_train = data[:split_at]
Y_train = targets[:split_at]
X_val = data[split_at:]
Y_val = targets[split_at:]

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

In [None]:
# Bag-of-words counts restricted to the MAX_NB_WORDS most frequent terms.
# it's ok to fit in the whole data since this is not training a model
vect = CountVectorizer(stop_words=STOP_WORDS, max_features=MAX_NB_WORDS).fit(data)

# Document-term count matrices for both splits, sharing one vocabulary.
X_train_vect, X_val_vect = vect.transform(X_train), vect.transform(X_val)

In [None]:
# One LDA model per configured topic count, keyed by that count. A model is
# loaded from MODELS_ROOT when a cached file exists; otherwise it is fitted on
# the training counts and cached for the next run.
os.makedirs(MODELS_ROOT, exist_ok=True)  # dumping fails if the folder is missing

lda = dict()

for nb_comp in NB_COMPONENTS:
    model_path = os.path.join(MODELS_ROOT, "lda-{}.p".format(nb_comp))

    if os.path.isfile(model_path):
        # `with` closes the handle, which the bare open() call never did
        with open(model_path, "rb") as model_file:
            lda[nb_comp] = joblib.load(model_file)
    else:
        lda[nb_comp] = LatentDirichletAllocation(n_components=nb_comp, learning_method='online')
        lda[nb_comp].fit(X_train_vect)
        with open(model_path, "wb") as model_file:
            joblib.dump(lda[nb_comp], model_file)

In [None]:
# NOTE: the next block has not been verified yet; its results still need to be checked.

In [None]:
def _predict_labels_from_nearest(nbrs, X_query):
    """Predict one label row per row of ``X_query`` from its nearest training documents.

    ``nbrs`` must already be fitted on the training representations whose rows
    line up with the rows of ``Y_train``.
    """
    # distances to, and training-set indices of, the nearest neighbours of every query row
    distances_matrix, indices_matrix = nbrs.kneighbors(X_query)

    # indexing with the index matrix yields, per query row, its neighbours' label rows
    neighbour_labels_tensor = Y_train[indices_matrix]

    return np.array([
        get_predicted_labels_from_neighbours(neighbour_labels, distances.ravel())
        for neighbour_labels, distances in zip(neighbour_labels_tensor, distances_matrix)
    ])


def train_and_score(nb_comp):
    """Score nearest-neighbour tag propagation on LDA topic representations.

    Documents are projected into topic space with the LDA model stored under
    ``lda[nb_comp]``. Every document then receives labels derived from its
    ``NB_NEIGHBOURS`` nearest training documents under ``DISTANCE_METRIC``.
    Normalized micro-F1 @k for k = 1..10 is appended to ``OUTPUT_FILE`` for the
    training and the validation split.

    Args:
        nb_comp: number of LDA topics; must be a key of ``lda``.

    Returns:
        None. Results are written to ``OUTPUT_FILE``.

    Raises:
        KeyError: if no LDA model was built for ``nb_comp``.
    """
    if nb_comp not in lda:
        raise KeyError(
            "no LDA model with {} components; available: {}".format(nb_comp, sorted(lda))
        )

    lda_model = lda[nb_comp]

    # topic-space representations of both splits
    X_train_topics = lda_model.transform(X_train_vect)
    X_val_topics = lda_model.transform(X_val_vect)

    # KL-divergence is asymmetric and violates the triangle inequality, so a
    # tree-based index is not guaranteed to return the true nearest neighbours.
    # Brute-force search is exact for any callable.
    nbrs = NearestNeighbors(
        n_neighbors=NB_NEIGHBOURS,
        metric=DISTANCE_METRIC,
        algorithm='brute',
    )
    nbrs.fit(X_train_topics)

    Y_pred_val = _predict_labels_from_nearest(nbrs, X_val_topics)

    # NOTE(review): every training document is part of the fitted set, so it is
    # its own nearest neighbour (distance 0) and the train scores are optimistic.
    # Calling `nbrs.kneighbors()` without arguments would exclude the query point.
    Y_pred_train = _predict_labels_from_nearest(nbrs, X_train_topics)

    with open(OUTPUT_FILE, 'a+') as f:

        f.write('RESULTS FOR NB_COMP={}\n'.format(nb_comp))

        f.write('NORMALIZED MICRO-F1:\n')
        for k in range(1, 11):
            f.write("train micro-F1 @{}: {}\n".format(k, ranking.micro_f1_at_k(Y_train, Y_pred_train, k=k, normalize=True)))
            f.write("validation micro-F1 @{}: {}\n".format(k, ranking.micro_f1_at_k(Y_val, Y_pred_val, k=k, normalize=True)))

        f.write("\n")

In [None]:
# Only the topic counts listed in NB_COMPONENTS have a model in `lda`, so score
# exactly those; any other value makes the `lda[nb_comp]` lookup raise KeyError.
for nb_comp in NB_COMPONENTS:
    train_and_score(nb_comp)