see http://krex.k-state.edu/dspace/bitstream/handle/2097/9785/RahulChoubey2011.pdf

This notebook implements the first of the two methods described in the thesis linked above.

Topics are extracted from the dataset using LDA; then, for each test document, its most significant topics are identified. The top 5 most likely words of the most likely topic are then used as the tag recommendations.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.externals import joblib
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier,NearestNeighbors

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
# Add the directory three levels above the working directory (os.pardir
# followed by '../../') to the import path; the `src.*` imports used by the
# following cells are presumably resolved from there.
src_dir = os.path.join(
    os.getcwd(),
    os.pardir,
    '../../',
)
sys.path.append(src_dir)

In [None]:
# Import the project modules through the autoreload extension's %aimport
# magic (the extension is loaded in the imports cell), so that edits made to
# these modules on disk are picked up without restarting the kernel.
%aimport src.data.movielens_20m_imdb
%aimport src.helpers.labels,src.helpers.neighbours, src.helpers.topics
%aimport src.utils.dataframes, src.utils.clusters, src.utils.metrics

In [None]:
from src.data.movielens_20m_imdb import load_df_or_get_from_cache
from src.helpers.labels import truncate_labels
from src.helpers.neighbours import get_predicted_labels_from_neighbours
from src.helpers.segments import make_distance_matrix_for_segments,vectorize_segments
from src.helpers.topics import get_word_weight_dict

from src.utils.dataframes import sample_rows
from src.utils.metrics import ranking

In [None]:
# --- Filesystem locations, resolved against the current working directory ---

# Processed CSV that the documents are loaded from.
PATH_TO_PROCESSED_FILE = os.path.abspath(
    '../../../data/processed/movielens-20m-imdb-tags-and-synopses-2017-12-13.csv'
)
# Directory handed to the dataset loader alongside the processed file.
INTERIM_DATA_ROOT = os.path.abspath(
    "../../../data/interim/movielens-ml20m-imdb/"
)
# Directory where fitted LDA models are stored and looked up.
MODELS_ROOT = os.path.abspath(
    "../../../models/ranking/movielens-topics/"
)

# --- Experiment settings ---

# Seed for NumPy's global random generator.
SEED = 42

# Upper bound on the vocabulary size kept by the count vectorizer.
MAX_NB_WORDS = 5000

# Threshold passed to truncate_labels when filtering tags (used for sampling).
MIN_TAG_DF = 10

# Stop words are removed because that is what most LDA users do.
STOP_WORDS = 'english'

# Numbers of LDA topics to fit and evaluate.
NB_COMPONENTS = [50, 100, 200, 400]

In [None]:
# Seed NumPy's global random generator so that the np.random.shuffle call
# used for the train/validation split below is repeatable across runs.
np.random.seed(SEED)

In [None]:
# Load the processed dataset. Judging by the helper's name it keeps a cached
# copy under INTERIM_DATA_ROOT — confirm in src.data.movielens_20m_imdb.
docs_df = load_df_or_get_from_cache(PATH_TO_PROCESSED_FILE,INTERIM_DATA_ROOT)

In [None]:
# Sanity check: display the first rows of the loaded data
# (docs_df is presumably a pandas DataFrame — confirm against the loader).
docs_df.head()

In [None]:
# Sanity check: display summary statistics of the loaded data
# (docs_df is presumably a pandas DataFrame — confirm against the loader).
docs_df.describe()

In [None]:
# Values of the 'synopsis' column: the texts that get vectorized further down.
data = docs_df["synopsis"].values

# Each entry of the 'tags' column is one comma-separated string; turn every
# entry into a list of individual tags.
labels = docs_df["tags"].map(lambda raw_tags: raw_tags.split(","))

In [None]:
# Binarize the tag lists and split the documents into train/validation sets.
mlb = MultiLabelBinarizer()

# Filter the tag lists using the MIN_TAG_DF threshold (presumably dropping
# tags that occur in too few documents — confirm in src.helpers.labels).
truncated_labels = truncate_labels(labels,MIN_TAG_DF)

# One row per document, one column per tag kept in mlb.classes_.
binary_labels = mlb.fit_transform(truncated_labels)

print("total number of unique tags: {} ".format(len(mlb.classes_)))

# Shuffle documents and targets with the same permutation so they stay aligned.
indices = np.arange(len(data))
np.random.shuffle(indices)

data = [data[i] for i in indices]
targets = binary_labels[indices]

# Hold out the last 15% of the shuffled documents for validation.
num_validation_samples = int(0.15 * len(data))

# Slice at an explicit index rather than with a negative offset: when the
# validation size rounds down to 0, `data[:-0]` would be empty and `data[-0:]`
# would contain everything, silently swapping the two sets.
split_index = len(data) - num_validation_samples

X_train = data[:split_index]
Y_train = targets[:split_index]
X_val = data[split_index:]
Y_val = targets[split_index:]

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

In [None]:
# Bag-of-words vectorizer limited to the MAX_NB_WORDS most frequent terms,
# with stop words removed.
# The vocabulary is built from all documents (train and validation): this is
# considered acceptable because fitting the vectorizer is not model training.
vect = CountVectorizer(
    stop_words=STOP_WORDS,
    max_features=MAX_NB_WORDS,
).fit(data)

# Term-count matrices for the two splits, transformed in train, validation order.
X_train_vect, X_val_vect = (
    vect.transform(documents) for documents in (X_train, X_val)
)

In [None]:
%%time

# Fitted LDA models, keyed by their number of topics.
lda = dict()

for nb_comp in NB_COMPONENTS:

    model_path = os.path.join(MODELS_ROOT, "lda-{}.p".format(nb_comp))

    if os.path.isfile(model_path):
        # Reuse a previously fitted model; the context manager makes sure the
        # file handle is closed again.
        # NOTE: joblib.load unpickles the file, so only load model files you
        # created yourself.
        with open(model_path, "rb") as model_file:
            lda[nb_comp] = joblib.load(model_file)

    else:
        # Fix the random state so refitting gives the same model regardless
        # of how much of NumPy's global random stream was consumed earlier.
        lda[nb_comp] = LatentDirichletAllocation(
            n_components=nb_comp,
            learning_method='online',
            random_state=SEED,
        )
        lda[nb_comp].fit(X_train_vect)

        # Create the target directory if needed, so a long fit is not lost
        # to a missing folder when saving.
        os.makedirs(MODELS_ROOT, exist_ok=True)
        with open(model_path, "wb") as model_file:
            joblib.dump(lda[nb_comp], model_file)

In [None]:
def train_and_score(nb_comp):
    """Score the LDA model with `nb_comp` topics as a tag recommender.

    For every train and validation document, the most likely topic is taken
    from the document's topic distribution, and the weights of that topic's
    words are used as ranking scores for the tags (tags that are not among
    the topic's words score 0.0). Micro-F1 @k is then printed for k = 1..10,
    first with the default settings and then with `normalize=True`.

    Reads the notebook globals `vect`, `mlb`, `lda`, `X_train_vect`,
    `X_val_vect`, `Y_train` and `Y_val`.

    Args:
        nb_comp: number of topics; must be a key of the `lda` dict.

    Returns:
        None. Results are printed.
    """
    vocabulary = vect.get_feature_names()
    tag_vocabulary = mlb.classes_

    model = lda[nb_comp]

    # Per-document topic distributions (one row per document).
    X_train = model.transform(X_train_vect)
    X_val = model.transform(X_val_vect)

    # The score vector depends only on the topic, so build it once per topic
    # instead of once per document.
    scores_by_topic = {}

    def scores_for_topic(topic_index):
        """Return the tag score vector for one topic, computing it on first use."""
        if topic_index not in scores_by_topic:
            # 'word' => weight in topic
            # (assumes the second argument of get_word_weight_dict is the
            # topic index — confirm in src.helpers.topics)
            word_weight_dict = get_word_weight_dict(model, topic_index, vocabulary)

            # Dummy scores aligned with the tag vocabulary: the numbers aren't
            # probabilities, but only the ranking matters, not absolute values.
            # Words that are not valid tags are ignored; tags missing from
            # the topic's words get 0.0.
            scores_by_topic[topic_index] = np.array(
                [word_weight_dict.get(tag_value, 0.0) for tag_value in tag_vocabulary]
            )
        return scores_by_topic[topic_index]

    def predict_scores(document_topic_distributions):
        """Return one tag score row per document, using its most likely topic."""
        return np.array([
            scores_for_topic(int(np.argmax(topics_distr)))
            for topics_distr in document_topic_distributions
        ])

    Y_pred_train = predict_scores(X_train)
    Y_pred_val = predict_scores(X_val)

    # scoring

    print("scoring for nb_comp={}".format(nb_comp))

    print(X_train.shape,X_val.shape,Y_train.shape,Y_val.shape)

    ks = range(1, 11)

    for k in ks:
        print("train micro-F1 @{}: {}".format(k,ranking.micro_f1_at_k(Y_train,Y_pred_train,k=k)))
        print("validation micro-F1 @{}: {}".format(k,ranking.micro_f1_at_k(Y_val,Y_pred_val,k=k)))

    # Same metric with normalize=True; note the printed labels are identical
    # to the ones above, so this second group is the normalized one.
    for k in ks:
        print("train micro-F1 @{}: {}".format(k,ranking.micro_f1_at_k(Y_train,Y_pred_train,k=k,normalize=True)))
        print("validation micro-F1 @{}: {}".format(k,ranking.micro_f1_at_k(Y_val,Y_pred_val,k=k,normalize=True)))

    print("\n")

In [None]:
# Evaluate every configured topic count in turn; train_and_score prints its
# own results for each one.
for nb_comp in NB_COMPONENTS:
    train_and_score(nb_comp)