see http://krex.k-state.edu/dspace/bitstream/handle/2097/9785/RahulChoubey2011.pdf

This notebook implements the first of the two methods described in the document linked above.

Extract topics from the dataset using LDA; then, for each test document, find its most significant topics. The authors then take the top 5 most likely words of the most likely topic and use those as tag recommendations.

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.externals import joblib
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier,NearestNeighbors

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Make the project's `src` package importable from this notebook: the path
# below climbs three directory levels up from the current working directory
# (presumably the repository root -- confirm against the project layout).
path_parts = (os.getcwd(), os.pardir, '../../')
src_dir = os.path.join(*path_parts)
sys.path.append(src_dir)

In [3]:
%aimport src.data.movielens_20m_imdb
%aimport src.helpers.labels,src.helpers.neighbours, src.helpers.topics
%aimport src.utils.dataframes, src.utils.clusters, src.utils.metrics

In [4]:
from src.data.movielens_20m_imdb import load_df_or_get_from_cache
from src.helpers.labels import truncate_labels
from src.helpers.neighbours import get_predicted_labels_from_neighbours
from src.helpers.segments import make_distance_matrix_for_segments,vectorize_segments
from src.helpers.topics import get_top_words_for_topic

from src.utils.dataframes import sample_rows
from src.utils.metrics import ranking

In [15]:
# --- Filesystem locations, resolved against the current working directory ---
MODELS_ROOT = os.path.abspath(
    "../../../models/ranking/movielens-topics/"
)
INTERIM_DATA_ROOT = os.path.abspath(
    "../../../data/interim/movielens-ml20m-imdb/"
)
PATH_TO_PROCESSED_FILE = os.path.abspath(
    '../../../data/processed/movielens-20m-imdb-tags-and-synopses-2017-12-20.csv'
)

# --- Experiment configuration ---

# Seed for NumPy's global random number generator.
SEED = 42

# Vocabulary size cap handed to CountVectorizer (max_features).
MAX_NB_WORDS = 5000

# Threshold handed to truncate_labels when filtering tags.
MIN_TAG_DF = 10

# Stop words are removed because that is the common practice when using LDA.
STOP_WORDS = 'english'

# Topic counts for which an LDA model is fitted (or loaded from cache).
NB_COMPONENTS = [50, 100, 200, 400]

# Number of top topic words requested as tag recommendations.
NB_RECOMMENDED_TAGS = 5

In [16]:
# Seed NumPy's global random number generator so that later calls drawing from
# it (e.g. np.random.shuffle) are repeatable on a fresh top-to-bottom run.
np.random.seed(SEED)

In [17]:
# Load the processed MovieLens/IMDB file through the project helper.
# Presumably a cached copy under INTERIM_DATA_ROOT is reused when one exists
# (inferred from the helper's name -- confirm in src.data.movielens_20m_imdb).
docs_df = load_df_or_get_from_cache(PATH_TO_PROCESSED_FILE,INTERIM_DATA_ROOT)

In [18]:
# Preview the first rows to check the loaded columns.
docs_df.head()

Unnamed: 0,movie_id,title,synopsis,tags,num_tags
0,1,Toy Story (1995),A boy called Andy Davis (voice: John Morris) u...,"buy,want-to-see-again,unlikely-friendships,inn...",59
1,2,Jumanji (1995),The film begins in 1869 in the town of Brantfo...,"childish,robin-williams,time,not-for-kids,adap...",19
2,6,Heat (1995),An inbound Blue Line train pulls in to Firesto...,"rviolence,soundtrack,suspense,dialogue,bibliot...",57
3,7,Sabrina (1995),"Sabrina Fairchild (Julia Ormond), is the Larra...","based-on-a-play,clv,remake,relationships,chick...",13
4,8,Tom and Huck (1995),The film opens with Injun Joe (Eric Schweig) a...,"based-on-a-book,adapted-frombook,seen",3


In [19]:
# Summary statistics of the numeric columns (movie_id, num_tags).
docs_df.describe()

Unnamed: 0,movie_id,num_tags
count,6710.0,6710.0
mean,41263.124888,12.214605
std,39409.134389,14.369509
min,1.0,1.0
25%,4106.25,3.0
50%,31251.0,7.0
75%,74531.5,16.0
max,131082.0,189.0


In [20]:
# Model input: the synopsis text of every movie, as a NumPy array.
synopsis_column = docs_df['synopsis']
data = synopsis_column.values

# Targets: each movie's comma-separated tag string turned into a list of tags.
tags_column = docs_df["tags"]
labels = tags_column.map(lambda raw_tags: raw_tags.split(","))

In [21]:
# Binarize the tag lists and split documents/targets into train and validation sets.
mlb = MultiLabelBinarizer()

# truncate_labels is a project helper; presumably it filters out tags that occur
# in fewer than MIN_TAG_DF documents (TODO confirm against src.helpers.labels).
truncated_labels = truncate_labels(labels,MIN_TAG_DF)

binary_labels = mlb.fit_transform(truncated_labels)

print("total number of unique tags: {} ".format(len(mlb.classes_)))

# A single shared permutation keeps every document aligned with its label row.
indices = np.arange(len(data))
np.random.shuffle(indices)

# The shuffled documents get their own name instead of overwriting `data`.
# Rebinding `data` made this cell non-idempotent: on a second run the already
# shuffled documents were permuted again while `binary_labels` was rebuilt in
# the original order, silently pairing documents with the wrong labels.
shuffled_data = [data[i] for i in indices]
targets = binary_labels[indices]

validation_fraction = 0.15
num_validation_samples = int(validation_fraction * len(shuffled_data))

# Split on an explicit index: slicing with `[:-num_validation_samples]` yields
# an EMPTY training set when num_validation_samples is 0 (very small datasets).
split_at = len(shuffled_data) - num_validation_samples

X_train = shuffled_data[:split_at]
Y_train = targets[:split_at]
X_val = shuffled_data[split_at:]
Y_val = targets[split_at:]

print('total number of train documents: {}'.format(len(X_train)))
print('total number of validation documents: {}'.format(len(X_val)))

total number of unique tags: 2138 
total number of train documents: 5704
total number of validation documents: 1006


In [None]:
# Bag-of-words term counts, with the vocabulary capped at MAX_NB_WORDS entries
# and stop words removed according to STOP_WORDS.
vect = CountVectorizer(
    stop_words=STOP_WORDS,
    max_features=MAX_NB_WORDS,
)

# The vocabulary is learned from the training documents only and then reused,
# unchanged, to encode the validation documents.
X_train_vect = vect.fit_transform(X_train)
X_val_vect = vect.transform(X_val)

In [None]:
%%time

# Fit an LDA model for every topic count in NB_COMPONENTS, or load it from the
# on-disk cache when it was fitted before.
#
# Every model is kept in `lda_models` (keyed by its number of topics). `lda`
# ends up bound to the model for the LAST entry of NB_COMPONENTS, which is the
# one the following cells use.

# The cache directory may not exist yet on a fresh checkout.
os.makedirs(MODELS_ROOT, exist_ok=True)

lda_models = {}

for nb_comp in NB_COMPONENTS:

    model_path = os.path.join(MODELS_ROOT, "lda-{}.p".format(nb_comp))

    if os.path.isfile(model_path):
        # Pass the path itself: opening the file in text mode (the default of
        # `open`) cannot read a binary pickle, and the handle was never closed.
        # NOTE: joblib files are pickles -- only load caches you created yourself.
        lda = joblib.load(model_path)

    else:
        # An explicit random_state makes the fit independent of how much of the
        # global NumPy random stream earlier cells have already consumed.
        lda = LatentDirichletAllocation(
            n_components=nb_comp,
            learning_method='online',
            random_state=SEED)
        lda.fit(X_train_vect)
        joblib.dump(lda, model_path)

    lda_models[nb_comp] = lda

In [None]:
# train
# Project the training count matrix onto the topics of the LDA model that the
# fitting loop left bound to `lda` (the last entry of NB_COMPONENTS).
# NOTE: this rebinds X_train, which until now held the raw training synopses.
X_train = lda.transform(X_train_vect)

In [None]:
# val
# Project the validation count matrix onto the topics of the same `lda` model.
# NOTE: this rebinds X_val, which until now held the raw validation synopses.
X_val = lda.transform(X_val_vect)

In [None]:
# Sanity check: feature and target arrays of the same split should have the
# same number of rows. The targets are named Y_train / Y_val by the split cell
# (the previous y_train / y_test names were never defined in this notebook).
X_train.shape,X_val.shape,Y_train.shape,Y_val.shape

In [None]:
# Recommend tags for each document from its single most likely LDA topic: the
# topic's top NB_RECOMMENDED_TAGS words that are also known tags become the
# predicted tags, encoded as one binary row per document.

# Loop-invariant lookups, computed once instead of once per document.
feature_names = vect.get_feature_names()
valid_tags = set(mlb.classes_)


def predict_tags_from_top_topic(doc_topic_distr):
    """Build binary tag predictions from per-document topic weights.

    Parameters
    ----------
    doc_topic_distr : iterable of 1-D arrays
        One row of topic weights per document, as returned by `lda.transform`.

    Returns
    -------
    numpy.ndarray
        One binary label vector per document, in the column order of
        `mlb.classes_`.
    """
    # The prediction depends only on the winning topic, so each topic's label
    # vector is built once and reused. This assumes get_top_words_for_topic is
    # deterministic for a given topic index -- TODO confirm in src.helpers.topics.
    vector_by_topic = {}
    rows = []

    for topic_distr in doc_topic_distr:
        top_topic_index = topic_distr.argmax()
        cache_key = int(top_topic_index)

        if cache_key not in vector_by_topic:
            top_words = get_top_words_for_topic(
                lda,
                top_topic_index,
                feature_names,
                NB_RECOMMENDED_TAGS)

            # some topic words are not valid tags
            top_words_tags = [word for word in top_words if word in valid_tags]

            vector_by_topic[cache_key] = mlb.transform([top_words_tags]).ravel()

        rows.append(vector_by_topic[cache_key])

    return np.array(rows)


# Predictions for both splits; these are the names the evaluation cells read.
Y_pred_train = predict_tags_from_top_topic(X_train)
Y_pred_val = predict_tags_from_top_topic(X_val)

# Validation predictions and ground truth under their original names. The loop
# used to read X_test / y_test, which were never defined in this notebook; the
# held-out split is X_val / Y_val.
y_preds = Y_pred_val
y_trues = np.array(Y_val)

In [None]:
# Shapes of the stacked prediction and ground-truth arrays; they are expected to match.
y_preds.shape,y_trues.shape

In [None]:
%%time

# Report micro-averaged F1 at every cutoff k from 1 to 10 for both splits.
# Y_pred_train and Y_pred_val must already exist in the kernel (prediction
# matrices produced by an earlier cell).
ks = list(range(1, 11))

for k in ks:
    train_score = ranking.micro_f1_at_k(Y_train, Y_pred_train, k=k)
    print("train micro-F1 @{}: {}".format(k, train_score))

    val_score = ranking.micro_f1_at_k(Y_val, Y_pred_val, k=k)
    print("validation micro-F1 @{}: {}".format(k, val_score))

In [None]:
%%time

# Same report as the unnormalized one, but with normalize=True passed to the
# metric (see src.utils.metrics.ranking for what normalization means there).
# Y_pred_train and Y_pred_val must already exist in the kernel (prediction
# matrices produced by an earlier cell).
ks = list(range(1, 11))

for k in ks:
    train_score = ranking.micro_f1_at_k(Y_train, Y_pred_train, k=k, normalize=True)
    print("train micro-F1 @{}: {}".format(k, train_score))

    val_score = ranking.micro_f1_at_k(Y_val, Y_pred_val, k=k, normalize=True)
    print("validation micro-F1 @{}: {}".format(k, val_score))