see http://krex.k-state.edu/dspace/bitstream/handle/2097/9785/RahulChoubey2011.pdf

This is an implementation of the first of the two methods described in the document linked above.

Extract topics from the dataset using LDA; then, for each test document, find out which topics are the most significant for it. The authors then take the top 5 most likely words of the most likely topic and use those as tag recommendations.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import os
import re
import pickle
import sklearn
import sys
import string

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
from sklearn.model_selection import cross_val_score, GridSearchCV,ParameterGrid, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier,NearestNeighbors
from sklearn.svm import LinearSVC

from tqdm import *

%matplotlib inline
%load_ext autoreload
%autoreload 1

In [2]:
# Make the project's `src` directory importable from this notebook.
# The location is resolved relative to the current working directory (two
# levels up) and normalized, so the same directory always maps to the same
# sys.path entry.
src_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir, 'src'))

# Re-running this cell must not keep appending duplicate entries to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [3]:
%aimport data.delicious_t140
%aimport features.delicious_t140
%aimport helpers.labels,helpers.neighbours, helpers.delicious_t140

In [4]:
from data.delicious_t140 import load_or_get_from_cache,make_sample_with_contents_or_get_from_cache
from features.delicious_t140 import clean_text_delicious
from helpers.labels import truncate_labels
from helpers.neighbours import get_predicted_labels_from_neighbours
from helpers.topics import get_top_words_for_topic

In [5]:
# --- MovieLens / IMDB locations ----------------------------------------------
# NOTE(review): none of these names are referenced by the cells visible in this
# notebook; they look like leftovers from a MovieLens notebook. They are kept so
# that anything relying on them keeps working.
ML_ROOT = "/media/felipe/SAMSUNG/movielens/ml-20m/"
IMDB_ROOT = "/media/felipe/SAMSUNG/imdb/"

PATH_TO_MOVIES = ML_ROOT + "/movies.csv"
PATH_TO_TAG_ASSIGNMENTS = ML_ROOT + "/tags.csv"
PATH_TO_MOVIE_PLOTS = IMDB_ROOT+"/plot.list"

# --- Delicious-T140 locations --------------------------------------------------
# Directory handed to the data-loading helpers together with the raw inputs.
INTERIM_DATA_ROOT = os.path.abspath("../../data/interim/delicious-t140/")
DATA_ROOT = "/media/felipe/SAMSUNG/delicious/delicioust140/"
TAGINFO=DATA_ROOT+"taginfo.xml"

# --- CONFIGS -------------------------------------------------------------------
# Vocabulary cap for the CountVectorizer (passed as max_features).
MAX_NB_WORDS = 4000 # using this because I've used this for other methods too
# Stop-word list for the CountVectorizer.
STOP_WORDS='english' # using stopwords since most people using LDA did this
# Number of LDA topics (passed as n_components).
NB_COMPONENTS = 200
# How many top topic words are proposed as tags for each test document.
NB_RECOMMENDED_TAGS = 5
# Text preprocessor handed to the CountVectorizer.
PREPROC=clean_text_delicious

In [7]:
# Load the document/tag metadata described by TAGINFO.
# NOTE(review): judging by the helper names, results are presumably cached under
# INTERIM_DATA_ROOT -- confirm against data.delicious_t140.
docs_df = load_or_get_from_cache(TAGINFO,INTERIM_DATA_ROOT)
# Build the working sample; the code below reads its 'contents' and
# 'unique_tags' columns.
sample_df = make_sample_with_contents_or_get_from_cache(docs_df,INTERIM_DATA_ROOT,DATA_ROOT)

In [9]:
def _split_tags(tagstring):
    """Turn a comma-separated tag string into a list of tags."""
    return tagstring.split(",")


# Model inputs: the raw contents of each sampled document.
data = sample_df['contents'].values
# Targets: one list of tags per document, in the same row order as `data`.
labelsets = sample_df["unique_tags"].map(_split_tags).values

In [10]:
# Fit the binarizer on the label sets of the whole sample (i.e. before the
# train/test split), so mlb.classes_ covers every tag occurring in either split.
mlb = MultiLabelBinarizer()
mlb.fit(labelsets)

MultiLabelBinarizer(classes=None, sparse_output=False)

In [11]:
# Two-stage featurization, fitted step by step in the following cells:
#   1. CountVectorizer: documents -> term counts over at most MAX_NB_WORDS terms.
#   2. LatentDirichletAllocation: term counts -> weights over NB_COMPONENTS topics.
# The fitted `vect` and `lda` objects are also needed individually later on
# (vocabulary lookup and per-topic top words), so they are kept as separate names.
vect = CountVectorizer(max_features=MAX_NB_WORDS, preprocessor=PREPROC, stop_words=STOP_WORDS)

# Fixed seed: LDA is randomly initialized, so without it the learned topics (and
# every score derived from them) change from run to run.
lda = LatentDirichletAllocation(n_components=NB_COMPONENTS, learning_method='online', random_state=42)

In [12]:
# Hold out 25% of the documents for evaluation. The split is seeded so that the
# reported scores can be reproduced across runs.
X_train, X_test, y_train, y_test = train_test_split(data, labelsets, test_size=0.25, random_state=42)

In [13]:
# Binarize both label splits with the already-fitted binarizer; every row
# becomes a 0/1 indicator vector over mlb.classes_.
y_train, y_test = (mlb.transform(split) for split in (y_train, y_test))

In [14]:
# Training side: learn the vocabulary and the topics from the training documents
# only, and replace X_train by the LDA output for those documents.
X_train = lda.fit_transform(vect.fit_transform(X_train))

In [15]:
# Test side: reuse the fitted vectorizer and LDA model (no refitting) and
# replace X_test by the LDA output for the held-out documents.
X_test = lda.transform(vect.transform(X_test))

In [16]:
# Sanity check: shapes of the LDA features and of the binarized labels for both splits.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((2155, 200), (719, 200), (2155, 6881), (719, 6881))

In [17]:
# Recommend tags for every test document: take the document's single most likely
# topic and propose that topic's NB_RECOMMENDED_TAGS top words as tags.

# The vocabulary is fixed once `vect` is fitted, so fetch it a single time
# instead of once per document. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(); support both.
if hasattr(vect, "get_feature_names_out"):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()

# Set membership is O(1); `word in mlb.classes_` scans the whole array each time.
valid_tags = set(mlb.classes_)

# Index of the highest-weighted topic for each test document.
top_topic_indices = X_test.argmax(axis=1)

# The prediction depends only on the top topic, so build it once per distinct
# topic rather than once per document.
predictions_by_topic = {}
for top_topic_index in np.unique(top_topic_indices):
    top_words = get_top_words_for_topic(lda, top_topic_index, feature_names, NB_RECOMMENDED_TAGS)

    # some topic words are not valid tags
    top_words_tags = [word for word in top_words if word in valid_tags]

    predictions_by_topic[top_topic_index] = mlb.transform([top_words_tags]).ravel()

y_preds = np.array([predictions_by_topic[topic] for topic in top_topic_indices])
# The ground truth is simply the binarized test labels, row-aligned with X_test.
y_trues = np.array(y_test)

In [18]:
# Sanity check: predictions and ground truth must have identical shapes.
y_preds.shape,y_trues.shape

((719, 6881), (719, 6881))

In [19]:
# Micro-averaged F1: true/false positives and false negatives are counted
# globally over all (document, tag) pairs before computing the score.
f1_score(y_trues,y_preds,average='micro')

0.0075122292103424167