# Content based recommender system

Content based systems use meta data such as genre, producer, actor, musician to recommend items say movies or music. Content based systems are based on the idea that if you liked a certain item you are most likely to like something that is similar to it.


## 1) Import libraries

In [1]:
import operator
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
import pandas as pd
import gensim
from pprint import pprint
from scipy.stats import pearsonr as pearsons_correlation
import pyLDAvis.gensim
import numpy as np
import pickle
import warnings
import os.path
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
warnings.filterwarnings("ignore", category=DeprecationWarning) 




## 2) Load the data

In [2]:
lemma = WordNetLemmatizer()    
stemma = PorterStemmer()
moviesName = [] #To be returned.
moviesId = [] #To be returned.
numMovies = 0 #To be returned.
moviesIndexMapping = {}
moviesCorpus = []

In [3]:
#Load and get the movies dataset.
dataextract = pd.read_csv('../data/movies.csv')
numMovies = len(dataextract)
col1 = dataextract['movieId']
col2 = dataextract['title']
col3 = dataextract['genres']
for i in range(numMovies):
    moviesName.append(col2[i])
    moviesId.append(col1[i])
    doc = []
    wordsList = col3[i].split('|')
    for j in range(len(wordsList)):
        word = wordsList[j]
        word = word.lower()
        word = stemma.stem(lemma.lemmatize(word))
        if word in list(stopwords.words('english')):
            continue;
        else:
            doc.append(word)
    moviesCorpus.append(doc)
    moviesIndexMapping[col1[i]] = i

In [4]:
#Load and get the tags dataset.
dataextract = pd.read_csv('../data/tags.csv')
col1 = dataextract['movieId']
col2 = dataextract['tag']
for word in dataextract:
    word = word.lower()
    word = stemma.stem(lemma.lemmatize(word))
    if word in list(stopwords.words('english')):
            continue;
    else:
        j = moviesIndexMapping[col1[i]]
        moviesCorpus[j].append(word)
        

## 3) Build the recommender

This content-based recommender is based on the matrix factorization by using the LDA topic modeling.

In [None]:
#Bag of words. Create a dictionary containing the word and words unique ID.
dictionary = gensim.corpora.Dictionary(moviesCorpus)
dictionary.filter_extremes(no_below=5) #Alter according to dataset.


#For each document a list of tuples is created reporting the words(ID) in filtered dictionary and frequency of those words.
bow_corpus = [dictionary.doc2bow(doc) for doc in moviesCorpus]


#Get max coherence score to get perfect lda model. Get min perplexity to get perfect lda model.
maxCoherence = 0
numTopics = 0
minPerplexity = 0

In [None]:
# Runs the LDA model
# It is first going to check whether the best model has been created already, in which case it loads it. Otherwise it will create it
if os.path.exists('lda_model.pkl'):
      ldamodel = pickle.load(open(ldafolder+'/%s_groups/lda_model_%semails.pkl'%(num_groups,n_sample), 'rb'))  
else:
    for t in range(5, 21):
        lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=t, id2word=dictionary, passes=20, workers=3)
        coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, texts=moviesCorpus, dictionary=dictionary, coherence='c_v')
        coherence_score = coherence_model_lda.get_coherence()
        perplexity = lda_model.log_perplexity(bow_corpus)
        if coherence_score > maxCoherence:
            maxCoherence = coherence_score
            numTopics = t
            minPerplexity = perplexity
        print('Topic: ',t,' Coherence Score: ',coherence_score,' Perplexity: ', perplexity)
        
    #Train bow_corpus.
    lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=numTopics, id2word=dictionary, passes=20, workers=3)
    
    # Save model
    ldafile = open('lda_model.pkl','wb')
    pickle.dump(lda_model,ldafile)
    ldafile.close()



Topic:  5  Coherence Score:  0.48425831688005694  Perplexity:  -2.7489907696893474
Topic:  6  Coherence Score:  0.48425831688005694  Perplexity:  -2.796620909284142
Topic:  7  Coherence Score:  0.4842583168800569  Perplexity:  -2.828833152309498
Topic:  8  Coherence Score:  0.4842583168800568  Perplexity:  -2.835953084298595
Topic:  9  Coherence Score:  0.48425831688005694  Perplexity:  -2.861497600610528
Topic:  10  Coherence Score:  0.4842583168800568  Perplexity:  -2.952712984204523
Topic:  11  Coherence Score:  0.48425831688005694  Perplexity:  -2.94858947465324
Topic:  12  Coherence Score:  0.48425831688005694  Perplexity:  -2.988007333009216
Topic:  13  Coherence Score:  0.48425831688005694  Perplexity:  -3.024767358037005
Topic:  14  Coherence Score:  0.4842583168800569  Perplexity:  -3.0402816760116544
Topic:  15  Coherence Score:  0.4842583168800569  Perplexity:  -3.0488601101197097
Topic:  16  Coherence Score:  0.4842583168800568  Perplexity:  -3.0946468391213324
Topic:  17  

In [None]:
#For each topic, print top 10 significant terms.
pprint(lda_model.print_topics())

[(0,
  '0.859*"comedi" + 0.140*"romanc" + 0.000*"action" + 0.000*"sci-fi" + '
  '0.000*"crime" + 0.000*"horror" + 0.000*"drama" + 0.000*"child" + '
  '0.000*"western" + 0.000*"anim"'),
 (1,
  '0.271*"thriller" + 0.191*"crime" + 0.173*"horror" + 0.129*"drama" + '
  '0.099*"mysteri" + 0.092*"action" + 0.045*"sci-fi" + 0.000*"film-noir" + '
  '0.000*"romanc" + 0.000*"imax"'),
 (2,
  '0.427*"fantasi" + 0.313*"music" + 0.129*"drama" + 0.098*"film-noir" + '
  '0.028*"mysteri" + 0.004*"comedi" + 0.000*"crime" + 0.000*"romanc" + '
  '0.000*"child" + 0.000*"anim"'),
 (3,
  '0.709*"drama" + 0.171*"romanc" + 0.076*"war" + 0.044*"western" + '
  '0.000*"action" + 0.000*"adventur" + 0.000*"comedi" + 0.000*"sci-fi" + '
  '0.000*"crime" + 0.000*"mysteri"'),
 (4,
  '0.242*"documentari" + 0.199*"adventur" + 0.177*"action" + 0.107*"child" + '
  '0.107*"anim" + 0.088*"sci-fi" + 0.049*"(no genres listed)" + 0.016*"imax" + '
  '0.015*"fantasi" + 0.000*"western"')]


We plot the results

In [None]:
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.save_html(LDAvis_prepared, 'lda_figure.html')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [None]:
docTermMatrix = np.zeros((numMovies, numTopics)) #To be returned.


#Fill document term matrix
for i in range(len(moviesCorpus)):
    doc = moviesCorpus[i]
    bow_vector = dictionary.doc2bow(doc)
    vec = lda_model[bow_vector]
    #print(doc)
    #print(vec)
    for j in range(len(vec)):
        t = vec[j]
        docTermMatrix[i,j] = t[1];


#It can be seen from docTermmatrix that each movie belongs solely to only one topic, and very few percent of it belongs to other topics.


docMostRelevantTopic = np.argmax(docTermMatrix, axis=1) #ith index stores the index of the most relevant topic in ith document.


numUsers = 0 #To be returned.
usersIndexMapping = {} #To be returned.


#Count number of users.
dataextract = pd.read_csv('../data/ratings.csv')
col1 = dataextract['userId']
for i in range(len(dataextract)):
    if col1[i] not in usersIndexMapping:
        usersIndexMapping[col1[i]] = numUsers
        numUsers += 1


userTermMatrix = np.zeros((numUsers, numTopics)) #To be returned.
userRateFreq = np.zeros((numUsers, numTopics))


#Load and get dataset.
col2 = dataextract['movieId']
col3 = dataextract['rating']
for i in range(len(dataextract)):
    j = usersIndexMapping[col1[i]]
    k = moviesIndexMapping[col2[i]]
    #As each movie solely belongs to only one topic, taking only contribution of that main topic as other topics will only lead to error.
    docMostRelevantTopicIndex = docMostRelevantTopic[k]
    #Here we are not multiplying with rating as, in case of euclid similarity, multiplying by 5(rating) will take user point away from relevant topic(movie point) more that multiplying by 3(rating). So euclid distance will favour movie with rating 3 that movie with rating 5 as according to euclid similarity, less the distance more is similarity.
    if col3[i]>=3:
        userTermMatrix[j,docMostRelevantTopicIndex] += docTermMatrix[k,docMostRelevantTopicIndex]
    else:
        userTermMatrix[j,docMostRelevantTopicIndex] -= docTermMatrix[k,docMostRelevantTopicIndex]
    userRateFreq[j,docMostRelevantTopicIndex] += 1
userRateFreq[userRateFreq == 0] = 1
for i in range(numUsers):
    userTermMatrix[i] /= userRateFreq[i]


#View userTermMatrix and docTermMatrix
file1 = open("test1.txt", "w")
for i in range(numUsers):
    file1.write(str(userTermMatrix[i]))
    file1.write("\n")
file1.close()
file2 = open("test2.txt", "w")
for i in range(numMovies):
    file2.write(str(docTermMatrix[i]))
    file2.write("\n")
file2.close()

In [None]:
#Check and get accurracy. 
file = open("test3.txt", "w")
for i,elem in enumerate(dataextract):
    uid = col1[i]
    mid = col2[i]
    rval = col3[i]
    j = usersIndexMapping[uid]
    uservec = userTermMatrix[j]
    k = moviesIndexMapping[mid]
    docvec = docTermMatrix[k]
    docMostRelevantTopicIndex = docMostRelevantTopic[k]
    #coeff, pval = pearsons_correlation(docvec, uservec) #Pearson's correlation similarity
    #coeff = np.linalg.norm(docvec-uservec) #Euclidean distance similarity
    #As each movie solely belongs to only one topic, so comparing only that main topic with user matrix.
    coeff = abs(docvec[docMostRelevantTopicIndex]-uservec[docMostRelevantTopicIndex]) #Euclidean distance between only relevant topic.
    string = str(uid)+"\t"+str(mid)+"\t"+str(rval)+"\t"+str(coeff)+"\n"
    file.write(string)
file.close()

In [None]:
def compare(x):
    return x[1]
def run():
    uid = int(input("Enter User Id: "))
    if uid not in usersIndexMapping.keys():
        print("User Id not in record.")
        return
    i = usersIndexMapping[uid]
    uservec = userTermMatrix[i]
    #print(uservec)
    recFactor = []
    for i in range(numMovies):
        docvec = docTermMatrix[i]
        docMostRelevantTopicIndex = docMostRelevantTopic[i]
        #coeff, pval = pearsons_correlation(docvec, uservec) #Pearson's correlation similarity
        #coeff = np.linalg.norm(docvec-uservec) #Euclidean distance similarity
        #As each movie solely belongs to only one topic, so comparing only that main topic with user matrix.
        coeff = abs(docvec[docMostRelevantTopicIndex]-uservec[docMostRelevantTopicIndex]) #Euclidean distance between only relevant topic.
        #print(str(moviesName[i])+" "+str(moviesId[i]))
        recFactor.append(tuple((i, coeff)))
    recFactor = sorted(recFactor, key=operator.itemgetter(1), reverse=True)
    numRec = 10
    recommend = []
    for j in range(numRec):
        i = recFactor[j][0]
        recommend.append(tuple((moviesName[i], moviesId[i])))
    print(recommend)

In [None]:
run()