In [54]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from itertools import combinations
from scipy import sparse

import random
my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

## Data Preprocessing

In [2]:
ratings = pd.read_csv('ml-1m/ratings.dat', header=None, sep='::', 
                      encoding='latin-1', engine='python', 
                      names=['user_id', 'movie_id', 'rating', 'timestamp'])
users = pd.read_csv('ml-1m/users.dat', header=None, sep='::',
                    encoding='latin-1', engine='python',
                    names=['user_id', 'gender', 'age', 'occupation', 'zipcode'])
movies = pd.read_csv('ml-1m/movies.dat', header=None, sep='::',
                     encoding='latin-1', engine='python',
                     names=['movie_id', 'title', 'genres'])

In [3]:
ratings['user_id'] = ratings['user_id'] - 1
ratings['movie_id'] = ratings['movie_id'] - 1
users['user_id'] = users['user_id'] - 1
movies['movie_id'] = movies['movie_id'] - 1

In [4]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,0,1192,5,978300760
1,0,660,3,978302109
2,0,913,3,978301968
3,0,3407,4,978300275
4,0,2354,5,978824291


In [5]:
ratings.shape

(1000209, 4)

In [6]:
user_n = ratings['user_id'].nunique()
item_n = movies['movie_id'].nunique()
print("Number of users: {}".format(user_n))
print("Number of items: {}".format(item_n))

Number of users: 6040
Number of items: 3883


In [7]:
id_to_iid = {movies['movie_id'][i]:i for i in movies.index}
iid_to_id = {i:movies['movie_id'][i] for i in movies.index}


**Genre popularity**

In [8]:
movies.genres.str.split('|').explode().value_counts().sort_values(ascending=False)

Drama          1603
Comedy         1200
Action          503
Thriller        492
Romance         471
Horror          343
Adventure       283
Sci-Fi          276
Children's      251
Crime           211
War             143
Documentary     127
Musical         114
Mystery         106
Animation       105
Fantasy          68
Western          68
Film-Noir        44
Name: genres, dtype: int64

## Extract Movie Feature Vectors From Genre Information

Use tf-idf, a weighted frequecy, of each genre word as a feature.

In [9]:
tf =  TfidfVectorizer(analyzer=lambda x: (c for i in range(1, 4) for c in combinations(x.split('|'), r=i)))

This vectorizer transforms a movie's genre description into a list of tokens, where each token is a subsequence of the genres.

In [10]:
analyzer = tf.build_analyzer()
[token for token in analyzer('Action|Crime|Drama')]

[('Action',),
 ('Crime',),
 ('Drama',),
 ('Action', 'Crime'),
 ('Action', 'Drama'),
 ('Crime', 'Drama'),
 ('Action', 'Crime', 'Drama')]

Get the weighted frequency of each token and form the feature vector of this movie.

In [11]:
X_tfidf = tf.fit_transform(movies['genres'])
X_tfidf.shape

(3883, 353)

3883 movies, each has a feature vector of length 353 (which is the number of combinations of all the possible genres). Some of the movies are not in the dataset.

## Similarity 

In [12]:
sim = cosine_similarity(X_tfidf)

In [13]:
sim_df = pd.DataFrame(sim, index=movies['title'], columns=movies['title'])
sim_df.sample(5, axis=0).sample(5, axis=1).round(2)

title,"Edge, The (1997)",Metroland (1997),"Man with the Golden Gun, The (1974)",From the Journals of Jean Seberg (1995),Newsies (1992)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Six Degrees of Separation (1993),0.0,0.39,0.0,0.0,0.0
Under the Rainbow (1981),0.0,0.45,0.0,0.0,0.0
"Relic, The (1997)",0.0,0.0,0.0,0.0,0.0
Dead Calm (1989),0.41,0.0,0.0,0.0,0.0
Steel Magnolias (1989),0.0,0.39,0.0,0.0,0.0


## Content-based Filtering

In [14]:
train_df, test_df = train_test_split(ratings, test_size=0.2)

In [15]:
train_df.shape

(800167, 4)

In [18]:
def df_to_mat(df):
    """
    Convert DataFrame to sparse matrix.

    Arg:
        df: DataFrame, ratings dataframe with user_id, movie_id and rating

    Return:
        mat: scipy.sparse.csr_matrix, sparse ratings matrix with rows being items and cols being users
    """
    
    mat = sparse.lil_matrix((item_n, user_n))
    for _, row in df.iterrows():
        user_id = int(row[0])
        item_id = int(row[1])
        item_iid = id_to_iid[item_id]
        rating = row[2]
        mat[item_iid, user_id] = rating
    
    return mat 

In [19]:
item_user_mat = df_to_mat(train_df)
item_user_mat = item_user_mat.tocsr()
sparse.save_npz('ml-1m/item_user_mat.npz', item_user_mat)
item_user_mat.shape

(3883, 6040)

In [20]:
item_user_mat = sparse.load_npz('ml-1m/item_user_mat.npz')

In [21]:
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)
# fit the sparse matrix to the k nearest neighbor model
knn.fit(X_tfidf)

In [27]:
def recommend(user_id, topk):
    
    rated_before = np.nonzero(item_user_mat[:, user_id])[0]

    if rated_before.size > 0:
        # the user has at least reviewed one item
        raw_recommends = {}
        for item_iid in rated_before:
            distances, indices = knn.kneighbors(X_tfidf[item_iid], n_neighbors=topk+1)
            sorted_pairs = sorted(list(zip(indices.squeeze().tolist(),
                                           distances.squeeze().tolist())),
                                  key=lambda x: x[1])[:0:-1]
            raw_recommends[item_iid] = sorted_pairs
        
        # get the top 10 items
        top_items = []
        pos = 0
        while True:
            for item_iid in rated_before:
                next_neighbor_iid = raw_recommends[item_iid][pos][0]
                top_items.append(iid_to_id[next_neighbor_iid])
                if len(top_items) > topk - 1:
                    return (user_id, np.array(top_items))
            
            pos += 1
    else:
        # the user has no review
        top_items = list(map(lambda x: iid_to_id[x], random.sample(list(range(0, item_n)), topk)))
        return (user_id, np.array(top_items))

In [28]:
recommend(0, 10)

(0,
 array([2077, 1281,  854,  315, 1207, 2058, 3775,  593,  593,  593],
       dtype=int64))

**Make recommendations for all the users**

In [30]:
res = []
for user in range(user_n):
    res.append(recommend(user, 5))
user_recs_allinclude = {x[0]:x[1] for x in res}

KeyboardInterrupt: 

In [None]:
df = pd.DataFrame(user_recs_allinclude)
df.to_csv('user_recs_allinclude_df.csv', index=False)

## Evaluation

In [31]:
import math
# ground_truth: list of items ordered by time
def nDCG_Time(ground_truth, _recList):
    rec_num = len(_recList) # topK
    # ground_truth is already sorted by time
    idealOrder = ground_truth
    idealDCG = 0.0
    for j in range(min(rec_num, len(idealOrder))):
        idealDCG += ((math.pow(2.0, len(idealOrder) - j) - 1) / math.log(2.0 + j))

    recDCG = 0.0
    for j in range(rec_num):
        item = _recList[j]
        if item in ground_truth:
            rank = len(ground_truth) - ground_truth.index(item) # why ground truth?
            recDCG += ((math.pow(2.0, rank) - 1) / math.log(1.0 + j + 1))

    return (recDCG / idealDCG)


def Recall(_test_set, _recList):
    hit = len(set(_recList).intersection(set(_test_set)))
    return hit / float(len(_test_set))


def Precision(_test_set, _recList):
    hit = len(set(_recList).intersection(set(_test_set)))
    return hit / float(len(_recList))

In [32]:
test_df.shape

(200042, 4)

In [38]:
test_mat = df_to_mat(test_df)

In [37]:
f_users = users[users['gender'] == 'F']
m_users=  users[users['gender'] == 'M']

In [42]:
test_f_user_ids = f_users.sample(100)['user_id'].to_numpy()
test_m_user_ids = m_users.sample(100)['user_id'].to_numpy()

In [52]:
r = []
p = []
n = []

for user_id in test_f_user_ids:
    test_items_iids = list(np.argwhere(test_mat[:, user_id] >= 4)[:, 0])
    test_items = list(map(lambda x: iid_to_id[x], test_items_iids))
    
    if len(test_items) > 0:
        top_items = list(recommend(user_id, 10)[1])
        
        recall = Recall(test_items, top_items)
        precision = Precision(test_items, top_items)
        ndcg = nDCG_Time(test_items, top_items)

        r.append(recall)
        p.append(precision)
        n.append(ndcg)

print("For 100 randomly sampled female users:")
print(" avg-precision: %.3f\n avg-recall: %.3f\n avg-nDCG: %.3f" %
       (np.average(p),np.average(r),np.average(n)))

For 100 randomly sampled female users:
 avg-precision: 0.005
 avg-recall: 0.003
 avg-nDCG: 0.000


In [53]:
r = []
p = []
n = []

for user_id in test_m_user_ids:
    test_items_iids = list(np.argwhere(test_mat[:, user_id] >= 4)[:, 0])
    test_items = list(map(lambda x: iid_to_id[x], test_items_iids))
    
    if len(test_items) > 0:
        top_items = list(recommend(user_id, 10)[1])
        
        recall = Recall(test_items, top_items)
        precision = Precision(test_items, top_items)
        ndcg = nDCG_Time(test_items, top_items)

        r.append(recall)
        p.append(precision)
        n.append(ndcg)

print("For 100 randomly sampled male users:")
print(" avg-precision: %.3f\n avg-recall: %.3f\n avg-nDCG: %.3f" %
       (np.average(p),np.average(r),np.average(n)))

For 100 randomly sampled male users:
 avg-precision: 0.008
 avg-recall: 0.005
 avg-nDCG: 0.006


## Latent Dirichlet Allocation

Use a smaller tf-idf vectorizer for lda: only use genres for tokens, no combinations of genres.

In [77]:
small_tf =  TfidfVectorizer(analyzer=lambda x: (c for i in range(1, 2) for c in combinations(x.split('|'), r=i)))

In [79]:
small_analyzer = small_tf.build_analyzer()
[token for token in small_analyzer('Action|Crime|Sci-Fi')]

[('Action',), ('Crime',), ('Sci-Fi',)]

In [80]:
small_X_tfidf = small_tf.fit_transform(movies['genres'])
small_X_tfidf.shape

(3883, 18)

3883 movies with 18 token (genres) weights.

In [81]:
lda = LatentDirichletAllocation(n_components=10) # 5 topics

In [82]:
X_lda = lda.fit_transform(small_X_tfidf)

In [83]:
lda.components_.shape

(10, 18)

The weights of 20 tokens in 10 latent topics.

In [84]:
genres = small_tf.get_feature_names()

In [85]:
for i, weights in enumerate(lda.components_):
    zipped = zip(genres, weights)
    top_genres_key = sorted(zipped, key = lambda t: t[1], reverse=True)[:4]
    top_genres_list = list(dict(top_genres_key).keys())
    print("Topic " + str(i) + ": ", top_genres_list)

Topic 0:  [('Romance',), ('Drama',), ('Action',), ('Adventure',)]
Topic 1:  [('Comedy',), ('Drama',), ('Western',), ('Action',)]
Topic 2:  [("Children's",), ('Adventure',), ('Fantasy',), ('Comedy',)]
Topic 3:  [('Romance',), ('Comedy',), ('Drama',), ('Musical',)]
Topic 4:  [('Action',), ('Sci-Fi',), ('Adventure',), ('Thriller',)]
Topic 5:  [('Thriller',), ('Crime',), ('Drama',), ('Action',)]
Topic 6:  [('War',), ('Musical',), ('Drama',), ('Action',)]
Topic 7:  [('Horror',), ('Sci-Fi',), ('Thriller',), ('Comedy',)]
Topic 8:  [('Mystery',), ('Animation',), ("Children's",), ('Thriller',)]
Topic 9:  [('Drama',), ('Documentary',), ('Western',), ('Comedy',)]
