### Recommendation Algos Demo - Ryan

In [2]:
import pandas as pd
import numpy as np
import sklearn
from zipfile import ZipFile
from dask import dataframe as dd 
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate
from collections import defaultdict
from surprise import KNNBaseline
from pyspark.sql import SQLContext, SparkSession
from pyspark.ml.recommendation import ALS
from pyspark import SparkContext, SparkConf
import implicit
import boto3
from sklearn.utils import shuffle
# %load_ext line_profiler
#zip = ZipFile('data/ml-latest-small.zip')
#zip.extractall()

In [104]:
# Read each MovieLens CSV into a DataFrame, keyed by its table name.
df_dict = {
    table: pd.read_csv(f'data/ml-latest-small/{table}.csv')
    for table in {'links', 'movies', 'ratings', 'tags'}
}

# Give every table its own module-level name for the cells that follow.
movies_df, links_df, ratings_df, tags_df = (
    df_dict[table] for table in ('movies', 'links', 'ratings', 'tags')
)

### MovieLens Dataset 

https://grouplens.org/datasets/movielens/


In [107]:
# Show the movie catalogue indexed by movieId. set_index returns a new frame,
# so movies_df itself keeps movieId as an ordinary column.
movies_df.set_index('movieId')

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
# Preview the first seven rows of the ratings table.
ratings_df.head(7)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868


In [108]:
# Join every rating with its movie metadata; the additional join against
# links_df keeps only movies that also have a links entry (merge defaults to
# an inner join).
df_merged = (
    movies_df
    .merge(ratings_df, on='movieId')
    .merge(links_df, on='movieId')
)
input_df_model = df_merged.loc[:, ['userId', 'movieId', 'rating', 'title']]

# One row per distinct (title, movieId) pair, used to build the lookups below.
movie_id_name_df = input_df_model.loc[:, ['title', 'movieId']].drop_duplicates()
keys = movie_id_name_df['movieId']
values = movie_id_name_df['title']

# Lookup tables in both directions. If two ids share the same title, the id
# that appears last wins in title_to_id_map.
id_to_title_map = dict(zip(keys, values))
title_to_id_map = dict(zip(values, keys))

### SVD

In [11]:
def get_top_n(predictions, n=10):
    '''Group predictions by user and keep each user's n best-scoring items.

    Args:
        predictions: Iterable of 5-field prediction records laid out as
            (user id, item id, true rating, estimated rating, details),
            such as the list returned by an algorithm's test method.
        n(int): How many recommendations to keep per user. Default is 10.

    Returns:
        A defaultdict mapping each user (raw) id to a list of
        (raw item id, estimated rating) tuples, ordered from highest to
        lowest estimate and truncated to n entries.
    '''
    recommendations = defaultdict(list)

    # Bucket (item, estimate) pairs under the user they were predicted for.
    for uid, iid, _true_rating, est, _details in predictions:
        recommendations[uid].append((iid, est))

    # Rank each bucket by estimate, best first, and keep the leading n.
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for uid in recommendations:
        ranked = sorted(recommendations[uid],
                        key=lambda pair: pair[1],
                        reverse=True)
        recommendations[uid] = ranked[:n]

    return recommendations


In [9]:
# A Reader is still needed when loading from a DataFrame, but only its
# rating_scale parameter is required. Derive the scale from the data instead
# of hard-coding (1, 5): Surprise clips predictions to this range, so a lower
# bound above the smallest real rating would distort low estimates.
rating_scale = (
    float(input_df_model['rating'].min()),
    float(input_df_model['rating'].max()),
)
reader = Reader(rating_scale=rating_scale)

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(input_df_model[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()

# Seed the factor initialisation so the recommendations are reproducible
# across kernel restarts.
algo = SVD(random_state=42)
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [14]:
top_n = get_top_n(predictions, n=5)

# Print the recommended titles for each user. The list is handed to print()
# directly, which renders its repr.
for uid, user_ratings in top_n.items():
    titles = [id_to_title_map[iid] for iid, _ in user_ratings]
    print(f'Recommendations for User {uid}:', titles, '\n\n')


Recommendations for User 1: ['Shawshank Redemption, The (1994)', 'In the Name of the Father (1993)', 'Ghost in the Shell (Kôkaku kidôtai) (1995)', 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)', 'Godfather, The (1972)'] 


Recommendations for User 5: ['Notorious (1946)', 'Laputa: Castle in the Sky (Tenkû no shiro Rapyuta) (1986)', 'Boogie Nights (1997)', 'North by Northwest (1959)', 'Evil Dead II (Dead by Dawn) (1987)'] 


Recommendations for User 7: ['North by Northwest (1959)', 'Rear Window (1954)', 'Streetcar Named Desire, A (1951)', 'Dark Knight, The (2008)', 'Last King of Scotland, The (2006)'] 


Recommendations for User 15: ['Goodfellas (1990)', 'Fargo (1996)', 'Lawrence of Arabia (1962)', 'Godfather: Part II, The (1974)', 'Great Escape, The (1963)'] 


Recommendations for User 17: ['Life Is Beautiful (La Vita è bella) (1997)', 'Lawrence of Arabia (1962)', 'Trainspotting (1996)', "Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)", 'Boondock Sai

Recommendations for User 435: ['Great Escape, The (1963)', 'Streetcar Named Desire, A (1951)', 'Boot, Das (Boat, The) (1981)', 'Philadelphia Story, The (1940)', 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)'] 


Recommendations for User 29: ['Indiana Jones and the Last Crusade (1989)', 'Wallace & Gromit: The Best of Aardman Animation (1996)', 'Fight Club (1999)', 'Star Wars: Episode IV - A New Hope (1977)', 'Cool Hand Luke (1967)'] 


Recommendations for User 69: ['Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)', 'Lawrence of Arabia (1962)', 'Apocalypse Now (1979)', 'Seven Samurai (Shichinin no samurai) (1954)', 'Fight Club (1999)'] 


Recommendations for User 197: ['Lost in Translation (2003)', 'Trainspotting (1996)', 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Boondock Saints, The (2000)'] 


Recommendations for User 211: ['Casabl

### K-NN with similarity measures

In [80]:
# Item-item similarities (user_based=False) with the pearson_baseline measure.
sim_options = dict(name='pearson_baseline', user_based=False)

algo_knn = KNNBaseline(sim_options=sim_options)
algo_knn.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0xceb74c390>

In [91]:
movie = 'Jumanji (1995)'

# Translate the title into its raw MovieLens id, then into the inner item id
# used by the fitted trainset.
movie_raw_id = title_to_id_map[movie]
movie_inner_id = algo_knn.trainset.to_inner_iid(movie_raw_id)

# Inner ids of the 10 nearest neighbors of `movie`.
neighbor_inner_ids = algo_knn.get_neighbors(movie_inner_id, k=10)

# Convert inner ids -> raw ids -> titles. Built as a list so the result can
# be inspected again after it has been printed.
movie_neighbors = [
    id_to_title_map[algo_knn.trainset.to_raw_iid(inner_id)]
    for inner_id in neighbor_inner_ids
]

print()
print(f'The 10 nearest neighbors of {movie} are:\n')
# A dedicated loop name keeps `movie` bound to the query title.
for neighbor_title in movie_neighbors:
    print(neighbor_title)


The 10 nearest neighbors of Jumanji (1995) are:

Mrs. Doubtfire (1993)
Mask, The (1994)
Back to the Future (1985)
Liar Liar (1997)
True Lies (1994)
Casper (1995)
Back to the Future Part III (1990)
Santa Clause, The (1994)
Prestige, The (2006)
Babe (1995)


### AWS Personalize

In [148]:
## data format for dumping into AWS S3

# Build the interactions table with USER_ID / ITEM_ID / TIMESTAMP columns:
#   * shuffle with a fixed seed so the row order (and therefore every
#     position-based lookup in later cells) is the same on each run;
#   * keep only ratings above 3.6 as positive interactions;
#   * rename by column name rather than by position, so a change in column
#     order cannot silently mislabel the data;
#   * cap the result at 100000 rows.
ratings = (
    shuffle(ratings_df, random_state=42)
    .loc[lambda frame: frame['rating'] > 3.6]
    .drop(columns='rating')
    .rename(columns={'userId': 'USER_ID',
                     'movieId': 'ITEM_ID',
                     'timestamp': 'TIMESTAMP'})
    .iloc[:100000]
)
#ratings.to_csv('ratings.processed.csv',index=False)


In [149]:
# Keep the first remaining interaction per user: one highly rated movie each.
one_movie_user = (
    ratings
    .drop_duplicates(subset='USER_ID')
    .drop(columns='TIMESTAMP')
    .reset_index(drop=True)
)

# Attach title and genres for readability, then drop the duplicated id column.
one_movie_user_df = (
    one_movie_user
    .merge(movies_df, left_on='ITEM_ID', right_on='movieId')
    .drop(columns='movieId')
)
one_movie_user_df.head(25)

Unnamed: 0,USER_ID,ITEM_ID,title,genres
0,69,1721,Titanic (1997),Drama|Romance
1,593,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
2,510,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
3,424,4450,Bully (2001),Crime|Drama|Thriller
4,602,357,Four Weddings and a Funeral (1994),Comedy|Romance
5,372,357,Four Weddings and a Funeral (1994),Comedy|Romance
6,443,608,Fargo (1996),Comedy|Crime|Drama|Thriller
7,220,608,Fargo (1996),Comedy|Crime|Drama|Thriller
8,302,608,Fargo (1996),Comedy|Crime|Drama|Thriller
9,120,608,Fargo (1996),Comedy|Crime|Drama|Thriller


#### AWS results using AutoML - HRNN

In [151]:
# Row of one_movie_user_df to query, and the campaign to send the query to.
idx = 19
campaign_arn = "arn:aws:personalize:us-east-1:376337229415:campaign/aws-automl"

# The first two columns of the chosen row are the user id and the item id.
user_id, item_id = one_movie_user_df.iloc[idx, :2]

# Clients for the Personalize control plane and for the runtime endpoint.
personalize = boto3.client(
    service_name='personalize',
    endpoint_url='https://personalize.us-east-1.amazonaws.com',
)
personalize_runtime = boto3.client(
    service_name='personalize-runtime',
    endpoint_url='https://personalize-runtime.us-east-1.amazonaws.com',
)

# Both ids are sent as strings.
request = {
    'campaignArn': campaign_arn,
    'userId': str(user_id),
    'itemId': str(item_id),
}
get_recommendations_response = personalize_runtime.get_recommendations(**request)
item_list = get_recommendations_response['itemList']

user_movie = id_to_title_map[item_id]
print(f"For user {user_id} who likes '{user_movie}', we can recommend:\n")

# Returned item ids are converted back to int to match id_to_title_map's keys.
for recommended in item_list:
    print(id_to_title_map[int(recommended['itemId'])])

For user 499 who likes 'Velvet Goldmine (1998)', we can recommend:

Training Day (2001)
Bill & Ted's Excellent Adventure (1989)
2001: A Space Odyssey (1968)
Schindler's List (1993)
Airplane! (1980)
Pleasantville (1998)
Naked Gun: From the Files of Police Squad!, The (1988)
Mary Poppins (1964)
Princess Bride, The (1987)
Signs (2002)
Pi (1998)
Big Trouble in Little China (1986)
Blade Runner (1982)
Illusionist, The (2006)
Office Space (1999)
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)
High Fidelity (2000)
Escape from New York (1981)
Austin Powers: International Man of Mystery (1997)
Rushmore (1998)
Usual Suspects, The (1995)
Close Encounters of the Third Kind (1977)
Rounders (1998)
Ocean's Eleven (2001)
Collateral (2004)
