In [42]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [43]:
import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql

from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

#scripts
from src import rank_metrics
from src import helpers
from src import table_encoder
from src import metrics

In [3]:
spark = SparkSession(SparkContext())

In [58]:
data_path = '../data/csv/'
ratings_df = pd.DataFrame(pd.read_csv(os.path.join(data_path, 'ratings.csv')))
movies_df = pd.DataFrame(pd.read_csv(os.path.join(data_path, 'movies.csv')))
encoded_movies_df = pd.DataFrame(pd.read_csv(os.path.join(data_path, 'encoded_movies.csv')))
encoded_movies_df = encoded_movies_df.drop('Unnamed: 0', axis=1)
tags_df = pd.DataFrame(pd.read_csv(os.path.join(data_path, 'tags.csv')))
enoded_tags_df = pd.DataFrame(pd.read_csv(os.path.join(data_path,'encoded_tags.csv')))

In [5]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [6]:
ratings_df = ratings_df.drop('timestamp', axis=1)

In [7]:
ratings_df.head(3)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0


In [8]:
ratings = spark.createDataFrame(ratings_df)
(training, test) = ratings.randomSplit([0.8, 0.2])

In [9]:
training.head()

Row(userId=1, movieId=1, rating=4.0)

In [10]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [11]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("RMSE = " + str(rmse))

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(5)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(5)

RMSE = 1.0682942226879428


In [122]:
userRecs_df = helpers.spark_to_pandas(userRecs)
userRecs_df.head()

Unnamed: 0,userId,recommendations
0,471,"[(932, 9.259642601013184), (8638, 8.9710712432..."
1,463,"[(6857, 7.766437530517578), (3030, 6.936159610..."
2,496,"[(3272, 6.929464817047119), (4649, 6.589651107..."
3,148,"[(27611, 7.168304443359375), (68793, 6.6775064..."
4,540,"[(3030, 6.684593677520752), (3503, 6.585985183..."


In [107]:
top_movies_and_rankings = helpers.get_top_movies_and_ratings(userRecs_df, 1, movies_df)
top_movie_ratings = list(top_movies_and_rankings.values())
top_movie_recs = list(top_movies_and_rankings.keys())

In [108]:
top_movie_recs

['Dunkirk (2017)',
 'Sanjuro (Tsubaki Sanjûrô) (1962)',
 'Before Sunrise (1995)',
 'Yojimbo (1961)',
 'Key Largo (1948)']

In [109]:
user_rates = ratings_df[ratings_df['userId']==1]
good_user_rates = user_rates[user_rates['rating']>3]

In [127]:
good_user_rates = list(good_user_rates['movieId'])

In [110]:
rank_metrics.ndcg_at_k(top_movie_ratings, 5, 0)

1.0

In [111]:
movies_df[movies_df['movieId']==1111]['title'].values[0]

"Microcosmos (Microcosmos: Le peuple de l'herbe) (1996)"

In [112]:
movies_df[movies_df['movieId']==1111]['title'].index[0]

844

In [113]:
top_movie_ratings

[7.291133403778076,
 6.976787567138672,
 6.975888729095459,
 6.872860908508301,
 6.766486167907715]

In [114]:
encoded_movies_df[encoded_movies_df['movieId']==1111]['title'].values[0]

"Microcosmos (Microcosmos: Le peuple de l'herbe) (1996)"

In [115]:
movie_ids = []
for title in top_movie_recs:
    movie_ids.append(int(movies_df[movies_df['title']==title]['movieId']))
movie_ids

[174055, 2905, 215, 3030, 3334]

In [116]:
metrics.compare_movie(232, 984, encoded_genres = encoded_movies_df)

0.8165

In [117]:
helpers.user_liked_compared_recommended(ratings_df, movies_df, userRecs_df, 1)

users liked movies:
 ['Toy Story (1995)', 'Grumpier Old Men (1995)', 'Heat (1995)', 'Seven (a.k.a. Se7en) (1995)', 'Usual Suspects, The (1995)', 'Bottle Rocket (1996)', 'Braveheart (1995)', 'Rob Roy (1995)', 'Canadian Bacon (1995)', 'Desperado (1995)', 'Billy Madison (1995)', 'Dumb & Dumber (Dumb and Dumber) (1994)', 'Ed Wood (1994)', 'Star Wars: Episode IV - A New Hope (1977)', 'Tommy Boy (1995)', 'Clear and Present Danger (1994)', 'Forrest Gump (1994)', 'Jungle Book, The (1994)', 'Mask, The (1994)', 'Dazed and Confused (1993)', 'Fugitive, The (1993)', 'Jurassic Park (1993)', "Schindler's List (1993)", 'So I Married an Axe Murderer (1993)', 'Three Musketeers, The (1993)', 'Tombstone (1993)', 'Dances with Wolves (1990)', 'Batman (1989)', 'Silence of the Lambs, The (1991)', 'Pinocchio (1940)', 'Fargo (1996)', 'James and the Giant Peach (1996)', 'Rock, The (1996)', "She's the One (1996)", 'Wizard of Oz, The (1939)', 'Citizen Kane (1941)', 'Adventures of Robin Hood, The (1938)', 'Ghost an

In [118]:
movies_df[movies_df['title']=='Toy Story (1995)']

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [119]:
movies_df[movies_df['title']=='Pompatus of Love, The (1996)']

Unnamed: 0,movieId,title,genres
750,984,"Pompatus of Love, The (1996)",Comedy|Drama


In [128]:
metrics.compare_recs_to_user_likes(movie_ids, good_user_rates, encoded_genres=encoded_movies_df)

Unnamed: 0,movie ids,similarity
0,174055,0.292535
1,2905,0.35177
2,215,0.185038
3,3030,0.295158
4,3334,0.212563


In [137]:
# metrics.compare_movie(1254, 33, encoded_genres=encoded_movies_df)
encoded_movies_df[encoded_movies_df['movieId']==3334]['title']

2498    Key Largo (1948)
Name: title, dtype: object

In [129]:
good_user_rates

[1,
 3,
 6,
 47,
 50,
 101,
 110,
 151,
 157,
 163,
 216,
 231,
 235,
 260,
 333,
 349,
 356,
 362,
 367,
 441,
 457,
 480,
 527,
 543,
 552,
 553,
 590,
 592,
 593,
 596,
 608,
 661,
 733,
 804,
 919,
 923,
 940,
 943,
 954,
 1023,
 1024,
 1025,
 1029,
 1031,
 1032,
 1042,
 1049,
 1060,
 1073,
 1080,
 1089,
 1090,
 1092,
 1097,
 1127,
 1136,
 1196,
 1197,
 1198,
 1206,
 1208,
 1210,
 1213,
 1214,
 1220,
 1222,
 1224,
 1226,
 1240,
 1256,
 1265,
 1270,
 1275,
 1278,
 1282,
 1291,
 1298,
 1348,
 1473,
 1500,
 1517,
 1552,
 1573,
 1587,
 1617,
 1620,
 1625,
 1732,
 1777,
 1793,
 1804,
 1805,
 1920,
 1927,
 1954,
 1967,
 2000,
 2005,
 2012,
 2018,
 2028,
 2033,
 2046,
 2048,
 2054,
 2058,
 2078,
 2090,
 2094,
 2096,
 2099,
 2105,
 2115,
 2116,
 2137,
 2139,
 2141,
 2143,
 2161,
 2174,
 2193,
 2268,
 2273,
 2291,
 2329,
 2353,
 2366,
 2387,
 2395,
 2406,
 2427,
 2450,
 2459,
 2470,
 2478,
 2492,
 2502,
 2529,
 2542,
 2571,
 2580,
 2596,
 2616,
 2628,
 2640,
 2641,
 2644,
 2648,
 2654,
 269

In [106]:
# metrics.compare_movie(1254, 33, encoded_genres=encoded_movies_df)
encoded_movies_df[encoded_movies_df['movieId']==14]['title']

13    Nixon (1995)
Name: title, dtype: object

In [48]:
avg_cos_scores = []
for rec_id in recommendation_ids:
    similarities = []
    div = len(liked_ids)
    for liked_id in liked_ids:
        similarities.append(compare_movie(rec_id, liked_id, encoded_genres=encoded_genres))
    avg_cos_score = sum(similarities)/div
    avg_cos_scores.append(avg_cos_score)
pd.DataFrame(zip(recommendation_ids, avg_cos_scores))

NameError: name 'recommendation_ids' is not defined