In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql

from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

#scripts
from src import rank_metrics
from src import helpers
from src import table_encoder
from src import metrics

In [3]:
spark = SparkSession(SparkContext())

In [4]:
data_path = '../data/csv/'
ratings_df = pd.DataFrame(pd.read_csv(os.path.join(data_path, 'ratings.csv')))
movies_df = pd.DataFrame(pd.read_csv(os.path.join(data_path, 'movies.csv')))
encoded_movies_df = pd.DataFrame(pd.read_csv(os.path.join(data_path, 'encoded_movies.csv')))
tags_df = pd.DataFrame(pd.read_csv(os.path.join(data_path, 'tags.csv')))
enoded_tags_df = pd.DataFrame(pd.read_csv(os.path.join(data_path,'encoded_tags.csv')))

In [5]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [6]:
ratings_df = ratings_df.drop('timestamp', axis=1)

In [7]:
ratings_df.head(3)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0


In [8]:
ratings = spark.createDataFrame(ratings_df)
(training, test) = ratings.randomSplit([0.8, 0.2])

In [9]:
training.head()

Row(userId=1, movieId=1, rating=4.0)

In [10]:
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [11]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("RMSE = " + str(rmse))

# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(5)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(5)

RMSE = 1.0900114098353333


In [12]:
userRecs_df = helpers.spark_to_pandas(userRecs)
userRecs_df

Unnamed: 0,userId,recommendations
0,471,"[(69406, 8.646392822265625), (3676, 8.62685394..."
1,463,"[(176371, 6.762641906738281), (3676, 6.7588529..."
2,496,"[(52435, 10.614336013793945), (70946, 10.36056..."
3,148,"[(176371, 6.391848564147949), (177765, 6.30470..."
4,540,"[(177593, 6.626405239105225), (2318, 6.5696363..."
...,...,...
605,208,"[(80693, 9.882692337036133), (1958, 9.13183689..."
606,401,"[(80693, 5.3502044677734375), (971, 5.25591182..."
607,422,"[(3676, 6.344318389892578), (69644, 5.63129377..."
608,517,"[(87, 6.335902214050293), (46, 6.1987605094909..."


In [13]:
top_movies_and_rankings = helpers.get_top_movies_and_ratings(userRecs_df, 90, movies_df)
top_movie_ratings = list(top_movies_and_rankings.values())
top_movie_recs = list(top_movies_and_rankings.keys())

In [14]:
top_movie_recs

['Professional, The (Le professionnel) (1981)',
 'Last Tango in Paris (Ultimo tango a Parigi) (1972)',
 'Metropolitan (1990)',
 "Jacob's Ladder (1990)",
 'When We Were Kings (1996)']

In [15]:
user_rates = ratings_df[ratings_df['userId']==90]
good_user_rates = user_rates[user_rates['rating']>3]

In [16]:
user_movies = []
for movieId in list(good_user_rates['movieId']):
    user_movies.append(movies_df[movies_df['movieId']==movieId]['title'].values[0])
user_movies

['Sabrina (1995)',
 'Nixon (1995)',
 'Sense and Sensibility (1995)',
 'Leaving Las Vegas (1995)',
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
 'Dead Man Walking (1995)',
 'Mighty Aphrodite (1995)',
 'Postman, The (Postino, Il) (1994)',
 "Antonia's Line (Antonia) (1995)",
 'Angels and Insects (1995)',
 'Anne Frank Remembered (1995)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Welcome to the Dollhouse (1995)',
 'Celluloid Closet, The (1995)',
 'Fargo (1996)',
 'Jane Eyre (1996)',
 'Flower of My Secret, The (La flor de mi secreto) (1995)',
 'Cemetery Man (Dellamorte Dellamore) (1994)',
 'I Shot Andy Warhol (1996)',
 'Last Klezmer: Leopold Kozlowski, His Life and Music, The (1994)',
 'Very Brady Sequel, A (1996)',
 'Twelfth Night (1996)',
 'Pompatus of Love, The (1996)',
 'Freeway (1996)',
 'Secrets & Lies (1996)',
 'Beautiful Thing (1996)',
 'Everyone Says I Love You (1996)',
 'Swingers (1996)',
 'Willy Wonka & the Chocolate Factory (1971)',
 'Single Girl, A (Fille seule, La) (1995

In [17]:
rank_metrics.ndcg_at_k(top_movie_ratings, 5, 0)

1.0

In [18]:
movies_df[movies_df['movieId']==1111]['title'].values[0]

"Microcosmos (Microcosmos: Le peuple de l'herbe) (1996)"

In [19]:
movies_df[movies_df['movieId']==1111]['title'].index[0]

844

In [20]:
top_movie_ratings

[6.023739814758301,
 6.01416540145874,
 6.006021022796631,
 5.917660236358643,
 5.916294097900391]

In [21]:
encoded_movies_df[encoded_movies_df['movieId']==1111]['title'].values[0]

"Microcosmos (Microcosmos: Le peuple de l'herbe) (1996)"

In [22]:
movie_ids = []
for title in top_movie_recs:
    movie_ids.append(int(movies_df[movies_df['title']==title]['movieId']))
movie_ids

[5782, 7008, 1966, 3476, 1147]

In [23]:
metrics.compare_movie(232, 984, encoded_genres = encoded_movies_df)

array([[0.99997343]])

In [24]:
helpers.user_liked_compared_recommended(ratings_df, movies_df, userRecs_df, 90)

users liked movies:
 ['Sabrina (1995)', 'Nixon (1995)', 'Sense and Sensibility (1995)', 'Leaving Las Vegas (1995)', 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', 'Dead Man Walking (1995)', 'Mighty Aphrodite (1995)', 'Postman, The (Postino, Il) (1994)', "Antonia's Line (Antonia) (1995)", 'Angels and Insects (1995)', 'Anne Frank Remembered (1995)', 'Star Wars: Episode IV - A New Hope (1977)', 'Welcome to the Dollhouse (1995)', 'Celluloid Closet, The (1995)', 'Fargo (1996)', 'Jane Eyre (1996)', 'Flower of My Secret, The (La flor de mi secreto) (1995)', 'Cemetery Man (Dellamorte Dellamore) (1994)', 'I Shot Andy Warhol (1996)', 'Last Klezmer: Leopold Kozlowski, His Life and Music, The (1994)', 'Very Brady Sequel, A (1996)', 'Twelfth Night (1996)', 'Pompatus of Love, The (1996)', 'Freeway (1996)', 'Secrets & Lies (1996)', 'Beautiful Thing (1996)', 'Everyone Says I Love You (1996)', 'Swingers (1996)', 'Willy Wonka & the Chocolate Factory (1971)', 'Single Girl, A (Fille seule, La) (1995)', 'Hus

In [25]:
movies_df[movies_df['title']=='Toy Story (1995)']

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [26]:
movies_df[movies_df['title']=='Pompatus of Love, The (1996)']

Unnamed: 0,movieId,title,genres
750,984,"Pompatus of Love, The (1996)",Comedy|Drama
