In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark import SparkContext
import pyspark.sql

from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

#scripts
from src import rank_metrics
from src import helpers
from src import table_encoder
from src import metrics

In [3]:
spark = SparkSession(SparkContext())

In [4]:
data_path = '../data/csv'

In [5]:
def load_format_data(data_path):
    '''
    Reads all necessary csv files to pandas dataframes
    Data_path is the path to a folder containing csv files 
    loaded below. If running from our jupyter notebook, this 
    will be "../data/csv"
    '''
    ratings_df = pd.DataFrame(pd.read_csv(os.path.join(data_path, 'ratings.csv')))
    movies_df = pd.DataFrame(pd.read_csv(os.path.join(data_path, 'movies.csv')))
    encoded_movies_df = pd.DataFrame(pd.read_csv(os.path.join(data_path, 'encoded_movies.csv'))).drop('Unnamed: 0', axis=1)
    tags_df = pd.DataFrame(pd.read_csv(os.path.join(data_path, 'tags.csv')))
    enoded_tags_df = pd.DataFrame(pd.read_csv(os.path.join(data_path,'encoded_tags.csv')))
    
    return ratings_df, movies_df, encoded_movies_df, tags_df, enoded_tags_df

In [6]:
ratings_df, movies_df, encoded_movies_df, tags_df, enoded_tags_df = load_format_data(data_path)

In [7]:
def load_format_data_for_model(data_path):
    '''
    Call load_format_data, drop irrelevant columns and return train/test
    data as spark dataframes
    '''
    ratings_df, movies_df, encoded_movies_df, tags_df, enoded_tags_df = load_format_data(data_path)
    ratings_df = ratings_df.drop('timestamp', axis=1)
    ratings = spark.createDataFrame(ratings_df)
    (train, test) = ratings.randomSplit([0.8, 0.2])
    return train, test

In [8]:
train, test = load_format_data_for_model(data_path)

In [9]:
def fit_als(training_data):
    als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")
    model = als.fit(training_data)
    return model

In [10]:
model = fit_als(train)

In [11]:
def predict_for_all_users(test_data):
    # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("RMSE = " + str(rmse))

    # Generate top 10 movie recommendations for each user
    userRecs = model.recommendForAllUsers(5)
    # Generate top 10 user recommendations for each movie
    movieRecs = model.recommendForAllItems(5)
    return userRecs, movieRecs

In [12]:
allUserRecs, allMovieRecs = predict_for_all_users(test)

RMSE = 1.0829909931081698


In [13]:
def predict_for_one_users(user_id, test_data, movie_df):
    sample = test_data[test_data['userId'] == user_id]
    predictions = model.transform(sample)
    prediction_ids = helpers.spark_to_pandas(predictions).sort_values('prediction', ascending=False)[:5]['movieId']
    prediction_titles = []
    for id in prediction_ids:
        title = movie_df[movie_df['movieId']==id]['title']
        prediction_titles.append(title.values[0])
    return prediction_titles

In [14]:
user_prediction = predict_for_one_users(5, test, movies_df)
user_prediction

['Eat Drink Man Woman (Yin shi nan nu) (1994)',
 'Dead Man Walking (1995)',
 'Fargo (1996)',
 'Shawshank Redemption, The (1994)',
 'Quiz Show (1994)']

In [15]:
def get_user_liked_movie_ids(user_id, ratings_df):
    user_rates = ratings_df[ratings_df['userId']==1]
    good_user_rates = user_rates[user_rates['rating']>3]
    liked_movies = list(good_user_rates['movieId'])
    return liked_movies

In [16]:
user_liked_movie_ids = get_user_liked_movie_ids(5, ratings_df)

In [17]:
def get_user_liked_movie_titles(user_id, ratings_df, movies_df):
    ids = get_user_liked_movie_ids(5, ratings_df)
    movie_titles = []
    for id in ids:
        movie_title = movies_df[movies_df['movieId']==id]['title']
        movie_titles.append(movie_title.values[0])
    return movie_titles

In [18]:
user_liked_movie_titles = get_user_liked_movie_titles(5, ratings_df, movies_df)[:5]

In [19]:
def get_ids_from_titles(title_list, movies_df):
    movie_ids = []
    for title in title_list:
        movie_ids.append(int(movies_df[movies_df['title']==title]['movieId']))
    return movie_ids

In [20]:
rec_movie_ids = get_ids_from_titles(user_prediction, movies_df)
rec_movie_ids

[232, 36, 608, 318, 300]

In [21]:
metrics.compare_recs_to_user_likes(rec_movie_ids, user_liked_movie_ids, encoded_genres=encoded_movies_df)

Unnamed: 0,movie ids,similarity
0,232,0.280194
1,36,0.213679
2,608,0.323127
3,318,0.213679
4,300,0.191866


In [22]:
metrics.compare_movie(232, 984, encoded_movies_df)

0.8165