<a href="https://colab.research.google.com/github/rachelwijaya/MovieRecommendation/blob/main/MovieRecommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [167]:
# import csv files
import time
import numpy as np
import pandas as pd
from google.colab import files
uploaded = files.upload()

Saving movies.csv to movies (1).csv
Saving user_ratings.csv to user_ratings (1).csv


In [168]:
import io
movies_df = pd.read_csv(io.BytesIO(uploaded['movies.csv']))
user_ratings_df = pd.read_csv(io.BytesIO(uploaded['user_ratings.csv']))
# Dataset is now stored in a Pandas Dataframe

# Jaccard Similarity

In [181]:
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform

# Returns a condensed matrix of all the distances

 # Crosstabulation
movies_genre_df = pd.crosstab(movies_df['title'], movies_df['genres'])

# Find distance between all items
# WARNING: please be patient, calculating this is going to take a minute or two
t1 = time.perf_counter()
jaccard_distances = pdist(movies_genre_df.values, metric='jaccard')

t2 = time.perf_counter()
print('time elapsed: ', t2-t1)
# print(jaccard_distances)

# Get the data into a rectangular shape based on the previous matrix
square_jaccard_distances = squareform(jaccard_distances)
# print(square_jaccard_distances)

time elapsed:  137.99027016400214


In [182]:
# Convert the distances to a square matrix
jaccard_similarity_array = 1 - square_jaccard_distances
# print(jaccard_similarity_array)

# Create a dataframe for the distance of each book according to its genre
distance_df = pd.DataFrame(jaccard_similarity_array,
                           index=movies_genre_df.index,
                           columns=movies_genre_df.index)
# distance_df.head()

In [183]:
# Find a similar movie
movie_name = input("Enter a movie title: ")

# Find the values for the movie
# movie_name_genres = movies_genre_df[movies_genre_df.index == movie_name]
jaccard_similarity_series = distance_df.loc[movie_name]

# Sort these values from highest to lowest
ordered_similarities = jaccard_similarity_series.sort_values(ascending = False)

# Print the results
print("\nMovies you would like:")
print(ordered_similarities.head())


Enter a movie title: Jumanji (1995)

Movies you would like:
title
Escape to Witch Mountain (1975)              1.0
Golden Compass, The (2007)                   1.0
Indian in the Cupboard, The (1995)           1.0
The Cave of the Golden Rose (1991)           1.0
Darby O'Gill and the Little People (1959)    1.0
Name: Jumanji (1995), dtype: float64


# Cosine Similarity

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
#helper functions
cv = CountVectorizer()

def get_title_from_index(index):
	return movies_df[movies_df.index == index]["title"].values[0]

def get_index_from_title(title):
	return movies_df[movies_df.title == title]["index"].values[0]

In [None]:
#selecting the feature
features = ['title', 'genres']
for feature in features:
	movies_df[feature] = movies_df[feature].fillna('')

In [None]:
#combining all selected features into one column
def combined_features(row):
    try:
        return row['title'] + " " + row['genres']
    except:
        print("Error:", row)

movies_df["index"] = range(len(movies_df))
movies_df["combined_features"] = movies_df.apply(combined_features,axis=1)
                                                    #will pass each row individually instead of columns

print("combined features:", movies_df["combined_features"].head())

combined features: 0    Toy Story (1995) Adventure|Animation|Children|...
1            Jumanji (1995) Adventure|Children|Fantasy
2               Grumpier Old Men (1995) Comedy|Romance
3        Waiting to Exhale (1995) Comedy|Drama|Romance
4            Father of the Bride Part II (1995) Comedy
Name: combined_features, dtype: object


In [None]:
def recommendedMovieList(user_movie):
    #creating a count matrix from the combined column
    count_matrix = cv.fit_transform(movies_df["combined_features"])

    #finding the cosine similarity value from the count matrix
    cosine_sim = cosine_similarity(count_matrix) 

    #getting the movieId from its title
    movie_index = get_index_from_title(user_movie)

    #getting a list of similar movies by calculating their similarity scores
    similar_movies = list(enumerate(cosine_sim[movie_index]))

    #sorting the similarity scores in descending order
    return sorted(similar_movies, key=lambda x:x[1], reverse=True)

In [None]:
#asking for user input
user_movie = input("Please enter a movie: ")

Please enter a movie: Jumanji (1995)


In [None]:
t1 = time.perf_counter()
movie_list = recommendedMovieList(user_movie)

#printing the titles of 5 movies that are found to be similar
i = 0
for movie in movie_list:
		print(get_title_from_index(movie[0]))
		i += 1
		if i > 5:
			break
t2 = time.perf_counter()
print('time_elapsed: ', t2-t1)

Jumanji (1995)
Tall Tale (1995)
Casper (1995)
Toy Story (1995)
Balto (1995)
Gordy (1995)
time_elapsed:  1.0777893090007638


# User-based Recommendation (cosine_similarity)
cosine similarity is the measure of the angle between two documents in the high dimensional matrix space

as seen in this two dimensional example [here](https://imgur.com/a/NH9FMlH)

all values are between 0 and 1 where 1 is an exact match

In [None]:
ratings = pd.merge(movies_df,user_ratings_df).drop(['genres', 'timestamp'], axis=1)
ratings.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [None]:
userRatings = ratings.pivot_table(index=['userId'],columns=['title'],values='rating')
userRatings.head()

#Remove movies that have less than 10 user's rating (that are not NaN) and fill NaN values with 0
userRatings = userRatings.dropna(thresh=10, axis=1).fillna(0,axis=1)

In [None]:
#similarity Matrix
item_similarity_df = userRatings.corr(method='pearson') #in-build method of dataframe of standardizing rating given by all user 
item_similarity_df.head(100)

Another method of standardizing (instead of using the in-build pearson correlation function)

```
def standardize(row):
  new_row = (row - row.mean())/(row.max() - row.min())
  return new_row

ratings_std = ratings.apply(standardize)
print(ratings_std)

item_similarity = cosine_similarity(ratings_std.T)
print(item_similarity)
```



What is the purpose of standardizing?

To correct any user rating that are to harsh or to lenient 
for example,

We have a user's rating values of [1,5,0]

```
def standardize(row):

  #mean = (1+5+0)/3 = 2
  new_row = (row - row.mean())/(row.max() - row.min())

  #row - row.mean() = [(1,-2), (5,-2), (0,-2)] = [-1,3,-2]
  #new meane = 0

  #row.max() - row.min() = 5
  #new_row = [-1/5, 3/5 , -2/5]
  #new range = 3/5 - (-2/5) = 1

  return new_row
```



In [None]:
#Find the similarity between movies
def get_similar(movie_name,rating):
    similar_ratings = item_similarity_df[movie_name]*(rating-2.5)
    #Scale it based on the rating the user gave
    similar_ratings = similar_ratings.sort_values(ascending=False)
    #print(type(similar_ratings))
    return similar_ratings

In [None]:
#Make a recommendation
#sample input for Cally to insert = [("Amazing Spider-Man, The (2012)",5),("Mission: Impossible III (2006)",4),("Toy Story 3 (2010)",2),("2 Fast 2 Furious (Fast and the Furious 2, The) (2003)",4)]
print("NOTE: input format, please enter movie name then the rating(out of 5)")
movie_lst = [ ] 
n = int(input("Enter number of movies you want to rate : ")) 
  
for i in range(0, n): 
    ele = [input(), int(input())] 
    movie_lst.append(ele) 

similar_movies = pd.DataFrame()
for movie,rating in movie_lst:
    similar_movies = similar_movies.append(get_similar(movie,rating),ignore_index = True)

similar_movies.head(10)
print("Here are a list of 20 movies reccomended for you \n")
print(similar_movies.sum().sort_values(ascending=False).head(20))

NOTE: input format, please enter movie name then the rating(out of 5)


KeyboardInterrupt: ignored

# User-based Recommendation (K-Nearest Neighbours)

In [184]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix # convert into array matrix
from sklearn.neighbors import NearestNeighbors # unsupervised learning

# create dataframes from csv files
movies_df = pd.read_csv('movies.csv', usecols=['movieId','title'], dtype={'movieId': 'int32', 'title': 'str'})
rating_df = pd.read_csv('user_ratings.csv', usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

# print("Movies Dataframe:\n", movies_df.head(10))
# print("User Rating Dataframe:\n", rating_df.head(10))

# Merge two of the dataframes on movieId as this is a common feature
knnDf = pd.merge(rating_df, movies_df, on='movieId')

movie_rated_df = knnDf.dropna(axis=0, subset=['title'])
# select title as the set that we want
# count the amount of ratings done of each movie, reset index so that title is not the index
# This ensures that if let's say a movie has only 1 user who rates it a 5 stars, doesn't skew the data
movie_rated_count = (movie_rated_df.groupby(by=['title'])["rating"]).count().reset_index().rename(columns={'rating':'ratingCount'})[['title', 'ratingCount']]


rateTotalMovieCount = movie_rated_df.merge(movie_rated_count, left_on = 'title', right_on = 'title', how = 'left')

# Create pivot matrix
threshold = 30
rate_popular_movie = rateTotalMovieCount.query('ratingCount >= @threshold')
movie_features_pivot = rate_popular_movie.pivot_table(index ='title', columns='userId', values='rating').fillna(0)

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
movie_features_df_matrix = csr_matrix(movie_features_pivot.values)
model_knn.fit(movie_features_df_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine', metric_params=None, n_jobs=None, n_neighbors=4, p=1.5, radius=1.0)

user_movie = input("Please enter a movie: ")
index = movie_features_pivot.index.get_loc(user_movie)
distances, indices = model_knn.kneighbors(movie_features_pivot.iloc[index, :].values.reshape(1, -1), n_neighbors = 6)


for i in range(0, len(distances.flatten())):
    if i == 0:
        print("Recommendations for {0}:\n".format(movie_features_pivot.index[index]))
    else:
        print("{0}: {1}, with distance of {2}:".format(i, movie_features_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

Please enter a movie: Jumanji (1995)
Recommendations for Jumanji (1995):

1: Lion King, The (1994), with distance of 0.411562442779541:
2: Mrs. Doubtfire (1993), with distance of 0.45018184185028076:
3: Mask, The (1994), with distance of 0.45501887798309326:
4: Jurassic Park (1993), with distance of 0.4619544744491577:
5: Home Alone (1990), with distance of 0.47512364387512207:
