<a href="https://colab.research.google.com/github/safakan/CCTB_repo_safak_atakan_celik/blob/main/DEA113/NOTES_Build_Recommendation_Systems_for_Movies_Like_Netflix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building Recommendation Systems

## Libraries

In [1]:
%pip install tqdm==4.66.4  | tail -n 1
%pip install pandas==2.1.4  | tail -n 1
%pip install scikit-learn==1.5.1  | tail -n 1

Successfully installed tqdm-4.66.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.1.4 which is incompatible.
plotnine 0.14.5 requires pandas>=2.2.0, but you have pandas 2.1.4 which is incompatible.
mizani 0.13.1 requires pandas>=2.2.0, but you have pandas 2.1.4 which is incompatible.[0m[31m
[0mSuccessfully installed pandas-2.1.4
Successfully installed scikit-learn-1.5.1


In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import statistics
import kagglehub

# just for practice sake
def warn(*args, **kwargs):
    pass

import warnings

warnings.warn = warn
warnings.filterwarnings('ignore')


## Extract: Movie Lens Small Latest Dataset

In [3]:
# import kagglehub
# Download latest version
path = kagglehub.dataset_download("shubhammehta21/movie-lens-small-latest-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shubhammehta21/movie-lens-small-latest-dataset?dataset_version_number=1...


100%|██████████| 971k/971k [00:00<00:00, 40.9MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/shubhammehta21/movie-lens-small-latest-dataset/versions/1





In [4]:
df_movies = pd.read_csv(path + "/movies.csv")
df_ratings = pd.read_csv(path + "/ratings.csv")
df_tags = pd.read_csv(path + "/tags.csv")

In [5]:
df_movies.sample(5)

Unnamed: 0,movieId,title,genres
5998,37444,Frankenstein 90 (1984),Comedy|Horror|Romance
7292,75805,"Bounty Hunter, The (2010)",Action|Comedy|Romance
6456,52299,American Hardcore (2006),Documentary
5515,26498,Boy Meets Girl (1984),Drama
2253,2990,Licence to Kill (1989),Action|Adventure|Thriller


In [6]:
df_ratings.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
58317,381,4226,4.5,1168800601
62612,414,829,2.0,961439584
45124,298,55292,0.5,1479064933
54199,357,919,4.5,1348610304
62220,412,2712,5.0,939113740


In [7]:
df_tags.sample(5)

Unnamed: 0,userId,movieId,tag,timestamp
2267,474,7234,circus,1137206103
951,424,104879,morality,1457846368
3019,567,4878,psychological,1525282595
3523,599,296,random,1498456503
2839,537,79132,psychology,1424140162


## Check, Transform, PreProcess dataset

Merging all three dataframes into one dataframe to be used: movies, ratings, tags

In [8]:
df_movies_n_ratings = df_movies.merge(df_ratings, on="movieId", how="inner")
df_movies_n_ratings_n_tags = df_movies_n_ratings.merge(df_tags, on=["movieId", "userId"], how="inner")
df = df_movies_n_ratings_n_tags
df

Unnamed: 0,movieId,title,genres,userId,rating,timestamp_x,tag,timestamp_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,1122227329,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,978575760,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,1525286001,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,1528843890,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,1528843890,magic board game,1528843932
...,...,...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,1528934550,star wars,1528934552
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,1537098554,anime,1537098582
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,1537098554,comedy,1537098587
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,1537098554,gintama,1537098603


Exploratory Data Analysis (EDA)

In [9]:
# shape: row and column count
df.shape

(3476, 8)

In [10]:
# data types
df.dtypes

Unnamed: 0,0
movieId,int64
title,object
genres,object
userId,int64
rating,float64
timestamp_x,int64
tag,object
timestamp_y,int64


In [11]:
# missing values
df.isnull().sum()

Unnamed: 0,0
movieId,0
title,0
genres,0
userId,0
rating,0
timestamp_x,0
tag,0
timestamp_y,0


Dropping the timestamp columns because they're irrelevant in this case

In [12]:
df.drop(columns=["timestamp_x", "timestamp_y"], inplace=True)
df

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game
...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,star wars
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,anime
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,comedy
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,gintama


In [13]:
df.shape

(3476, 6)

## Recommendation Systems

### Popularity-based recommendation

In [14]:
# copy the dataframe for this specific system's use case
df_popularity = df

In [15]:
df_popularity

Unnamed: 0,movieId,title,genres,userId,rating,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game
...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,star wars
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,anime
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,comedy
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,gintama


Number of votes (numVotes) Feature

In [16]:
# calculate and store
num_votes = df_popularity.groupby("movieId").size().reset_index(name="numVotes")


# merge
df_popularity = pd.merge(df_popularity, num_votes, on="movieId")
df_popularity

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar,3
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun,3
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game,4
...,...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,star wars,2
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,anime,4
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,comedy,4
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,gintama,4


Average Rating (avgRating) Feature

In [17]:
# calculate and store
avg_rating = df_popularity.groupby("movieId")["rating"].mean().reset_index(name="avgRating")

# merge
df_popularity = pd.merge(df_popularity, avg_rating, on="movieId")
df_popularity

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes,avgRating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3,3.833333
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,4.0,pixar,3,3.833333
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,3.5,fun,3,3.833333
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4,3.750000
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,magic board game,4,3.750000
...,...,...,...,...,...,...,...,...
3471,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,star wars,2,4.000000
3472,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,anime,4,3.500000
3473,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,comedy,4,3.500000
3474,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184,3.5,gintama,4,3.500000


Cleaning by removing unnecessary columns and rows

In [23]:
# df_popularity.drop(columns=["genres", "userId", "rating", "tag"], inplace=True)
df_popularity.drop_duplicates(subset = ['movieId', 'title', 'avgRating', 'numVotes'], inplace=True)
df_popularity

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes,avgRating,weighted_score
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3,3.833333,3.987909
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4,3.750000,3.963556
7,3,Grumpier Old Men (1995),Comedy|Romance,289,2.5,moldy,2,2.500000,3.848616
9,5,Father of the Bride Part II (1995),Comedy,474,1.5,pregnancy,2,1.500000,3.737717
11,7,Sabrina (1995),Comedy|Romance,474,3.0,remake,1,3.000000,3.957137
...,...,...,...,...,...,...,...,...,...
3461,183611,Game Night (2018),Action|Comedy|Crime|Horror,62,4.0,Comedy,3,4.000000,4.014177
3464,184471,Tomb Raider (2018),Action|Adventure|Fantasy,62,3.5,adventure,3,3.500000,3.935373
3467,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,62,4.0,Josh Brolin,3,4.000000,4.014177
3470,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,Emilia Clarke,2,4.000000,4.014963


Calculating Weighted Scores

In [24]:
import statistics

def calculate_weighted_score(avg_rating, num_votes, C, m):
  return (num_votes * avg_rating + m * C) / (num_votes + m)

# Calculating the global average rating (C)
average_avgRating = statistics.mean(df_popularity["avgRating"])

# Calculating the average number of votes (m) - threshold
average_numVotes = statistics.mean(df_popularity["numVotes"])

# Calculating and storing weighted scores
df_popularity["weighted_score"] = df_popularity.apply(lambda row: calculate_weighted_score(row["avgRating"], row["numVotes"], average_avgRating, average_numVotes), axis=1)
df_popularity

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes,avgRating,weighted_score
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,4.0,pixar,3,3.833333,3.788714
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,4.0,fantasy,4,3.750000,3.743421
7,3,Grumpier Old Men (1995),Comedy|Romance,289,2.5,moldy,2,2.500000,3.168895
9,5,Father of the Bride Part II (1995),Comedy,474,1.5,pregnancy,2,1.500000,2.711680
11,7,Sabrina (1995),Comedy|Romance,474,3.0,remake,1,3.000000,3.515304
...,...,...,...,...,...,...,...,...,...
3461,183611,Game Night (2018),Action|Comedy|Crime|Horror,62,4.0,Comedy,3,4.000000,3.881749
3464,184471,Tomb Raider (2018),Action|Adventure|Fantasy,62,3.5,adventure,3,3.500000,3.602644
3467,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,62,4.0,Josh Brolin,3,4.000000,3.881749
3470,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,4.0,Emilia Clarke,2,4.000000,3.854716


Exercise 1 - Get the top 5 suggestions sorting by score in descending order

In [25]:
df_popularity.sort_values(by="weighted_score", ascending=False).head(5)

Unnamed: 0,movieId,title,genres,userId,rating,tag,numVotes,avgRating,weighted_score
199,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,103,5.0,good dialogue,181,4.983425,4.967226
1337,2959,Fight Club (1999),Action|Crime|Drama|Thriller,424,4.5,dark comedy,54,4.944444,4.893394
604,924,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,474,4.0,Hal,41,4.95122,4.884498
998,1732,"Big Lebowski, The (1998)",Comedy|Crime,474,3.5,Coen Brothers,32,4.953125,4.868802
164,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,166,4.5,assassin,35,4.928571,4.852577


### Content Based Recommendation

In [29]:
df_content = df_popularity.copy()
df_content.drop(columns=["rating"], inplace=True)

In [30]:
df_content

Unnamed: 0,movieId,title,genres,userId,tag,numVotes,avgRating,weighted_score
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,3,3.833333,3.788714
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,4,3.750000,3.743421
7,3,Grumpier Old Men (1995),Comedy|Romance,289,moldy,2,2.500000,3.168895
9,5,Father of the Bride Part II (1995),Comedy,474,pregnancy,2,1.500000,2.711680
11,7,Sabrina (1995),Comedy|Romance,474,remake,1,3.000000,3.515304
...,...,...,...,...,...,...,...,...
3461,183611,Game Night (2018),Action|Comedy|Crime|Horror,62,Comedy,3,4.000000,3.881749
3464,184471,Tomb Raider (2018),Action|Adventure|Fantasy,62,adventure,3,3.500000,3.602644
3467,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,62,Josh Brolin,3,4.000000,3.881749
3470,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,Emilia Clarke,2,4.000000,3.854716


**Creating user documents/features**

In [32]:
# creating a list of terms based on genres and tags
df_content["features"] = df_content["genres"].str.replace("|", " ") + " " + df_content["tag"].fillna("")
df_content

Unnamed: 0,movieId,title,genres,userId,tag,numVotes,avgRating,weighted_score,features
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,3,3.833333,3.788714,Adventure Animation Children Comedy Fantasy pixar
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,4,3.750000,3.743421,Adventure Children Fantasy fantasy
7,3,Grumpier Old Men (1995),Comedy|Romance,289,moldy,2,2.500000,3.168895,Comedy Romance moldy
9,5,Father of the Bride Part II (1995),Comedy,474,pregnancy,2,1.500000,2.711680,Comedy pregnancy
11,7,Sabrina (1995),Comedy|Romance,474,remake,1,3.000000,3.515304,Comedy Romance remake
...,...,...,...,...,...,...,...,...,...
3461,183611,Game Night (2018),Action|Comedy|Crime|Horror,62,Comedy,3,4.000000,3.881749,Action Comedy Crime Horror Comedy
3464,184471,Tomb Raider (2018),Action|Adventure|Fantasy,62,adventure,3,3.500000,3.602644,Action Adventure Fantasy adventure
3467,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,62,Josh Brolin,3,4.000000,3.881749,Action Comedy Sci-Fi Josh Brolin
3470,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62,Emilia Clarke,2,4.000000,3.854716,Action Adventure Children Sci-Fi Emilia Clarke


**Vectorizing features with TF-IDF method**

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')

# Creating TF-IDF vectors by fitting and transforming features
X = vectorizer.fit_transform(df_content["features"])
X

<1464x852 sparse matrix of type '<class 'numpy.float64'>'
	with 5587 stored elements in Compressed Sparse Row format>

**Defining a recommendation function**


In [85]:
# Recommendation function
def recommendation(title_entered, df, similarity_matrix, top_n=3):
  try:
    # getting the index of the movie that matches the title
    idx = df[df["title"] == title_entered].index[0]
  except IndexError:
    print(f"Movie: {title_entered}, not found in the dataset.")

  # get the similarity scores for the index | row in the similarity matrix for the index
  similarity_scores_for_the_title_entered = list(enumerate(similarity_matrix[idx]))

  # order with values
  sorted_similarity_scores_for_the_title_entered = sorted(similarity_scores_for_the_title_entered, key=lambda x: x[1], reverse=True)

  # print the top n similar movies (excluding itself)
  print(f"Top {top_n} movies similar to {title_entered}")
  for i, (index, score) in enumerate(sorted_similarity_scores_for_the_title_entered[1: top_n + 1]):
    movie = df.iloc[index]
    print(f"{i}. {movie['title']} (Similarity Score: {score:.3f})")
    print(f"   Genres: {movie['genres']}")
    print(f"   Tag: {movie['tag']}\n")

**Using the recommendation function**

In [86]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(X)

recommendation("Toy Story (1995)", df_content, similarity_matrix)

Top 3 movies similar to Toy Story (1995)
0. Bug's Life, A (1998) (Similarity Score: 0.939)
   Genres: Adventure|Animation|Children|Comedy
   Tag: Pixar

1. Toy Story 2 (1999) (Similarity Score: 0.675)
   Genres: Adventure|Animation|Children|Comedy|Fantasy
   Tag: animation

2. Sintel (2010) (Similarity Score: 0.583)
   Genres: Animation|Fantasy
   Tag: adventure



### Collaborative Filtering

In [94]:
# pivot table of user's ratings on movies, empty values filled with 0
user_rating_matrix = df_ratings.pivot(index="movieId", columns="userId", values="rating").fillna(0)
user_rating_matrix.head(5)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


Initializing a Nearest Neighbour model by fitting with user_rating_matrix and using cosine similarity as the metrix for the calculations

In [95]:
from sklearn.neighbors import NearestNeighbors

recommender = NearestNeighbors(metric="cosine")
recommender.fit(user_rating_matrix)

Defining a function to recommend 5 movies based on a movie watched/entered

In [146]:
def get_collaborative_recommendation(title, top_n):
  # get movie details
  movie =  df_content[df_content["title"] == title]

  if movie.empty:
    print(f"Can not find the {title} in the dataset.")
    return None

  movie_id = int(movie["movieId"])

  # get the index of the movie in the user rating matrix generated above
  try:
    movie_index = user_rating_matrix.index.get_loc(movie_id)
  except KeyError:
    print(f"Movie ID {movie_id} not found in the user rating matrix")
    return None

  # get the user ratings for the movie
  user_ratings_for_the_movie = user_rating_matrix.iloc[movie_index]

  # isolate the ratings/values and reshape to be a single sample (1, -1), 2D array, to fit expectations of the model input
  reshaped_user_ratings_for_the_movie = user_ratings_for_the_movie.values.reshape(1, -1)

  # find the nearest neighbors (movies with similar ratings from all users)
  distances, indexes = recommender.kneighbors(reshaped_user_ratings_for_the_movie, n_neighbors=top_n + 1)

  # getting the movie indexes of the nearest neighbors (excluding the first which is the entered movie itself)
  nearest_movie_indexes = user_rating_matrix.iloc[indexes[0]].index[1:]

  # details of the nearest neighbors
  nearest_neighbors = pd.DataFrame({"movieId": nearest_movie_indexes})
  result = pd.merge(nearest_neighbors, df_content, on="movieId", how="left").head(top_n + 1)

  # return the results
  return result

**Using the function to get collaborative filtering recommendations**

In [148]:
get_collaborative_recommendation("Toy Story (1995)", 5)

Unnamed: 0,movieId,title,genres,userId,tag,numVotes,avgRating,weighted_score,features
0,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,62,animation,8,3.125,3.263998,Adventure Animation Children Comedy Fantasy an...
1,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,474,Dinosaur,1,4.5,3.959838,Action Adventure Sci-Fi Thriller Dinosaur
2,780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller,474,aliens,1,4.0,3.81166,Action Adventure Sci-Fi Thriller aliens
3,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,63,classic,18,4.527778,4.435081,Action Adventure Sci-Fi classic
4,356,Forrest Gump (1994),Comedy|Drama|Romance|War,474,shrimp,9,3.666667,3.680375,Comedy Drama Romance War shrimp


In [149]:
get_collaborative_recommendation("Jumanji (1995)", 5)

Unnamed: 0,movieId,title,genres,userId,tag,numVotes,avgRating,weighted_score,features
0,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,319.0,Disney,5.0,4.8,4.456243,Adventure Animation Children Drama Musical IMA...
1,500,Mrs. Doubtfire (1993),Comedy|Drama,474.0,cross dressing,3.0,2.5,3.044433,Comedy Drama cross dressing
2,367,,,,,,,,
3,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,474.0,Dinosaur,1.0,4.5,3.959838,Action Adventure Sci-Fi Thriller Dinosaur
4,586,Home Alone (1990),Children|Comedy,474.0,christmas,1.0,2.0,3.218948,Children Comedy christmas


---

some notes

In [73]:
# X: rows movies, columns terms, values vectors
# cosine_similarity(X): pairwise comparison matrix, rows movies, columns movies, values similarity scores
# X.shape
# similarity[0][1] # Toy Story (1995) COMPARED_TO Jumanji (1995)
# list(enumerate(similarity[0])) # enumerated list of similarity scores for the 0th movie against all movies
# list(enumerate(similarity[0]))[0][1] # gives the 0th movie's similarity score against the 0th movie which is 1

# scores = list(enumerate(similarity[0]))
# sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
# sorted_scores
# list(enumerate(sorted_scores))
    # [
    # (0, (0, 1.0)),
    # (1, (517, 0.9387351583006996)), ... ]





# user_rating_matrix.index
# user_rating_matrix.index.get_loc(1)
# test_user_ratings = user_rating_matrix.iloc[0]
# test_user_ratings
# test_user_ratings.values.reshape(1, -1)
# reshaped_df = test_user_ratings.values.reshape(1, -1)
# reshaped_df
# distances, indexes = recommender.kneighbors(reshaped_df, n_neighbors=15)
# (array([[3.33066907e-16, 4.27398740e-01, 4.34363196e-01, 4.35738306e-01,
#          4.42611829e-01, 4.52904092e-01, 4.58854651e-01, 4.58910695e-01,
#          4.61087229e-01, 4.65831237e-01, 4.69618653e-01, 4.72023202e-01,
#          4.72140834e-01, 4.79675476e-01, 4.81967257e-01]]),
#  array([[   0, 2353,  418,  615,  224,  314,  322,  910,  546,  963,  968,
#          3189,  506,  123,  257]]))

# 3.33066907e-16 considered as 0
# 0.000000000000000333066907
# user_rating_matrix.iloc[indexes[0]] # gets user ratings for all neigbors indexes
# nearest_movie_indexes = user_rating_matrix.iloc[indexes[0]].index[1:] # isolates the indexes except the 1st which is itself

# pd.DataFrame({"movieId": nearest_movie_indexes})




[(0, (0, 1.0)),
 (1, (517, 0.9387351583006996)),
 (2, (628, 0.6748152306916122)),
 (3, (1342, 0.5833168262375243)),
 (4, (1326, 0.549767262738691)),
 (5, (1, 0.5417866490111316)),
 (6, (1301, 0.5417866490111316)),
 (7, (485, 0.5377552811818821)),
 (8, (1187, 0.5151744406978072)),
 (9, (1180, 0.515073054918088)),
 (10, (236, 0.504471992351387)),
 (11, (969, 0.5041423569944117)),
 (12, (608, 0.5005947011042654)),
 (13, (954, 0.487508724995261)),
 (14, (1372, 0.4873296211098421)),
 (15, (1390, 0.480168872177245)),
 (16, (480, 0.47998296797550716)),
 (17, (585, 0.4750745154943044)),
 (18, (1165, 0.46100595535440725)),
 (19, (1186, 0.4546615874334242)),
 (20, (950, 0.45110843722812555)),
 (21, (474, 0.4509160266023732)),
 (22, (888, 0.4481877717897356)),
 (23, (132, 0.44137183565088334)),
 (24, (938, 0.44051459281235233)),
 (25, (139, 0.4379952261203795)),
 (26, (232, 0.4379952261203795)),
 (27, (332, 0.4379952261203795)),
 (28, (137, 0.4315165673786807)),
 (29, (1182, 0.42590107379366315))