In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load movies and ratings data
movies = pd.read_csv('Predictive model data/movies_title_reformatted.csv')
ratings = pd.read_csv('Predictive model data/ratings.csv')

In [19]:

# Pivot ratings data to create a movies-user matrix
moviesdb = ratings.pivot(index='movieId', columns='userId', values='rating')
moviesdb.fillna(0, inplace=True)

# Filter movies and users based on a threshold
nouser = ratings.groupby('movieId')['rating'].agg('count')
nomovies = ratings.groupby('userId')['rating'].agg('count')
moviesdb = moviesdb.loc[:, nomovies[nomovies > 50].index]
moviesdb = moviesdb.loc[nouser[nouser > 10].index]

# Convert the index to strings
moviesdb.index = moviesdb.index.astype(str)

# Create a sparse matrix
csr_data = csr_matrix(moviesdb.values)

# Reset the index
moviesdb.reset_index(inplace=True)

# Initialize KNN model
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(csr_data)

# Separate the y variable, the labels
y = ratings['rating']

# Separate the X variable, the features
X = ratings.drop(columns=['rating'])

# Perform feature scaling (standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)

# Initialize KNN regressor
knn_regressor = KNeighborsRegressor(n_neighbors=10)

# Fit the model to the training data
knn_regressor.fit(X_train, y_train)

# Make predictions on the test data
predictions = knn_regressor.predict(X_test)

# Calculate and print mean squared error
mse = mean_squared_error(predictions, y_test)
rsq_score = r2_score(predictions, y_test)
# acc = accuracy_score(predictions, y_test)
print(f"Mean Squared Error: {mse}")
print(f"R-squared score: {rsq_score}")

Mean Squared Error: 0.9285050973858542
R-squared score: 0.15201600362427115


In [3]:

def getrecs(movie):
#def getrecs(movieId):
    moviestorec = 10

    # Find the movieId of the given movie title
    mask = movies['title_reformatted'].str.upper() == movie.upper()
    movie_df = movies.loc[mask, 'title_reformatted']

    if movie_df.empty:
        print("Please check the spelling of the movie title or the movie may not be in our database :(")
        return

    movie_id = movies.loc[mask, 'movieId'].values
    movie_name = str(movie_df.values[0])

    if len(movie_id) > 0:
        movie_id = movie_id[0]

        # Check if the movieId exists in the moviesdb DataFrame
        if str(movie_id) in moviesdb['movieId'].values:
            movieindex = moviesdb[moviesdb['movieId'] == str(movie_id)].index
            distances, indices = knn.kneighbors(csr_data[movieindex], n_neighbors=moviestorec + 1)
            recmovie = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
            recframe = []

            for val in recmovie:
                movieindex = moviesdb['movieId'].iloc[val[0]]
                released_year = int(movies[movies['movieId'] == int(movieindex)]['released_year'].values[0])
                recframe.append({
                    'Title': movies[movies['movieId'] == int(movieindex)]['title_reformatted'].values[0],
                    'Released Year': released_year,
                    'Distance': val[1]
                })
            rec_df = pd.DataFrame(recframe, index=range(1, moviestorec + 1))
            rec_df.sort_values(by='Distance', inplace=True)
            rec_df.reset_index(drop=True, inplace=True)
            print(f"If you enjoyed {movie_name} ({released_year}), here are the top 10 movies we think you'll also enjoy!")
            return rec_df
            #return rec_df[["Title", "Released Year"]]
        else:
            return print('There are not enough ratings for this movie.')
    else:
        return print('You get nothing you lose. Good day Sir!')

In [4]:
# Example usage of getrecs
getrecs('Rise of the Planet of the Ape')

Please check the spelling of the movie title or the movie may not be in our database :(


In [5]:
getrecs('Rise of the Planet of the Apes')

If you enjoyed Rise of the Planet of the Apes (2014), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,Dawn of the Planet of the Apes,2014,0.305179
1,Ted,2012,0.39285
2,Mad Max: Fury Road,2015,0.409615
3,Thor,2011,0.427129
4,Star Trek Into Darkness,2013,0.427842
5,X-Men: First Class,2011,0.452307
6,Jurassic World,2015,0.455368
7,The Cabin in the Woods,2012,0.468253
8,The Amazing Spider-Man,2012,0.485955
9,The A-Team,2010,0.486012


In [6]:
getrecs('IRON man')

If you enjoyed Iron Man (2012), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,The Avengers,2012,0.285319
1,The Dark Knight,2008,0.285835
2,WALL-E,2008,0.298138
3,Iron Man 2,2010,0.307492
4,Avatar,2009,0.310893
5,Batman Begins,2005,0.362759
6,Star Trek,2009,0.366029
7,Watchmen,2009,0.368558
8,Guardians of the Galaxy,2014,0.368758
9,Up,2009,0.368857


In [7]:
getrecs('inCeption')

If you enjoyed Inception (2008), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,The Dark Knight,2008,0.213876
1,Inglourious Basterds,2009,0.305288
2,The Dark Knight Rises,2012,0.335075
3,The Avengers,2012,0.340302
4,Shutter Island,2010,0.345888
5,Django Unchained,2012,0.362976
6,Sherlock Holmes,2009,0.366418
7,Fight Club,1999,0.367898
8,Iron Man,2008,0.369175
9,The Hangover,2009,0.369214


In [8]:
getrecs('Kingsman: The Secret Service')

If you enjoyed Kingsman: The Secret Service (2015), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,Mad Max: Fury Road,2015,0.316481
1,Ted,2012,0.357116
2,Deadpool,2016,0.375957
3,Ant-Man,2015,0.377068
4,The Other Guys,2010,0.419232
5,The Man from U.N.C.L.E.,2015,0.42348
6,John Wick,2014,0.42585
7,Guardians of the Galaxy,2014,0.428111
8,Jurassic World,2015,0.434833
9,Big Hero 6,2014,0.436315


In [9]:
getrecs('Harry Potter and the Chamber of Secrets')

If you enjoyed Harry Potter and the Chamber of Secrets (2001), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,Harry Potter and the Sorcerer's Stone (a.k.a. ...,2001,0.196221
1,Harry Potter and the Prisoner of Azkaban,2004,0.208909
2,Harry Potter and the Goblet of Fire,2005,0.265915
3,Harry Potter and the Order of the Phoenix,2007,0.346729
4,Pirates of the Caribbean: Dead Man's Chest,2006,0.349314
5,Pirates of the Caribbean: The Curse of the Bla...,2003,0.367197
6,The Lord of the Rings: The Two Towers,2002,0.391141
7,Harry Potter and the Half-Blood Prince,2009,0.394569
8,Ice Age,2002,0.397131
9,Spider-Man,2002,0.398373


In [10]:
getrecs('Up')

If you enjoyed Up (2008), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,WALL-E,2008,0.259324
1,Avatar,2009,0.289607
2,Sherlock Holmes,2009,0.357785
3,Toy Story 3,2010,0.358239
4,Iron Man,2008,0.368857
5,Inception,2010,0.370812
6,The Dark Knight,2008,0.374999
7,The Avengers,2012,0.399443
8,The Hangover,2009,0.401675
9,The Dark Knight Rises,2012,0.404911


In [11]:
getrecs('Avatar')

If you enjoyed Avatar (2009), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,Up,2009,0.289607
1,WALL-E,2008,0.306969
2,District 9,2009,0.309947
3,Iron Man,2008,0.310893
4,Kung Fu Panda,2008,0.358604
5,The Dark Knight,2008,0.358937
6,The Hangover,2009,0.36419
7,I Am Legend,2007,0.389856
8,Inception,2010,0.393521
9,Zombieland,2009,0.39818


In [12]:
getrecs('Despicable Me')

If you enjoyed Despicable Me (2008), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,Kung Fu Panda,2008,0.369476
1,How to Train Your Dragon,2010,0.39295
2,Harry Potter and the Half-Blood Prince,2009,0.4216
3,Megamind,2010,0.431181
4,Iron Man 2,2010,0.440971
5,The Amazing Spider-Man,2012,0.444662
6,Star Wars: Episode VII - The Force Awakens,2015,0.447232
7,Guardians of the Galaxy,2014,0.447753
8,Harry Potter and the Deathly Hallows: Part 2,2011,0.451612
9,The Avengers,2012,0.45607


**Remove duplicates in dataset so that it doesn't return an unexpected result? Getrecs('The Avengers') returns a movie from 1998.**

In [13]:
getrecs('the avengers')

If you enjoyed The Avengers (2014), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,Guardians of the Galaxy,2014,0.241876
1,Iron Man,2008,0.285319
2,The Dark Knight Rises,2012,0.293238
3,Iron Man 2,2010,0.304975
4,X-Men: First Class,2011,0.32161
5,X-Men: Days of Future Past,2014,0.332192
6,Inception,2010,0.340302
7,Captain America: The Winter Soldier,2014,0.357826
8,Star Wars: Episode VII - The Force Awakens,2015,0.364373
9,Edge of Tomorrow,2014,0.371589


In [14]:
getrecs('Django Unchained')

If you enjoyed Django Unchained (2009), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,Inglourious Basterds,2009,0.312611
1,Interstellar,2014,0.326071
2,The Dark Knight Rises,2012,0.353634
3,The Grand Budapest Hotel,2014,0.361227
4,Mad Max: Fury Road,2015,0.361928
5,Inception,2010,0.362976
6,The Hangover,2009,0.407104
7,Up,2009,0.411537
8,The Revenant,2015,0.421573
9,Looper,2012,0.425359


In [15]:
getrecs('Moana')

There are not enough ratings for this movie.


In [16]:
getrecs('Batman')

If you enjoyed Batman (1994), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,True Lies,1994,0.300079
1,Batman Forever,1995,0.305549
2,Terminator 2: Judgment Day,1991,0.310221
3,Jurassic Park,1993,0.329667
4,The Fugitive,1993,0.349344
5,The Mask,1994,0.366181
6,Braveheart,1995,0.387096
7,Speed,1994,0.388161
8,Die Hard: With a Vengeance,1995,0.390254
9,Aladdin,1992,0.392825


**Scrap Work - working previous version**

In [17]:
# def getrecs(movie):
#     moviestorec = 10

#     # Find the movieId of the given movie title
#     mask = movies['title_reformatted'].str.upper() == movie.upper()
#     movie_df = movies.loc[mask, 'title_reformatted']

#     if movie_df.empty:
#         print("Please check the spelling of the movie title or the movie may not be in our database :(")
#         return

#     movie_id = movies.loc[mask, 'movieId'].values
#     movie_name = str(movie_df.values[0])  # Convert to string

#     if len(movie_id) > 0:
#         movie_id = movie_id[0]

#         # Check if the movieId exists in the moviesdb DataFrame
#         if str(movie_id) in moviesdb['movieId'].values:
#             movieindex = moviesdb[moviesdb['movieId'] == str(movie_id)].index
#             distances, indices = knn.kneighbors(csr_data[movieindex], n_neighbors=moviestorec + 1)
#             recmovie = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
#             recframe = []

#             for val in recmovie:
#                 movieindex = moviesdb['movieId'].iloc[val[0]]
#                 released_year = int(movies[movies['movieId'] == int(movieindex)]['released_year'].values[0])
#                 recframe.append({
#                     'Title': movies[movies['movieId'] == int(movieindex)]['title_reformatted'].values[0],
#                     'Released Year': released_year,
#                     'Distance': val[1]
#                 })
#             df = pd.DataFrame(recframe, index=range(1, moviestorec + 1))
#             print(f"If you enjoyed {movie_name} ({released_year}), here are the top 10 movies we think you'll also enjoy!")
#             return df
#         else:
#             return print('There are not enough ratings for this movie.')
#     else:
#         return print('You get nothing you lose. Good day Sir!')

# # Example usage of getrecs
# getrecs('The other Guy')
