In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split  # Import the missing module

# Load movies and ratings data
movies = pd.read_csv('Predictive model data/movies_title_reformatted.csv')
ratings = pd.read_csv('Predictive model data/ratings.csv')

**Check with Beatrice on prediction model**

In [3]:

# Pivot ratings data to create a movies-user matrix
moviesdb = ratings.pivot(index='movieId', columns='userId', values='rating')
moviesdb.fillna(0, inplace=True)
# Filter movies and users based on a threshold
nouser = ratings.groupby('movieId')['rating'].agg('count')
nomovies = ratings.groupby('userId')['rating'].agg('count')
moviesdb = moviesdb.loc[:, nomovies[nomovies > 50].index]
moviesdb = moviesdb.loc[nouser[nouser > 10].index]

# Convert the index to strings
moviesdb.index = moviesdb.index.astype(str)

# Create a sparse matrix
csr_data = csr_matrix(moviesdb.values)

# Reset the index
moviesdb.reset_index(inplace=True)

# Initialize KNN model
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
knn.fit(csr_data)

# Separate the y variable, the labels
y = ratings['rating']

# Separate the X variable, the features
X = ratings.drop(columns=['rating'])

# Perform feature scaling (standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)

# Initialize KNN regressor
knn_regressor = KNeighborsRegressor(n_neighbors=10)

# Fit the model to the training data
knn_regressor.fit(X_train, y_train)

# Make predictions on the test data
predictions = knn_regressor.predict(X_test)

# Calculate and print mean squared error
mse = mean_squared_error(predictions, y_test)
print(f"Mean Squared Error (Accuracy Score): {mse}")

Mean Squared Error (Accuracy Score): 0.9285050973858542


In [4]:

def getrecs(movie):
    moviestorec = 10

    # Find the movieId of the given movie title
    mask = movies['title_reformatted'].str.upper() == movie.upper()
    movie_df = movies.loc[mask, 'title_reformatted']

    if movie_df.empty:
        print("Please check the spelling of the movie title or the movie may not be in our database :(")
        return

    movie_id = movies.loc[mask, 'movieId'].values
    movie_name = str(movie_df.values[0])  # Convert to string

    if len(movie_id) > 0:
        movie_id = movie_id[0]

        # Check if the movieId exists in the moviesdb DataFrame
        if str(movie_id) in moviesdb['movieId'].values:
            movieindex = moviesdb[moviesdb['movieId'] == str(movie_id)].index
            distances, indices = knn.kneighbors(csr_data[movieindex], n_neighbors=moviestorec + 1)
            recmovie = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
            recframe = []

            for val in recmovie:
                movieindex = moviesdb['movieId'].iloc[val[0]]
                released_year = int(movies[movies['movieId'] == int(movieindex)]['released_year'].values[0])
                recframe.append({
                    'Title': movies[movies['movieId'] == int(movieindex)]['title_reformatted'].values[0],
                    'Released Year': released_year,
                    'Distance': val[1]
                })
            rec_df = pd.DataFrame(recframe, index=range(1, moviestorec + 1))
            rec_df.sort_values(by='Distance', inplace=True)
            rec_df.reset_index(drop=True, inplace=True)
            print(f"If you enjoyed {movie_name} ({released_year}), here are the top 10 movies we think you'll also enjoy!")
            return rec_df
        else:
            return print('There are not enough ratings for this movie.')
    else:
        return print('You get nothing you lose. Good day Sir!')

In [5]:
# Example usage of getrecs
getrecs('The other Guy')

Please check the spelling of the movie title or the movie may not be in our database :(


In [6]:
getrecs('The other Guys')

If you enjoyed The Other Guys (2012), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,Ted,2012,0.328445
1,Step Brothers,2008,0.378847
2,Neighbors,2014,0.379019
3,Zack and Miri Make a Porno,2008,0.392287
4,Kingsman: The Secret Service,2015,0.419232
5,Kick-Ass 2,2013,0.420392
6,"I Love You, Man",2009,0.451622
7,Hot Tub Time Machine,2010,0.454315
8,Role Models,2008,0.455296
9,Harold & Kumar Escape from Guantanamo Bay,2008,0.464452


In [7]:
getrecs('IRON man')

If you enjoyed Iron Man (2012), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,The Avengers,2012,0.285319
1,The Dark Knight,2008,0.285835
2,WALL·E,2008,0.298138
3,Iron Man 2,2010,0.307492
4,Avatar,2009,0.310893
5,Batman Begins,2005,0.362759
6,Star Trek,2009,0.366029
7,Watchmen,2009,0.368558
8,Guardians of the Galaxy,2014,0.368758
9,Up,2009,0.368857


In [8]:
getrecs('inCeption')

If you enjoyed Inception (2008), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,The Dark Knight,2008,0.213876
1,Inglourious Basterds,2009,0.305288
2,The Dark Knight Rises,2012,0.335075
3,The Avengers,2012,0.340302
4,Shutter Island,2010,0.345888
5,Django Unchained,2012,0.362976
6,Sherlock Holmes,2009,0.366418
7,Fight Club,1999,0.367898
8,Iron Man,2008,0.369175
9,The Hangover,2009,0.369214


In [9]:
getrecs('Ted')

If you enjoyed Ted (2014), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,Neighbors,2014,0.296747
1,The Other Guys,2010,0.328445
2,Kingsman: The Secret Service,2015,0.357116
3,Rise of the Planet of the Apes,2011,0.39285
4,Kick-Ass 2,2013,0.394764
5,Zack and Miri Make a Porno,2008,0.418579
6,Spy,2015,0.419628
7,A Million Ways to Die in the West,2014,0.428084
8,Dawn of the Planet of the Apes,2014,0.431285
9,The Secret Life of Walter Mitty,2013,0.435959


In [10]:
getrecs('Harry Potter and the Chamber of Secrets')

If you enjoyed Harry Potter and the Chamber of Secrets (2001), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,Harry Potter and the Sorcerer's Stone (a.k.a. ...,2001,0.196221
1,Harry Potter and the Prisoner of Azkaban,2004,0.208909
2,Harry Potter and the Goblet of Fire,2005,0.265915
3,Harry Potter and the Order of the Phoenix,2007,0.346729
4,Pirates of the Caribbean: Dead Man's Chest,2006,0.349314
5,Pirates of the Caribbean: The Curse of the Bla...,2003,0.367197
6,The Lord of the Rings: The Two Towers,2002,0.391141
7,Harry Potter and the Half-Blood Prince,2009,0.394569
8,Ice Age,2002,0.397131
9,Spider-Man,2002,0.398373


In [11]:
getrecs('Up')

If you enjoyed Up (2008), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,WALL·E,2008,0.259324
1,Avatar,2009,0.289607
2,Sherlock Holmes,2009,0.357785
3,Toy Story 3,2010,0.358239
4,Iron Man,2008,0.368857
5,Inception,2010,0.370812
6,The Dark Knight,2008,0.374999
7,The Avengers,2012,0.399443
8,The Hangover,2009,0.401675
9,The Dark Knight Rises,2012,0.404911


In [13]:
getrecs('Avatar')

If you enjoyed Avatar (2009), here are the top 10 movies we think you'll also enjoy!


Unnamed: 0,Title,Released Year,Distance
0,Up,2009,0.289607
1,WALL·E,2008,0.306969
2,District 9,2009,0.309947
3,Iron Man,2008,0.310893
4,Kung Fu Panda,2008,0.358604
5,The Dark Knight,2008,0.358937
6,The Hangover,2009,0.36419
7,I Am Legend,2007,0.389856
8,Inception,2010,0.393521
9,Zombieland,2009,0.39818


**Scrap Work**

In [12]:
# def getrecs(movie):
#     moviestorec = 10

#     # Find the movieId of the given movie title
#     mask = movies['title_reformatted'].str.upper() == movie.upper()
#     movie_df = movies.loc[mask, 'title_reformatted']

#     if movie_df.empty:
#         print("Please check the spelling of the movie title or the movie may not be in our database :(")
#         return

#     movie_id = movies.loc[mask, 'movieId'].values
#     movie_name = str(movie_df.values[0])  # Convert to string

#     if len(movie_id) > 0:
#         movie_id = movie_id[0]

#         # Check if the movieId exists in the moviesdb DataFrame
#         if str(movie_id) in moviesdb['movieId'].values:
#             movieindex = moviesdb[moviesdb['movieId'] == str(movie_id)].index
#             distances, indices = knn.kneighbors(csr_data[movieindex], n_neighbors=moviestorec + 1)
#             recmovie = sorted(list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())), key=lambda x: x[1])[:0:-1]
#             recframe = []

#             for val in recmovie:
#                 movieindex = moviesdb['movieId'].iloc[val[0]]
#                 released_year = int(movies[movies['movieId'] == int(movieindex)]['released_year'].values[0])
#                 recframe.append({
#                     'Title': movies[movies['movieId'] == int(movieindex)]['title_reformatted'].values[0],
#                     'Released Year': released_year,
#                     'Distance': val[1]
#                 })
#             df = pd.DataFrame(recframe, index=range(1, moviestorec + 1))
#             print(f"If you enjoyed {movie_name} ({released_year}), here are the top 10 movies we think you'll also enjoy!")
#             return df
#         else:
#             return print('There are not enough ratings for this movie.')
#     else:
#         return print('You get nothing you lose. Good day Sir!')

# # Example usage of getrecs
# getrecs('The other Guy')
