<a href="https://colab.research.google.com/github/rabindra20-git/movie_recommender/blob/main/Movie_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Load data and preprocess
import os
import urllib.request
import zipfile

DATA_URL = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
ZIP_PATH = 'ml-latest-small.zip'
DATA_DIR = 'data'

# Download only when the archive is not already on disk, so re-running the cell is cheap.
# urlretrieve raises on HTTP errors instead of silently saving an error page as the zip.
# The download goes to a temporary name first so that an interrupted transfer never
# leaves a truncated file that would later be mistaken for a complete archive.
if not os.path.exists(ZIP_PATH):
    partial_path = ZIP_PATH + '.part'
    urllib.request.urlretrieve(DATA_URL, partial_path)
    os.replace(partial_path, ZIP_PATH)

with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
    zip_ref.extractall(DATA_DIR)

# import the dataset
movies_df = pd.read_csv(os.path.join(DATA_DIR, 'ml-latest-small', 'movies.csv'))
ratings_df = pd.read_csv(os.path.join(DATA_DIR, 'ml-latest-small', 'ratings.csv'))

# Lookup table: raw movieId -> title
movie_names = movies_df.set_index('movieId')['title'].to_dict()

class Loader(tf.keras.utils.Sequence):
    """Keras ``Sequence`` serving ``[user index, movie index] -> rating`` batches.

    The data comes from the module-level ``ratings_df``. Raw ``userId`` and
    ``movieId`` values are re-indexed to contiguous integers (in order of first
    appearance) so they can be fed directly to embedding layers.

    Attributes:
        userid2idx / movieid2idx: raw id -> contiguous index.
        idx2userid / idx2movieid: contiguous index -> raw id.
        x: integer array with one row per rating; column 0 is the user index,
            column 1 is the movie index.
        y: array of ratings aligned with ``x``.
    """

    def __init__(self, batch_size=128, **kwargs):
        """Build the index mappings and the feature/target arrays.

        Args:
            batch_size: number of ratings per batch (the last batch may be smaller).
            **kwargs: forwarded to the Keras base class constructor.
        """
        # Newer Keras versions expect Sequence subclasses to call the base
        # constructor and emit a warning when it is skipped.
        super().__init__(**kwargs)

        self.ratings = ratings_df.copy()

        users = self.ratings.userId.unique()
        movies = self.ratings.movieId.unique()

        self.userid2idx = {o: i for i, o in enumerate(users)}
        self.movieid2idx = {o: i for i, o in enumerate(movies)}
        self.idx2userid = {i: o for o, i in self.userid2idx.items()}
        self.idx2movieid = {i: o for o, i in self.movieid2idx.items()}

        # Dictionary-based map replaces a Python-level lambda call per row.
        self.ratings['movieId'] = self.ratings['movieId'].map(self.movieid2idx)
        self.ratings['userId'] = self.ratings['userId'].map(self.userid2idx)

        # Select the columns explicitly so the (user, movie) order does not
        # depend on the column order of the CSV file.
        self.x = self.ratings[['userId', 'movieId']].values
        self.y = self.ratings['rating'].values
        self.batch_size = batch_size
        # Not read by this class; kept for any external code that uses it.
        self.indexes = np.arange(len(self.ratings))

    def __len__(self):
        """Return the number of batches per epoch, counting a final partial batch."""
        return int(np.ceil(len(self.ratings) / self.batch_size))

    def __getitem__(self, index):
        """Return the ``(features, ratings)`` pair for batch number ``index``."""
        start_idx = index * self.batch_size
        end_idx = (index + 1) * self.batch_size
        batch_x = self.x[start_idx:end_idx]
        batch_y = self.y[start_idx:end_idx]
        return batch_x, batch_y

# Define the MatrixFactorization model using Keras API
class MatrixFactorization(tf.keras.Model):
    """Matrix-factorisation model: one embedding table for users, one for items.

    The prediction for a (user, item) pair is the sum over the factor axis of
    the element-wise product of the two embedding vectors.
    """

    def __init__(self, n_users, n_items, n_factors=20):
        """Create the two embedding tables.

        Args:
            n_users: number of rows in the user embedding table.
            n_items: number of rows in the item embedding table.
            n_factors: width of each embedding vector.
        """
        super().__init__()

        def build_table(n_rows):
            # A fresh initializer per table, drawing uniformly from [0, 0.05].
            return tf.keras.layers.Embedding(
                n_rows,
                n_factors,
                embeddings_initializer=tf.keras.initializers.RandomUniform(0, 0.05),
            )

        self.user_factors = build_table(n_users)
        self.item_factors = build_table(n_items)

    def call(self, inputs):
        """Score each row of ``inputs`` (column 0: user index, column 1: item index)."""
        user_vecs = self.user_factors(inputs[:, 0])
        item_vecs = self.item_factors(inputs[:, 1])
        return tf.reduce_sum(user_vecs * item_vecs, axis=1)

# Tunable settings, gathered in one place
SEED = 0
N_FACTORS = 8
LEARNING_RATE = 1e-3
N_CLUSTERS = 10
num_epochs = 128

# Seed Python, NumPy and TensorFlow so embedding initialisation and batch
# shuffling are repeatable from one run to the next.
tf.keras.utils.set_random_seed(SEED)

# Build the training data first: its index mappings define the embedding sizes
train_set = Loader()

# Instantiate the MatrixFactorization model.
# The sizes come from the loader's own mappings so the embedding tables always
# match the indices the loader emits.
n_users = len(train_set.userid2idx)
n_items = len(train_set.movieid2idx)
model = MatrixFactorization(n_users, n_items, n_factors=N_FACTORS)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE), loss='mse')

# Train the model
model.fit(train_set, epochs=num_epochs, verbose=1)

# Extract trained movie embeddings
trained_movie_embeddings = model.item_factors.embeddings.numpy()

# Perform KMeans clustering using scikit-learn.
# n_init is set explicitly because its default differs between scikit-learn
# releases (and omitting it triggers a FutureWarning on some of them).
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=SEED).fit(trained_movie_embeddings)


In [None]:
# Print top-rated movies in each cluster

# Aggregate mean rating and rating count for every movie in a single pass.
# Filtering the full ratings table once per movie inside the sort key made this
# cell scale with (number of movies x number of ratings).
rating_stats = ratings_df.groupby('movieId')['rating'].agg(['mean', 'count'])
avg_by_movie = rating_stats['mean'].to_dict()
count_by_movie = rating_stats['count'].to_dict()

# Ask the fitted model for its cluster count instead of repeating the literal
for cluster in range(kmeans.n_clusters):
    print("Cluster #{}".format(cluster))
    cluster_movies = np.where(kmeans.labels_ == cluster)[0]

    # Get movie indices sorted by average rating in descending order
    # (sorted() is stable, so ties keep ascending index order)
    sorted_movies = sorted(
        cluster_movies,
        key=lambda movidx: avg_by_movie[train_set.idx2movieid[movidx]],
        reverse=True,
    )

    # Print top 10 movies in the cluster
    for movidx in sorted_movies[:10]:
        movid = train_set.idx2movieid[movidx]
        rat_count = count_by_movie[movid]
        avg_rating = avg_by_movie[movid]
        print("\t{} (Avg. Rating: {:.2f}, Ratings: {})".format(movie_names[movid], avg_rating, rat_count))


Cluster #0
	Thief (1981) (Avg. Rating: 4.50, Ratings: 2)
	Last Tango in Paris (Ultimo tango a Parigi) (1972) (Avg. Rating: 4.50, Ratings: 5)
	Stunt Man, The (1980) (Avg. Rating: 4.50, Ratings: 3)
	Europa Europa (Hitlerjunge Salomon) (1990) (Avg. Rating: 4.33, Ratings: 3)
	Love & Mercy (2014) (Avg. Rating: 4.33, Ratings: 3)
	Happiness of the Katakuris, The (Katakuri-ke no kôfuku) (2001) (Avg. Rating: 4.33, Ratings: 3)
	Once Upon a Time in the West (C'era una volta il West) (1968) (Avg. Rating: 4.31, Ratings: 18)
	Voices from the List (2004) (Avg. Rating: 4.30, Ratings: 5)
	Outlaw Josey Wales, The (1976) (Avg. Rating: 4.25, Ratings: 18)
	Wild Tales (2014) (Avg. Rating: 4.25, Ratings: 10)
Cluster #1
	Welcome to Woop-Woop (1997) (Avg. Rating: 4.00, Ratings: 2)
	Return to Never Land (2002) (Avg. Rating: 4.00, Ratings: 1)
	Fantasticks, The (1995) (Avg. Rating: 4.00, Ratings: 2)
	Butcher Boy, The (1997) (Avg. Rating: 4.00, Ratings: 2)
	Neon Genesis Evangelion: Death & Rebirth (Shin seiki Evan

In [None]:
# Function to get recommendations for a given movie title
def get_recommendations(movie_title, top_n=10):
    """Recommend the best-rated movies from the same KMeans cluster as a movie.

    Relies on the notebook-level ``movies_df``, ``ratings_df``, ``movie_names``,
    ``train_set`` and ``kmeans`` objects.

    Args:
        movie_title: exact title as it appears in ``movies_df['title']``. If
            several movies share the title, the first one is used.
        top_n: maximum number of titles to return.

    Returns:
        Up to ``top_n`` titles from the query movie's cluster, ordered by mean
        rating (highest first), never including the query movie itself.

    Raises:
        IndexError: if no movie has the given title.
        KeyError: if the movie has no ratings and therefore no learned embedding.
    """
    title_matches = movies_df.loc[movies_df['title'] == movie_title, 'movieId']
    if title_matches.empty:
        raise IndexError("No movie titled {!r} was found".format(movie_title))
    movie_id = title_matches.iloc[0]

    # Movies listed in movies_df but never rated are absent from the training index.
    if movie_id not in train_set.movieid2idx:
        raise KeyError("Movie {!r} has no ratings, so it was not part of training".format(movie_title))
    movie_idx = train_set.movieid2idx[movie_id]
    cluster_label = kmeans.labels_[movie_idx]

    cluster_movies = np.flatnonzero(kmeans.labels_ == cluster_label)

    # One grouped pass over the ratings instead of re-filtering the whole table
    # for every movie inside the sort key.
    avg_by_movie = ratings_df.groupby('movieId')['rating'].mean().to_dict()
    sorted_movies = sorted(
        cluster_movies,
        key=lambda movidx: avg_by_movie[train_set.idx2movieid[movidx]],
        reverse=True,
    )

    recommendations = []
    for movidx in sorted_movies:
        if len(recommendations) >= top_n:
            break
        movid = train_set.idx2movieid[movidx]
        if movid != movie_id:  # Exclude the same movie
            recommendations.append(movie_names[movid])

    return recommendations

from difflib import get_close_matches

# Input partial movie title
partial_title = "Toy Story"

# Find the closest matching movie title.
# get_close_matches returns an empty list when nothing is similar enough, so
# the result is checked before it is indexed.
title_candidates = get_close_matches(partial_title, list(movie_names.values()), n=1)

if not title_candidates:
    close_match = None
    recommendations = []
    print("No movie title resembling '{}' was found.".format(partial_title))
else:
    close_match = title_candidates[0]

    # Print the closest matching movie title
    print("Closest Match:", close_match)

    # Get recommendations for the closest matching movie title
    recommendations = get_recommendations(close_match)

    print("\nRecommended movies for '{}':".format(close_match))
    for i, movie in enumerate(recommendations, start=1):
        # Titles are mapped back to ids here; the first id is used if a title repeats
        movie_id = movies_df[movies_df['title'] == movie]['movieId'].values[0]
        avg_rating = ratings_df[ratings_df['movieId'] == movie_id]['rating'].mean()
        print("{}. {} (Avg. Rating: {:.2f})".format(i, movie, avg_rating))



Closest Match: Toy Story (1995)

Recommended movies for 'Toy Story (1995)':
1. Lesson Faust (1994) (Avg. Rating: 5.00)
2. When Worlds Collide (1951) (Avg. Rating: 5.00)
3. Madame Sousatzka (1988) (Avg. Rating: 5.00)
4. Jane Eyre (1944) (Avg. Rating: 5.00)
5. Colourful (Karafuru) (2010) (Avg. Rating: 5.00)
6. 20 Million Miles to Earth (1957) (Avg. Rating: 5.00)
7. American Friend, The (Amerikanische Freund, Der) (1977) (Avg. Rating: 5.00)
8. 7 Faces of Dr. Lao (1964) (Avg. Rating: 5.00)
9. Dr. Goldfoot and the Bikini Machine (1965) (Avg. Rating: 5.00)
10. Into the Forest of Fireflies' Light (2011) (Avg. Rating: 5.00)
