<a href="https://colab.research.google.com/github/prince02356/movie_recommendation_system/blob/main/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Dataset Preparation





2. Installing Required Libraries

In [None]:
!pip install pandas numpy scikit-learn tensorflow surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357280 sha256=dbf6bb0cb439fee73a5b519fa76a9573a8fc33f140ead2d8810c369cafbaa262
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Install

3. Importing Libraries and Mounting Google Drive

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from google.colab import drive

# Mount Google Drive at /content/drive so files stored in Drive (such as the
# dataset CSV loaded later from /content/drive/MyDrive/) are readable by path.
# This relies on the google.colab package, so it only works inside Colab.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


4. Loading and Preprocessing Data

In [None]:
# Load data (adjust file path to your dataset's location in Google Drive)
movies = pd.read_csv('/content/drive/MyDrive/movie_dataset.csv')
# Basic preprocessing: discard rows missing any of the text fields used below.
# The index is rebuilt afterwards so that index labels equal row positions;
# the TF-IDF and cosine-similarity matrices derived from this frame are purely
# positional, and dropna() would otherwise leave gaps that desynchronise
# label-based lookups from matrix rows.
movies = (
    movies
    .dropna(subset=['title', 'genres', 'overview'])
    .reset_index(drop=True)
)

5. Content-Based Filtering Using TF-IDF

In [None]:
# Turn every plot overview into a TF-IDF weighted term vector, ignoring
# common English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['overview'])

# Pairwise cosine similarity of every overview against every other overview.
# With a single argument the matrix is compared against itself, giving a
# square matrix whose row/column i corresponds to row position i of `movies`.
cosine_sim = cosine_similarity(tfidf_matrix)

6. Building a Recommendation Function

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``movies['title']``. If several rows share
        the title, the first one (in row order) is used.
    cosine_sim : 2-D array, optional
        Square similarity matrix whose row/column ``i`` corresponds to row
        *position* ``i`` of ``movies``. Defaults to the module-level
        ``cosine_sim`` as it existed when this function was defined.

    Returns
    -------
    pandas.Series
        Up to 10 titles, most similar first. The queried movie itself is
        never included.

    Raises
    ------
    IndexError
        If no row of ``movies`` has the given title.
    """
    # Locate the movie by row POSITION, not by index label: `cosine_sim` is a
    # plain positional matrix, whereas the labels of `movies` can have gaps
    # (rows were removed with dropna), so a label is not a valid matrix row.
    matches = np.flatnonzero((movies['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No movie titled {title!r} found in the dataset")
    idx = int(matches[0])

    # Pair each candidate's row position with its similarity to the query,
    # excluding the query explicitly. Relying on "the query always sorts
    # first" breaks when another movie ties with it (e.g. identical overview).
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Highest similarity first; sorted() is stable, so ties keep row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the 10 best and translate positions back into titles.
    movie_indices = [position for position, _ in candidates[:10]]
    return movies['title'].iloc[movie_indices]

# Smoke test: print the recommendations produced for "Toy Story".
# The title must match an entry in movies['title'] exactly; the lookup raises
# IndexError if no row has this title.
print(get_recommendations("Toy Story"))

42                 Toy Story 3
343                Toy Story 2
1779    The 40 Year Old Virgin
2869    For Your Consideration
891            Man on the Moon
3873             Class of 1984
3379              Factory Girl
3065                Heartbeeps
3383                 Losin' It
2569               Match Point
Name: title, dtype: object


7. Enhanced Model Using Neural Collaborative Filtering (NCF)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Flatten, Concatenate, Dense, Input
import numpy as np

# Toy user-item interaction data: three parallel arrays, one entry per rating
user_ids = np.array([1, 2, 3, 4, 5])  # Replace with real user IDs
movie_ids = np.array([285, 206647, 49026, 49529, 559])  # Replace with real movie IDs
ratings = np.array([6.9, 6.3, 7.6, 6.1, 5.9])  # Replace with real ratings

# Distinct IDs (np.unique returns them sorted ascending)
unique_user_ids = np.unique(user_ids)
unique_movie_ids = np.unique(movie_ids)

# Lookup tables: raw ID -> contiguous 0-based index, as needed by an
# embedding layer whose rows are addressed 0 .. n-1
user_id_mapping = dict(zip(unique_user_ids, range(len(unique_user_ids))))
movie_id_mapping = dict(zip(unique_movie_ids, range(len(unique_movie_ids))))

# Re-express every interaction in terms of the contiguous indices
user_ids_mapped = np.array([user_id_mapping[raw_user] for raw_user in user_ids])
movie_ids_mapped = np.array([movie_id_mapping[raw_movie] for raw_movie in movie_ids])


# Network definition: each branch receives one integer index per sample
user_input = Input(shape=(1,))
movie_input = Input(shape=(1,))

# One 10-dimensional embedding row per distinct user / movie; the table sizes
# match the contiguous indices produced by the ID re-mapping
user_embedding = Embedding(input_dim=len(unique_user_ids), output_dim=10)(user_input)
movie_embedding = Embedding(input_dim=len(unique_movie_ids), output_dim=10)(movie_input)

# Flatten each embedding lookup into a plain feature vector per sample
user_vec = Flatten()(user_embedding)
movie_vec = Flatten()(movie_embedding)

# Join both vectors and regress a single rating through one hidden layer
concat = Concatenate()([user_vec, movie_vec])
dense = Dense(units=128, activation='relu')(concat)
output = Dense(units=1)(dense)

model = Model(inputs=[user_input, movie_input], outputs=output)
model.compile(optimizer='adam', loss='mse')

# Train on the re-indexed IDs (not the raw ones) against the ratings
model.fit(x=[user_ids_mapped, movie_ids_mapped], y=ratings, epochs=5)

Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 43.5927
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step - loss: 43.4347
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - loss: 43.2802
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - loss: 43.1297
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 42.9851


<keras.src.callbacks.history.History at 0x7a85b9c6de10>