# In [None]:
# Title of Project: Movies Recommendation System

# Objective: To build a machine learning model that recommends movies based on user preferences.

# Data Source: MovieLens dataset
# Download dataset: https://grouplens.org/datasets/movielens/

# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise.model_selection import cross_validate

# Import Data
# Relative paths: both CSV files are resolved against the current working directory.
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# Describe Data
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
movies.info()
print(movies.head())
print(ratings.describe())
print(ratings.head())

# Data Visualization

# Histogram of the rating values, grouped into 10 bins
rating_values = ratings['rating']
sns.histplot(rating_values, bins=10)
plt.title('Distribution of Movie Ratings')
plt.show()

# Bar chart of how many movies carry each genre label.
# The genres column is '|'-delimited, so each movie is split into one row per
# genre before the labels are counted.
genre_labels = movies['genres'].str.split('|').explode()
genre_counts = genre_labels.value_counts()
genre_counts.plot(kind='bar', figsize=(10, 5))
plt.title('Number of Movies per Genre')
plt.show()

# Data Preprocessing

# Remove exact duplicate movie rows. ignore_index=True renumbers the remaining
# rows 0..n-1 so index labels and row positions stay aligned for any later
# positional lookups (e.g. rows of a similarity matrix built from this frame).
movies.drop_duplicates(inplace=True, ignore_index=True)
# Discard rating rows that contain missing values.
ratings.dropna(inplace=True)

# Define Target Variable (y) and Feature Variables (X)
# X holds the genre strings (content-based example) and y the rating values
# (collaborative example). They come from different frames, so they are
# illustrative only and are not row-aligned with each other.
X = movies['genres']  # Example for content-based filtering
y = ratings['rating']  # Example for collaborative filtering

# Train Test Split
# 80/20 split of (userId, movieId) pairs against their ratings; the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    ratings[['userId', 'movieId']],
    ratings['rating'],
    test_size=0.2,
    random_state=42,
)

# Modeling
## Content-Based Filtering using TF-IDF and Cosine Similarity
# Treat every '|'-separated genre as a single token. The default word
# tokenizer would split hyphenated genres such as "Sci-Fi" into fragments, and
# an English stop-word filter would strip words from "(no genres listed)".
tfidf = TfidfVectorizer(token_pattern=r'[^|]+')
# Missing genres become empty documents instead of crashing the vectorizer.
tfidf_matrix = tfidf.fit_transform(movies['genres'].fillna(''))  # Genres as features
# Pairwise similarity between all movies; row/column i is the i-th row of `movies`.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Collaborative Filtering using SVD from Surprise
# Reader() assumes a 1-5 scale by default; derive the scale from the data so
# that ratings outside that range (e.g. half-star ratings of 0.5) are not
# clipped when predictions are made.
rating_scale = (float(ratings['rating'].min()), float(ratings['rating'].max()))
reader = Reader(rating_scale=rating_scale)
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
# Seed the split and the model so results are reproducible, matching the
# random_state used for the scikit-learn split.
trainset, testset = surprise_train_test_split(data, test_size=0.2, random_state=42)

svd_model = SVD(random_state=42)
svd_model.fit(trainset)

# Model Evaluation
# Evaluate Collaborative Filtering with RMSE
predictions = svd_model.test(testset)
actual_ratings = [pred.r_ui for pred in predictions]
estimated_ratings = [pred.est for pred in predictions]
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) works on every version.
rmse = float(np.sqrt(mean_squared_error(actual_ratings, estimated_ratings)))
print(f'Collaborative Filtering RMSE: {rmse}')

# Prediction

# Content-based filtering recommendation for a specific movie
def get_movie_recommendations(title, cosine_sim, top_n=10):
    """Return the titles of the movies most similar to *title*.

    Args:
        title: Exact movie title to look up in the module-level ``movies``
            frame; the first matching row is used.
        cosine_sim: Square similarity matrix whose row/column ``i``
            corresponds to the ``i``-th row (by position) of ``movies``.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A pandas Series of titles ordered from most to least similar. The
        queried movie itself is never included.

    Raises:
        IndexError: If no movie in ``movies`` has the given title.
    """
    # Work with row positions rather than index labels: the similarity matrix
    # and .iloc are positional, and labels stop matching positions as soon as
    # rows have been dropped or filtered.
    matches = np.flatnonzero((movies['title'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found: {title!r}")
    idx = int(matches[0])

    scores = np.asarray(cosine_sim[idx], dtype=float)
    # Stable descending sort keeps tied movies in their original order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried movie explicitly. It is not guaranteed to sort first,
    # because other movies with identical genres also score 1.0.
    ranked = ranked[ranked != idx][:top_n]
    return movies['title'].iloc[ranked]

# Content-based demo: print the titles most similar to one reference movie
print(get_movie_recommendations(title='Toy Story (1995)', cosine_sim=cosine_sim))

# Collaborative demo: estimate the rating a given user would give a given movie
user_id, movie_id = 1, 10
predicted_rating = svd_model.predict(uid=user_id, iid=movie_id)
print(f"Predicted rating for user {user_id} on movie {movie_id}: {predicted_rating.est}")

# Explanation:
# Content-Based Filtering: Recommends movies similar to a chosen movie based on genres.
# Collaborative Filtering: Recommends movies by predicting ratings based on user-movie interactions.
