In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import difflib
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse
import os

# Plot styling: try the 'seaborn-v0_8' style sheet first. matplotlib raises
# OSError when a requested style cannot be found, in which case the built-in
# 'default' style is applied instead and a warning is printed.
preferred_style = 'seaborn-v0_8'
try:
    plt.style.use(preferred_style)
except OSError:
    plt.style.use('default')
    print("Warning: 'seaborn-v0_8' style not found, using 'default' style.")

# Create the directory that receives every artifact written below (pickles,
# plot, CSV). exist_ok=True makes this a no-op when the directory is already
# present and avoids the check-then-create race of exists() + makedirs().
os.makedirs('output', exist_ok=True)

# Read the two input tables from the working directory. `movies` is expected
# to provide movieId/title/genres and `ratings` userId/movieId/rating, since
# those are the columns the rest of this script reads.
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# Preview the first rows of each table.
for heading, frame in (('Movies head:', movies), ('Ratings head:', ratings)):
    print(heading)
    print(frame.head())

# Show the first ten titles as a plain list.
sample_titles = movies['title'].head(10).tolist()
print('Sample movie titles:', sample_titles)

# Data cleaning
# Movies without a genre get an explicit placeholder so the text pipeline
# never sees NaN.
movies['genres'] = movies['genres'].fillna('(no genres listed)')
# '|' is a regex metacharacter; regex=False forces a literal replacement
# regardless of the pandas version's default for `regex`.
movies['tags'] = movies['genres'].str.replace('|', ' ', regex=False)
movies['title'] = movies['title'].str.strip()
# Row labels of `new_data` are later used as row positions into the
# similarity matrix, so the index must be 0..n-1 with no gaps after
# duplicates are dropped.
new_data = (
    movies[['movieId', 'title', 'tags']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Sanity check 1: take the first title in the cleaned table and confirm an
# exact-match lookup finds it.
test_movie = new_data['title'].iloc[0]
print(f'Testing with movie: {test_movie}')
title_known = test_movie in new_data['title'].values
print(f"'{test_movie}' found in dataset." if title_known else f"'{test_movie}' NOT found.")

# Sanity check 2: fuzzy lookup of the same title (up to 3 matches with a
# similarity ratio of at least 0.8).
close_matches = difflib.get_close_matches(test_movie, new_data['title'], n=3, cutoff=0.8)
print(f"Close matches for '{test_movie}': {close_matches}")

# Sanity check 3: user 1 is used for the demo recommendation further down,
# so confirm that ID appears in the ratings table.
user_known = 1 in ratings['userId'].unique()
print('User ID found in dataset.' if user_known else 'User ID NOT found.')

# Content-based model: TF-IDF vectors over each movie's tag string, then the
# pairwise cosine similarity between all movies (row i / column j of
# `similarity` compares row i and row j of `new_data`).
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tag_corpus = new_data['tags'].values.astype('U')
vector = tfidf.fit_transform(tag_corpus).toarray()
similarity = cosine_similarity(vector)

# Persist the movie table and the similarity matrix side by side so they can
# be loaded together later.
for artifact_path, artifact in (
    ('output/movies_list.pkl', new_data),
    ('output/similarity.pkl', similarity),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

# Collaborative filtering with Surprise: wrap the ratings table in a Dataset
# (ratings range from 0.5 to 5.0) and hold out 20% for evaluation. The fixed
# random_state keeps the split reproducible.
reader = Reader(rating_scale=(0.5, 5.0))
rating_columns = ['userId', 'movieId', 'rating']
data = Dataset.load_from_df(ratings[rating_columns], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# User-based k-nearest-neighbours (k=20) using cosine similarity between users.
sim_options = dict(name='cosine', user_based=True)
algo = KNNBasic(k=20, sim_options=sim_options)
algo.fit(trainset)

# Score the held-out ratings and report the root-mean-square error.
predictions = algo.test(testset)
rmse_score = rmse(predictions, verbose=False)
print(f'RMSE: {rmse_score:.4f}')

# Precision@K
def precision_at_k(algo, testset, k=5, threshold=4.0):
    """Return the mean per-user precision@k of a set of rating predictions.

    For every user the predictions are ranked by estimated rating, the top
    ``k`` are kept, and the fraction ``(# with true rating >= threshold) / k``
    is computed. The result is the average of that fraction over all users.
    Users with fewer than ``k`` predictions are still divided by ``k``.

    Args:
        algo: Fitted Surprise algorithm. Only used (via ``algo.test``) when
            ``testset`` holds raw ``(uid, iid, rating)`` triples.
        testset: Either an iterable of 5-field prediction tuples
            ``(uid, iid, true_rating, estimate, details)`` or a raw testset
            of ``(uid, iid, rating)`` triples.
        k: Number of top-ranked items considered per user.
        threshold: Minimum true rating for an item to count as relevant.

    Returns:
        The mean precision@k, or 0 when there is nothing to evaluate or
        ``k`` is not positive.
    """
    # Use the data that was passed in rather than a module-level global, so
    # the function evaluates exactly what the caller asked for.
    rows = list(testset)
    if rows and len(rows[0]) == 3:
        # Raw (uid, iid, rating) triples: ask the algorithm for estimates.
        rows = algo.test(rows)

    # Group (estimate, true rating) pairs per user, preserving input order.
    user_recs = {}
    for uid, _iid, true_r, est, _details in rows:
        user_recs.setdefault(uid, []).append((est, true_r))

    precisions = []
    for scored in user_recs.values():
        # sorted() is stable, so ties keep their input order.
        top_k = sorted(scored, key=lambda pair: pair[0], reverse=True)[:k]
        relevant = sum(true_r >= threshold for _, true_r in top_k)
        precisions.append(relevant / k if k > 0 else 0)
    return sum(precisions) / len(precisions) if precisions else 0

# Report precision@5 (the function's default k) on the held-out predictions.
precision = precision_at_k(algo, predictions)
print(f'Precision@5: {precision:.4f}')

# Persist the fitted collaborative model together with the trainset it was
# fitted on.
for pkl_path, payload in (
    ('output/surprise_model.pkl', algo),
    ('output/trainset.pkl', trainset),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

# Hybrid recommendation function
def hybrid_recommend(user_id, movie_title, n=5):
    """Recommend movies similar to ``movie_title``, ranked for ``user_id``.

    The title is resolved with fuzzy matching against ``new_data['title']``.
    The ``n + 9`` most content-similar movies (by the precomputed
    ``similarity`` matrix) form the candidate pool; each candidate known to
    the collaborative model is scored with ``algo.predict`` and the ``n``
    highest estimates are returned.

    Args:
        user_id: Raw user ID as it appears in the ratings data.
        movie_title: Title to look up; near-misses are fuzzy matched.
        n: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, estimated_rating)`` tuples sorted by estimate,
        highest first, or a human-readable message string when the movie or
        user is unknown, no candidate could be scored, or an unexpected
        error occurred. No exception propagates to the caller.
    """
    try:
        titles = new_data['title']
        close_matches = difflib.get_close_matches(movie_title, titles, n=1, cutoff=0.8)
        if not close_matches:
            close = difflib.get_close_matches(movie_title, titles, n=3, cutoff=0.6)
            return f"Movie '{movie_title}' not found. Closest matches: {close if close else 'None'}"
        movie_title = close_matches[0]

        # Row *position* of the match. The similarity matrix is positional,
        # so an index label must not be used here: labels and positions
        # diverge whenever the frame's index is not 0..n-1.
        movie_pos = int(np.flatnonzero((titles == movie_title).to_numpy())[0])

        # Content-based: rank all movies by similarity, highest first. A
        # stable sort on the negated scores keeps ties in row order.
        scores = np.asarray(similarity[movie_pos])
        ranked = np.argsort(-scores, kind='stable')
        # Drop the query movie explicitly. Movies with identical tags tie at
        # the top, so the query is not guaranteed to be ranked first.
        ranked = ranked[ranked != movie_pos]
        candidates = ranked[:max(n + 9, 0)]

        # Collaborative: the user must be part of the training data.
        try:
            trainset.to_inner_uid(user_id)
        except ValueError:
            return f"User ID {user_id} not found."

        recommendations = []
        for pos in candidates:
            movie = new_data.iloc[int(pos)]
            movie_id = movie['movieId']
            try:
                # Skip candidates the collaborative model has never seen.
                trainset.to_inner_iid(movie_id)
            except ValueError:
                continue
            pred = algo.predict(user_id, movie_id)
            recommendations.append((movie['title'], pred.est))

        recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)[:n]
        return recommendations if recommendations else "No recommendations available."
    except Exception as e:
        # Deliberate catch-all: callers receive a message string rather than
        # an exception, matching the other failure paths above.
        return f"Error: {str(e)}"

# Demo: hybrid recommendations for user 1, seeded with the sample title
# selected during the sanity checks.
print(f'Recommended movies for User 1 based on {test_movie}:')
result = hybrid_recommend(user_id=1, movie_title=test_movie, n=5)
print(result)

# Genre distribution: split each movie's '|'-separated genre string into one
# column per genre, stack them into a single series, and count occurrences.
genre_columns = movies['genres'].str.split('|', expand=True)
genres = genre_columns.stack().str.strip().value_counts()

# Horizontal bar chart (counts on x, genre names on y), written to disk and
# closed afterwards so the figure is released.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=genres.values, y=genres.index, ax=ax)
ax.set_title('Genre Distribution')
ax.set_xlabel('Number of Movies')
ax.set_ylabel('Genre')
fig.tight_layout()
fig.savefig('output/genre_distribution.png')
plt.close(fig)

# Keep the raw counts next to the plot.
genres.to_csv('output/genre_distribution.csv')

Movies head:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
Ratings head:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
Sample movie titles: ['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)', 'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)