In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors


# Read and validate the data
df = pd.read_csv('movies.csv')
print(f"Loaded {len(df)} movies")

# Verify required columns exist
required_columns = ['movieId', 'title', 'genres']
if not all(col in df.columns for col in required_columns):
    raise ValueError(f"Missing required columns. Expected: {required_columns}")

# Check for missing values
if df['genres'].isnull().any():
    print("Warning: Found missing genre values")
    df['genres'] = df['genres'].fillna('')

# Create feature matrix from genres
cv = CountVectorizer()
genre_matrix = cv.fit_transform(df['genres'].str.replace('|', ' '))

# Print some information about the features
print(f"Number of unique genres: {len(cv.get_feature_names_out())}")
print("Genres found:", ', '.join(cv.get_feature_names_out()))

# Initialize and fit KNN model
n_neighbors = 6  # 5 recommendations + the movie itself
knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
knn.fit(genre_matrix)

# Save the processed data and models
import pickle
data = {
    'df': df,
    'knn': knn,
    'genre_matrix': genre_matrix,
    'cv': cv
}
with open('movie_data.pkl', 'wb') as f:
    pickle.dump(data, f)

Loaded 62423 movies
Number of unique genres: 24
Genres found: action, adventure, animation, children, comedy, crime, documentary, drama, fantasy, fi, film, genres, horror, imax, listed, musical, mystery, no, noir, romance, sci, thriller, war, western


In [8]:
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
# Create feature matrix from genres
cv = CountVectorizer()
genre_matrix = cv.fit_transform(df['genres'].str.replace('|', ' '))

# Initialize and fit KNN model
n_neighbors = 6  # 5 recommendations + the movie itself
knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
knn.fit(genre_matrix)

# Save the processed data and models
import pickle
data = {
    'df': df,
    'knn': knn,
    'genre_matrix': genre_matrix,
    'cv': cv
}
with open('movie_data.pkl', 'wb') as f:
    pickle.dump(data, f)