In [4]:
import sys
sys.path.append('..')

from src.data_preprocessing import DataPreprocessor
from src.feature_engineering import FeatureEngineer
import pandas as pd
import pickle
from scipy.sparse import save_npz, csr_matrix
import numpy as np
import os

# Ensure the data directory is present before any file access below
os.makedirs('../data', exist_ok=True)

# Read the raw tables (movies first, then ratings)
movies, ratings = (
    pd.read_csv(path)
    for path in ('../data/movies.csv', '../data/ratings.csv')
)

print("Original data shapes:")
print(f"Movies: {movies.shape}, Ratings: {ratings.shape}")

# Build the preprocessor and unpack its three outputs:
# processed movies, processed ratings, and the user-item matrix
preprocessor = DataPreprocessor(movies, ratings)
(
    movies_processed,
    ratings_processed,
    user_item_matrix,
) = preprocessor.get_preprocessed_data()

# Report how many rows survived preprocessing
print("\n=== Preprocessed Data Summary ===")
print(f"Processed movies: {len(movies_processed)}")
print(f"Processed ratings: {len(ratings_processed)}")

# Content-based features are only built for catalogues under 10,000 movies;
# larger datasets skip this step and carry None placeholders instead.
if len(movies_processed) >= 10000:
    print("Dataset too large for content features. Skipping feature engineering.")
    # NOTE: genre_names is not bound on this branch.
    genre_features = tfidf_features = None
else:
    feature_engineer = FeatureEngineer(movies_processed)
    genre_features, genre_names = feature_engineer.create_genre_features()
    tfidf_features = feature_engineer.create_content_features()

    # Dense ndarray results are wrapped as CSR matrices; anything that is
    # not an ndarray is left exactly as the feature engineer returned it.
    if isinstance(genre_features, np.ndarray):
        print("Converting genre features to sparse matrix...")
        genre_features = csr_matrix(genre_features)

    if isinstance(tfidf_features, np.ndarray):
        print("Converting TF-IDF features to sparse matrix...")
        tfidf_features = csr_matrix(tfidf_features)

# Persist every preprocessing artefact under ../data so later notebooks can
# reload them without re-running the pipeline.
print("\nSaving processed data...")
movies_processed.to_csv('../data/movies_processed.csv', index=False)
ratings_processed.to_csv('../data/ratings_processed.csv', index=False)

# Save sparse matrix efficiently (scipy .npz format)
print("Saving user-item matrix...")
save_npz('../data/user_item_matrix_sparse.npz', user_item_matrix)

# Feature matrices are None when feature engineering was skipped
if genre_features is not None:
    print("Saving genre features...")
    save_npz('../data/genre_features_sparse.npz', genre_features)

if tfidf_features is not None:
    print("Saving TF-IDF features...")
    save_npz('../data/tfidf_features_sparse.npz', tfidf_features)

# Save encoders for later use
print("Saving encoders...")
with open('../data/user_encoder.pkl', 'wb') as f:
    pickle.dump(preprocessor.user_encoder, f)

with open('../data/movie_encoder.pkl', 'wb') as f:
    pickle.dump(preprocessor.movie_encoder, f)

# Save genre names if available.
# genre_names is only assigned when feature engineering actually ran, so the
# guard must test genre_features first: the previous
# "'genre_features' in locals()" check was always true (the name is bound on
# both branches) and let the skipped-features path hit a NameError on
# genre_names. Short-circuiting on `genre_features is not None` avoids that.
if genre_features is not None and genre_names is not None:
    with open('../data/genre_names.pkl', 'wb') as f:
        pickle.dump(genre_names, f)

print("All processed data saved successfully!")

Original data shapes:
Movies: (9742, 3), Ratings: (100836, 4)
Preprocessing movies data...
Found 20 unique genres: {'War', 'Film-Noir', 'Documentary', 'Fantasy', 'Romance', 'Thriller', 'Adventure', 'IMAX', 'Children', 'Musical', 'Sci-Fi', 'Horror', 'Crime', 'Mystery', 'Western', '(no genres listed)', 'Comedy', 'Action', 'Drama', 'Animation'}
Preprocessing ratings data...
After filtering: 81116 ratings
Creating sparse user-item matrix...
Sparse user-item matrix shape: (610, 2269)
Sparsity: 94.14%
Number of ratings: 81116
Matrix size: 1384090

=== Preprocessed Data Summary ===
Processed movies: 9742
Processed ratings: 81116
Creating genre features...
Created 20 genre features
Creating content features...
TF-IDF matrix shape: (9742, 500)
Converting genre features to sparse matrix...

Saving processed data...
Saving user-item matrix...
Saving genre features...
Saving TF-IDF features...
Saving encoders...
All processed data saved successfully!
