In [1]:
import sys
import numpy as np
import pandas as pd
sys.path.append('../')

from config.paths import RAW_DATA_PATH, PROCESSED_DATA_PATH
from utils.files_management import fix_csv_with_commas_in_text, load_netflix_data, load_multiple_netflix_files
from utils.data_processing import filter_sparse_users_and_movies, filter_valid_ratings, convert_columns_to_string
from utils.data_split import temporal_train_test_split

In [2]:
# The four raw combined_data_N.txt files, kept in numeric order.
combined_data_path_list = [
    RAW_DATA_PATH / shard_name
    for shard_name in (
        "combined_data_1.txt",
        "combined_data_2.txt",
        "combined_data_3.txt",
        "combined_data_4.txt",
    )
]
(
    combined_data_1_path,
    combined_data_2_path,
    combined_data_3_path,
    combined_data_4_path,
) = combined_data_path_list

# Parquet file that the next cell reads into `df`.
concatenated_data = RAW_DATA_PATH / "data.parquet"

# Movie title CSV and the path of its "fixed" counterpart.
movie_titles_path, movie_titles_fixed_path = (
    RAW_DATA_PATH / "movie_titles.csv",
    RAW_DATA_PATH / "movie_titles_fixed.csv",
)

In [3]:
# Load the working DataFrame from the parquet file at `concatenated_data`.
df = pd.read_parquet(concatenated_data)

# Preprocessing

In [None]:
# Runtimes noted on earlier runs of this cell: 36.1 s, 57.7 s
# (presumably wall-clock — TODO confirm).
# Pass the two ID columns through the project helper; judging by its name it
# casts them to strings — TODO confirm against utils.data_processing.
df = convert_columns_to_string(df, ['customer_id', 'movie_id'])

In [None]:
# Runtimes noted on earlier runs of this cell:
# 3m 9.1s, 3m 17s
# NOTE(review): this local definition shadows the filter_valid_ratings that the
# first cell imports from utils.data_processing.
def filter_valid_ratings(
    df: pd.DataFrame,
    min_rating: int = 1,
    max_rating: int = 5,
) -> pd.DataFrame:
    """
    Keep only the rows whose 'rating' lies inside [min_rating, max_rating].

    Both bounds are inclusive. A NaN rating fails both comparisons and is
    therefore dropped. The input frame is not modified.

    Parameters:
        df (pd.DataFrame): Input DataFrame with a 'rating' column.
        min_rating (int): Smallest rating that is kept (default 1).
        max_rating (int): Largest rating that is kept (default 5).

    Returns:
        pd.DataFrame: The rows of `df` that pass the range check, with their
        original index labels and columns.
    """
    ratings = df['rating']
    within_bounds = (ratings >= min_rating) & (ratings <= max_rating)
    return df[within_bounds]

# All ratings are supposed to lie between 1 and 5; anything outside is dropped.
min_rating, max_rating = 1, 5
df = filter_valid_ratings(
    df,
    min_rating=min_rating,
    max_rating=max_rating,
)

In [None]:
# NOTE(review): this definition shadows the filter_sparse_users_and_movies imported
# from utils.data_processing in the first cell, and a later cell defines the
# same name again.
def filter_sparse_users_and_movies(
    df: pd.DataFrame,
    min_movie_ratings: int = 50,
    min_user_ratings: int = 10
    ) -> pd.DataFrame:
    """
    Filters out movies and users with very few ratings to reduce noise.

    Movie and user rating counts are both computed on `df` exactly as passed
    in, and a row is kept only when its movie AND its user reach their
    respective thresholds. Because the two conditions are applied together in
    a single pass, a user or movie that was kept can still have fewer rows
    than its threshold in the returned frame.

    Counting and membership tests use pandas' hash-based `value_counts` and
    `Series.isin`. `np.isin` is avoided on purpose: when either array has
    object dtype (string IDs) it falls back to comparing the data against each
    candidate ID in turn, which is O(rows * valid IDs).

    Parameters:
        df (pd.DataFrame): The original ratings DataFrame. Must contain 'movie_id' and 'customer_id' columns.
        min_movie_ratings (int): Minimum number of ratings required for a movie to be kept.
        min_user_ratings (int): Minimum number of ratings required for a user to be kept.

    Returns:
        pd.DataFrame: The rows of `df` that pass both checks, with their
        original index labels and columns. `df` itself is not modified.
    """
    movie_col = df['movie_id']
    user_col = df['customer_id']

    # Ratings per movie / per user on the unfiltered input.
    movie_counts = movie_col.value_counts()
    user_counts = user_col.value_counts()

    # IDs that reach their threshold.
    valid_movies = movie_counts[movie_counts >= min_movie_ratings].index
    valid_users = user_counts[user_counts >= min_user_ratings].index

    # Positional boolean mask, so indexing does not depend on the index labels of `df`.
    mask = (movie_col.isin(valid_movies) & user_col.isin(valid_users)).to_numpy()
    return df[mask]


# Thresholds: at least 50 ratings per movie and at least 10 per user.
# NOTE(review): a later cell redefines filter_sparse_users_and_movies and filters
# `df` again with the same thresholds, so a top-to-bottom run applies both passes.
min_movie_ratings, min_user_ratings = 50, 10
df = filter_sparse_users_and_movies(
    df,
    min_movie_ratings=min_movie_ratings,
    min_user_ratings=min_user_ratings,
)

In [None]:
# Runtime noted on an earlier run of this cell: 3m 47.5s

# NOTE(review): this redefines filter_sparse_users_and_movies from the previous
# cell (and the one imported from utils.data_processing); the last definition
# executed is the one in effect.
def filter_sparse_users_and_movies(
    df: pd.DataFrame,
    min_movie_ratings: int = 50,
    min_user_ratings: int = 10
) -> pd.DataFrame:
    """
    Drop rarely-rated movies first, then users left with too few ratings.

    The two steps run in sequence: movies are counted on `df` as passed in,
    and users are counted on what remains after the movie filter. Movie counts
    are not recomputed after the user filter, so a movie may end up with fewer
    than `min_movie_ratings` rows in the result.

    Parameters:
        df (pd.DataFrame): Ratings with 'movie_id' and 'customer_id' columns.
        min_movie_ratings (int): Minimum number of ratings a movie needs to be kept.
        min_user_ratings (int): Minimum number of ratings (after the movie
            filter) a user needs to be kept.

    Returns:
        pd.DataFrame: Filtered DataFrame.
    """
    def _keep_frequent(frame, column, threshold):
        # Rows of `frame` whose `column` value occurs at least `threshold` times in `frame`.
        occurrences = frame[column].value_counts()
        frequent_ids = occurrences.index[occurrences >= threshold]
        return frame[frame[column].isin(frequent_ids)]

    movies_filtered = _keep_frequent(df, 'movie_id', min_movie_ratings)
    return _keep_frequent(movies_filtered, 'customer_id', min_user_ratings)

# Thresholds: at least 50 ratings per movie and at least 10 per user.
# NOTE(review): an earlier cell already ran a filter_sparse_users_and_movies pass
# over `df` with the same thresholds; on a top-to-bottom run this is a second pass.
min_movie_ratings, min_user_ratings = 50, 10
df = filter_sparse_users_and_movies(
    df,
    min_movie_ratings=min_movie_ratings,
    min_user_ratings=min_user_ratings,
)

In [11]:
# Write the current `df` to a parquet file under PROCESSED_DATA_PATH.
processed_data_path = PROCESSED_DATA_PATH / "processed_data1.parquet"
df.to_parquet(processed_data_path)