In [None]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

sys.path.append('../')

from config.paths import RAW_DATA_PATH, PROCESSED_DATA_PATH
from utils.files_management import fix_csv_with_commas_in_text, load_netflix_data, load_multiple_netflix_files
from utils.data_processing import filter_sparse_users_and_movies, filter_valid_ratings, convert_columns_to_string
from utils.data_split import temporal_train_test_split

In [None]:
# The four raw rating dumps, in order. The list is built first and the
# individual names are unpacked from it so both views stay in sync.
raw_dump_names = (
    "combined_data_1.txt",
    "combined_data_2.txt",
    "combined_data_3.txt",
    "combined_data_4.txt",
)
combined_data_path_list = [RAW_DATA_PATH / dump_name for dump_name in raw_dump_names]
(
    combined_data_1_path,
    combined_data_2_path,
    combined_data_3_path,
    combined_data_4_path,
) = combined_data_path_list

# Parquet file that the ratings frame is read from further down.
concatenated_data = RAW_DATA_PATH / "data.parquet"

# Movie-title CSV and the "fixed" copy that is actually parsed further down.
movie_titles_path = RAW_DATA_PATH / "movie_titles.csv"
movie_titles_fixed_path = RAW_DATA_PATH / "movie_titles_fixed.csv"


In [None]:
# Build the concatenated parquet from the raw dumps only when it is missing,
# so the notebook runs from a fresh checkout without uncommenting code while
# later runs reuse the cached file.
# NOTE(review): assumes load_multiple_netflix_files writes its result to
# `save_path` -- confirm in utils.files_management.
if not concatenated_data.exists():
    load_multiple_netflix_files(
        file_paths=combined_data_path_list,
        save_path=concatenated_data,
        verbose=False
    )

# Always read from the parquet file so `df` is identical whether or not the
# file had to be (re)built in this run.
df = pd.read_parquet(concatenated_data)

In [None]:
# Produce the fixed CSV only when it does not exist yet, instead of relying
# on a manually (un)commented line.
# NOTE(review): assumes fix_csv_with_commas_in_text(src, dst) writes `dst` --
# confirm in utils.files_management.
if not movie_titles_fixed_path.exists():
    fix_csv_with_commas_in_text(movie_titles_path, movie_titles_fixed_path)

# The fixed file is ';'-separated, latin1-encoded and has no header row.
movie_titles = pd.read_csv(
    movie_titles_fixed_path,
    sep=';',
    encoding='latin1',
    header=None,
    names=['id', 'year', 'title'],
)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column dtypes as read from the parquet file.
df.dtypes

In [None]:
# Fraction of missing values in each column.
df.isna().mean()

# Preprocessing

In [None]:
# Pass the two identifier columns through the project's string-conversion
# helper. NOTE(review): presumably casts them to str dtype -- confirm in
# utils.data_processing.
df = convert_columns_to_string(df, ['customer_id', 'movie_id'])

In [None]:
# All ratings are supposed to lie between 1 and 5.
# NOTE(review): presumably rows outside this range are dropped -- confirm in
# utils.data_processing.filter_valid_ratings.
min_rating, max_rating = 1, 5
df = filter_valid_ratings(
    df,
    min_rating=min_rating,
    max_rating=max_rating,
)

In [None]:
# Sparsity thresholds handed to the project's filter helper.
# NOTE(review): presumably movies/users with fewer ratings than these are
# removed -- confirm in utils.data_processing.filter_sparse_users_and_movies.
min_movie_ratings, min_user_ratings = 50, 10
df = filter_sparse_users_and_movies(
    df,
    min_movie_ratings=min_movie_ratings,
    min_user_ratings=min_user_ratings,
)

In [None]:
# Persist the preprocessed frame for downstream use.
processed_data_path = PROCESSED_DATA_PATH / "processed_data.parquet"
# Create the output directory if needed; to_parquet does not create missing
# parent directories, so a fresh checkout would otherwise fail here.
processed_data_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(processed_data_path)

# EDA

#### Distribution of ratings

In [None]:
# Share of all ratings that fall on each rating value, ordered by rating.
rating_dist = df['rating'].value_counts(normalize=True).sort_index()

# Explicit figure/axes interface instead of the pyplot state machine.
# `plt` comes from the notebook's top import cell.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(rating_dist.index, rating_dist.values, color='skyblue')
ax.set_xlabel('Rating')
ax.set_ylabel('Proportion')
ax.set_title('Distribution of Ratings')
ax.set_xticks(rating_dist.index)  # one tick per rating value present
plt.show()

In [None]:
# Number of unique movies and users in the filtered frame.
n_distinct_movies = df['movie_id'].nunique()
n_distinct_users = df['customer_id'].nunique()

print(f"Number of distinct movies: {n_distinct_movies}")
print(f"Number of distinct users:  {n_distinct_users}")

In [None]:
# Count of non-null ratings per movie, then summary statistics of those counts.
ratings_grouped_by_movie = df.groupby('movie_id')['rating']
ratings_per_movie = ratings_grouped_by_movie.count()
ratings_per_movie.describe()

In [None]:
# Mean rating per movie, then the ten highest and ten lowest averages.
avg_rating_per_movie = df.groupby('movie_id')['rating'].mean()
best_rated_movies = avg_rating_per_movie.sort_values(ascending=False).head(10)
worst_rated_movies = avg_rating_per_movie.sort_values().head(10)
print(f"Best rated movies: \n{best_rated_movies}")
# The leading "\n" separates the two listings. "\W" is not a valid escape
# sequence: it prints a literal backslash and triggers a warning on newer
# Python versions.
print(f"\nWorst rated movies: \n{worst_rated_movies}")

In [None]:
# Ten movies with the most ratings and ten with the fewest, based on the
# per-movie counts computed in an earlier cell.
n_listed = 10
most_rated_movies = ratings_per_movie.sort_values(ascending=False).head(n_listed)
least_rated_movies = ratings_per_movie.sort_values(ascending=True).head(n_listed)

print(f"Most rated movies: \n{most_rated_movies}")
print(f"\nLeast rated movies: \n{least_rated_movies}")