In [None]:
import sys
import numpy as np
import pandas as pd
sys.path.append('../')

from config.paths import RAW_DATA_PATH
from utils.files_management import fix_csv_with_commas_in_text, load_netflix_data, load_multiple_netflix_files
from utils.filters import filter_sparse_users_and_movies, filter_valid_ratings
from utils.data_split import temporal_train_test_split

In [2]:
# Paths of the four raw rating shards, kept both as a list and under
# individual names.
combined_data_path_list = [
    RAW_DATA_PATH / shard_file
    for shard_file in (
        "combined_data_1.txt",
        "combined_data_2.txt",
        "combined_data_3.txt",
        "combined_data_4.txt",
    )
]
(
    combined_data_1_path,
    combined_data_2_path,
    combined_data_3_path,
    combined_data_4_path,
) = combined_data_path_list

# Parquet file that the loading cell reads the ratings from.
concatenated_data = RAW_DATA_PATH / "data.parquet"

# Movie-title catalogue: the original CSV and the fixed copy that is read later.
movie_titles_path = RAW_DATA_PATH / "movie_titles.csv"
movie_titles_fixed_path = RAW_DATA_PATH / "movie_titles_fixed.csv"


In [3]:
# Read the ratings from the Parquet cache when it exists; otherwise build it
# from the raw shards so the notebook also runs from a fresh checkout
# (Restart Kernel -> Run All) instead of failing on a missing file.
# NOTE(review): assumes RAW_DATA_PATH is a pathlib.Path and that
# load_multiple_netflix_files returns the combined DataFrame after writing it
# to save_path — confirm against utils.files_management.
if concatenated_data.exists():
    df = pd.read_parquet(concatenated_data)
else:
    df = load_multiple_netflix_files(
        file_paths=combined_data_path_list,
        save_path=concatenated_data,
        verbose=False,
    )

In [4]:
# Generate the fixed titles file on first use so this cell does not depend on
# a manual, previously-run step.
# NOTE(review): assumes movie_titles_fixed_path is a pathlib.Path and that the
# helper takes (input_path, output_path) — confirm against utils.files_management.
if not movie_titles_fixed_path.exists():
    fix_csv_with_commas_in_text(movie_titles_path, movie_titles_fixed_path)

# The fixed file is ';'-separated, latin1-encoded and has no header row.
movie_titles = pd.read_csv(
    movie_titles_fixed_path,
    sep=';',
    encoding='latin1',
    header=None,
    names=['id', 'year', 'title'],
)

In [5]:
# Preview the first rows of the ratings table.
df.head()

Unnamed: 0,movie_id,customer_id,rating,date
0,1,1488844,3.0,2005-09-06
1,1,822109,5.0,2005-05-13
2,1,885013,4.0,2005-10-19
3,1,30878,4.0,2005-12-26
4,1,823519,3.0,2004-05-03


In [6]:
# Show the dtype of every column.
df.dtypes

movie_id                int64
customer_id             int64
rating                float64
date           datetime64[ns]
dtype: object

In [7]:
# Fraction of missing values per column.
df.isna().mean()

movie_id       0.0
customer_id    0.0
rating         0.0
date           0.0
dtype: float64

# Preprocessing

In [8]:
# Cast both identifier columns from int64 to str with pandas' own astype,
# avoiding the detour through a fixed-width NumPy unicode array.
for id_column in ('customer_id', 'movie_id'):
    df[id_column] = df[id_column].astype(str)


In [9]:
# All ratings are supposed to be between 1 and 5; hand those bounds to the
# rating filter.
min_rating, max_rating = 1, 5
df = filter_valid_ratings(df, min_rating=min_rating, max_rating=max_rating)

In [None]:
# Thresholds handed to the sparsity filter (presumably the minimum number of
# ratings a movie / a user must have to be kept — verify against utils.filters).
min_movie_ratings, min_user_ratings = 50, 10

# NOTE(review): df_filtered is not referenced by the later cells visible in
# this notebook, which keep working on df — confirm that this is intended.
df_filtered = filter_sparse_users_and_movies(
    df,
    min_movie_ratings=min_movie_ratings,
    min_user_ratings=min_user_ratings,
)

# EDA

#### Distribution of ratings

In [None]:
import matplotlib.pyplot as plt

# Share of each rating value, ordered by rating.
rating_dist = df['rating'].value_counts(normalize=True).sort_index()

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(rating_dist.index, rating_dist.values, color='skyblue')
ax.set_xlabel('Rating')
ax.set_ylabel('Proportion')
ax.set_title('Distribution of Ratings')
ax.set_xticks(rating_dist.index)  # show a tick for every rating value
plt.show()

In [None]:
# Count the distinct movies and users in the ratings table, then report both.
n_movies = df['movie_id'].nunique()
n_users = df['customer_id'].nunique()
print(f"Number of distinct movies: {n_movies}")
print(f"Number of distinct users:  {n_users}")

In [None]:
# Number of ratings each movie received, followed by summary statistics of
# those per-movie counts.
ratings_per_movie = df['rating'].groupby(df['movie_id']).count()
ratings_per_movie.describe()

In [None]:
# Mean rating per movie, then the ten highest and ten lowest averages.
avg_rating_per_movie = df.groupby('movie_id')['rating'].mean()
best_rated_movies = avg_rating_per_movie.sort_values(ascending=False).head(10)
worst_rated_movies = avg_rating_per_movie.sort_values().head(10)
print(f"Best rated movies: \n{best_rated_movies}")
# Leading "\n" separates the two listings; the previous "\W" was an invalid
# escape sequence that printed a literal backslash.
print(f"\nWorst rated movies: \n{worst_rated_movies}")

In [None]:
# Rank movies by number of ratings (ratings_per_movie is computed in an
# earlier cell) and show the ten most and ten least rated ones.
# Which movies appear among ties depends on sort_values' ordering.
most_rated_movies = ratings_per_movie.sort_values(ascending=False).head(10)
least_rated_movies = ratings_per_movie.sort_values().head(10)
print(f"Most rated movies: \n{most_rated_movies}")
print(f"\nLeast rated movies: \n{least_rated_movies}")

# Train Test Split

In [None]:
# Split the ratings into train and test sets.
# NOTE(review): test_size is presumably the fraction of rows held out, and the
# split presumably chronological (by `date`), going by the helper's name —
# confirm against utils.data_split.
# NOTE(review): this splits df, not the df_filtered produced by the sparsity
# filter above — confirm which frame is intended.
test_size = 0.2
train_df, test_df = temporal_train_test_split(df, test_size=test_size)

# Train and evaluate the SVD model

In [None]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# 1. Describe the data format: ratings are on a 1-5 scale, and the columns are
#    passed in (user, item, rating) order.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(train_df[['customer_id', 'movie_id', 'rating']], reader)

# 2. Fit on all of train_df (no further split here) and turn the held-out
#    test_df into the (user, item, rating) tuples passed to model.test().
trainset = data.build_full_trainset()
testset = list(zip(test_df['customer_id'], test_df['movie_id'], test_df['rating']))

# 3. Train the model. SVD starts from random factors, so the seed is fixed to
#    keep the reported RMSE reproducible across reruns.
model = SVD(random_state=42)
model.fit(trainset)

# 4. Evaluate on the held-out ratings.
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)
