In [12]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import json
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Encontrar el codo
from kneed import KneeLocator
from scipy.sparse import hstack

from ipywidgets import interact, widgets

In [16]:
# Path to the tab-separated movies file. Set the MOVIES_DAT_PATH environment
# variable to point the notebook at a copy stored elsewhere; when it is not set
# the original location is used.
file_path = os.environ.get(
    "MOVIES_DAT_PATH",
    "/Users/sergio/Desktop/TFG-Movistar/src/JupiterNotebook/datos/movies.dat",
)

# Column labels assigned to the frame. header=0 tells pandas to treat the first
# line of the file as a header row, which these names then replace.
column_names = [
    "MovieID", "Title", "imdbID", "spanishTitle", "imdbPictureURL", "year",
    "rtID", "rtAllCriticsRating", "rtAllCriticsNumReviews", "rtAllCriticsNumFresh",
    "rtAllCriticsNumRotten", "rtAllCriticsScore", "rtTopCriticsRating", "rtTopCriticsNumReviews",
    "rtTopCriticsNumFresh", "rtTopCriticsNumRotten", "rtTopCriticsScore", "rtAudienceRating",
    "rtAudienceNumRatings", "rtAudienceScore", "rtPictureURL",
]

# Stop here when the file is missing: every later cell needs `movies`, and
# carrying on would either fail with a NameError or silently reuse a stale
# frame left over from an earlier run.
if not os.path.exists(file_path):
    print("El archivo no existe en la ruta especificada.")
    raise FileNotFoundError(file_path)

try:
    movies = pd.read_csv(file_path, sep="\t", engine="python", header=0,
                         encoding="ISO-8859-1", names=column_names)
except Exception as e:
    # Show the problem in the cell output, then re-raise so the failure is not
    # swallowed and execution does not continue without a freshly loaded frame.
    print(f"Error al cargar el archivo: {e}")
    raise
else:
    print("Archivo cargado correctamente.")

Archivo cargado correctamente.


In [17]:
# Report the (rows, columns) dimensions of the loaded frame.
movies.shape

(10197, 21)

In [18]:
# Display the frame; the notebook renders a truncated preview of its rows and columns.
movies

Unnamed: 0,MovieID,Title,imdbID,spanishTitle,imdbPictureURL,year,rtID,rtAllCriticsRating,rtAllCriticsNumReviews,rtAllCriticsNumFresh,...,rtAllCriticsScore,rtTopCriticsRating,rtTopCriticsNumReviews,rtTopCriticsNumFresh,rtTopCriticsNumRotten,rtTopCriticsScore,rtAudienceRating,rtAudienceNumRatings,rtAudienceScore,rtPictureURL
0,1,Toy story,114709,Toy story (juguetes),http://ia.media-imdb.com/images/M/MV5BMTMwNDU0...,1995,toy_story,9,73,73,...,100,8.5,17,17,0,100,3.7,102338,81,http://content7.flixster.com/movie/10/93/63/10...
1,2,Jumanji,113497,Jumanji,http://ia.media-imdb.com/images/M/MV5BMzM5NjE1...,1995,1068044-jumanji,5.6,28,13,...,46,5.8,5,2,3,40,3.2,44587,61,http://content8.flixster.com/movie/56/79/73/56...
2,3,Grumpy Old Men,107050,Dos viejos gruñones,http://ia.media-imdb.com/images/M/MV5BMTI5MTgy...,1993,grumpy_old_men,5.9,36,24,...,66,7,6,5,1,83,3.2,10489,66,http://content6.flixster.com/movie/25/60/25602...
3,4,Waiting to Exhale,114885,Esperando un respiro,http://ia.media-imdb.com/images/M/MV5BMTczMTMy...,1995,waiting_to_exhale,5.6,25,14,...,56,5.5,11,5,6,45,3.3,5666,79,http://content9.flixster.com/movie/10/94/17/10...
4,5,Father of the Bride Part II,113041,Vuelve el padre de la novia (Ahora también abu...,http://ia.media-imdb.com/images/M/MV5BMTg1NDc2...,1995,father_of_the_bride_part_ii,5.3,19,9,...,47,5.4,5,1,4,20,3,13761,64,http://content8.flixster.com/movie/25/54/25542...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10192,65088,Bedtime Stories,960731,Más allá de los sueños,http://ia.media-imdb.com/images/M/MV5BMjA5Njk5...,2008,bedtime_stories,4.4,104,26,...,25,4.7,26,6,20,23,3.5,108877,63,http://content6.flixster.com/movie/10/94/33/10...
10193,65091,Manhattan Melodrama,25464,El enemigo público número 1,http://ia.media-imdb.com/images/M/MV5BMTUyODE3...,1934,manhattan_melodrama,7,12,10,...,83,0,4,2,2,50,3.7,344,71,http://content9.flixster.com/movie/66/44/64/66...
10194,65126,Choke,1024715,Choke,http://ia.media-imdb.com/images/M/MV5BMTMxMDI4...,2008,choke,5.6,135,73,...,54,4.9,26,8,18,30,3.3,13893,55,http://content6.flixster.com/movie/10/85/09/10...
10195,65130,Revolutionary Road,959337,Revolutionary Road,http://ia.media-imdb.com/images/M/MV5BMTI2MzY2...,2008,revolutionary_road,6.7,194,133,...,68,6.9,36,25,11,69,3.5,46044,70,http://content8.flixster.com/movie/10/88/40/10...


In [19]:
# Return a tuple holding the number of rows and the number of columns of the DataFrame.
movies.shape

(10197, 21)

In [24]:
# Display the frame again; the notebook renders a truncated preview of its rows and columns.
movies

Unnamed: 0,MovieID,Title,imdbID,spanishTitle,imdbPictureURL,year,rtID,rtAllCriticsRating,rtAllCriticsNumReviews,rtAllCriticsNumFresh,...,rtAllCriticsScore,rtTopCriticsRating,rtTopCriticsNumReviews,rtTopCriticsNumFresh,rtTopCriticsNumRotten,rtTopCriticsScore,rtAudienceRating,rtAudienceNumRatings,rtAudienceScore,rtPictureURL
0,1,Toy story,114709,Toy story (juguetes),http://ia.media-imdb.com/images/M/MV5BMTMwNDU0...,1995,toy_story,9,73,73,...,100,8.5,17,17,0,100,3.7,102338,81,http://content7.flixster.com/movie/10/93/63/10...
1,2,Jumanji,113497,Jumanji,http://ia.media-imdb.com/images/M/MV5BMzM5NjE1...,1995,1068044-jumanji,5.6,28,13,...,46,5.8,5,2,3,40,3.2,44587,61,http://content8.flixster.com/movie/56/79/73/56...
2,3,Grumpy Old Men,107050,Dos viejos gruñones,http://ia.media-imdb.com/images/M/MV5BMTI5MTgy...,1993,grumpy_old_men,5.9,36,24,...,66,7,6,5,1,83,3.2,10489,66,http://content6.flixster.com/movie/25/60/25602...
3,4,Waiting to Exhale,114885,Esperando un respiro,http://ia.media-imdb.com/images/M/MV5BMTczMTMy...,1995,waiting_to_exhale,5.6,25,14,...,56,5.5,11,5,6,45,3.3,5666,79,http://content9.flixster.com/movie/10/94/17/10...
4,5,Father of the Bride Part II,113041,Vuelve el padre de la novia (Ahora también abu...,http://ia.media-imdb.com/images/M/MV5BMTg1NDc2...,1995,father_of_the_bride_part_ii,5.3,19,9,...,47,5.4,5,1,4,20,3,13761,64,http://content8.flixster.com/movie/25/54/25542...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10192,65088,Bedtime Stories,960731,Más allá de los sueños,http://ia.media-imdb.com/images/M/MV5BMjA5Njk5...,2008,bedtime_stories,4.4,104,26,...,25,4.7,26,6,20,23,3.5,108877,63,http://content6.flixster.com/movie/10/94/33/10...
10193,65091,Manhattan Melodrama,25464,El enemigo público número 1,http://ia.media-imdb.com/images/M/MV5BMTUyODE3...,1934,manhattan_melodrama,7,12,10,...,83,0,4,2,2,50,3.7,344,71,http://content9.flixster.com/movie/66/44/64/66...
10194,65126,Choke,1024715,Choke,http://ia.media-imdb.com/images/M/MV5BMTMxMDI4...,2008,choke,5.6,135,73,...,54,4.9,26,8,18,30,3.3,13893,55,http://content6.flixster.com/movie/10/85/09/10...
10195,65130,Revolutionary Road,959337,Revolutionary Road,http://ia.media-imdb.com/images/M/MV5BMTI2MzY2...,2008,revolutionary_road,6.7,194,133,...,68,6.9,36,25,11,69,3.5,46044,70,http://content8.flixster.com/movie/10/88/40/10...


In [25]:
# Show the first rows to make sure the data was loaded correctly.
movies.head()

Unnamed: 0,MovieID,Title,imdbID,spanishTitle,imdbPictureURL,year,rtID,rtAllCriticsRating,rtAllCriticsNumReviews,rtAllCriticsNumFresh,...,rtAllCriticsScore,rtTopCriticsRating,rtTopCriticsNumReviews,rtTopCriticsNumFresh,rtTopCriticsNumRotten,rtTopCriticsScore,rtAudienceRating,rtAudienceNumRatings,rtAudienceScore,rtPictureURL
0,1,Toy story,114709,Toy story (juguetes),http://ia.media-imdb.com/images/M/MV5BMTMwNDU0...,1995,toy_story,9.0,73,73,...,100,8.5,17,17,0,100,3.7,102338,81,http://content7.flixster.com/movie/10/93/63/10...
1,2,Jumanji,113497,Jumanji,http://ia.media-imdb.com/images/M/MV5BMzM5NjE1...,1995,1068044-jumanji,5.6,28,13,...,46,5.8,5,2,3,40,3.2,44587,61,http://content8.flixster.com/movie/56/79/73/56...
2,3,Grumpy Old Men,107050,Dos viejos gruñones,http://ia.media-imdb.com/images/M/MV5BMTI5MTgy...,1993,grumpy_old_men,5.9,36,24,...,66,7.0,6,5,1,83,3.2,10489,66,http://content6.flixster.com/movie/25/60/25602...
3,4,Waiting to Exhale,114885,Esperando un respiro,http://ia.media-imdb.com/images/M/MV5BMTczMTMy...,1995,waiting_to_exhale,5.6,25,14,...,56,5.5,11,5,6,45,3.3,5666,79,http://content9.flixster.com/movie/10/94/17/10...
4,5,Father of the Bride Part II,113041,Vuelve el padre de la novia (Ahora también abu...,http://ia.media-imdb.com/images/M/MV5BMTg1NDc2...,1995,father_of_the_bride_part_ii,5.3,19,9,...,47,5.4,5,1,4,20,3.0,13761,64,http://content8.flixster.com/movie/25/54/25542...


In [26]:
# List the column labels of the frame.
movies.columns

Index(['MovieID', 'Title', 'imdbID', 'spanishTitle', 'imdbPictureURL', 'year',
       'rtID', 'rtAllCriticsRating', 'rtAllCriticsNumReviews',
       'rtAllCriticsNumFresh', 'rtAllCriticsNumRotten', 'rtAllCriticsScore',
       'rtTopCriticsRating', 'rtTopCriticsNumReviews', 'rtTopCriticsNumFresh',
       'rtTopCriticsNumRotten', 'rtTopCriticsScore', 'rtAudienceRating',
       'rtAudienceNumRatings', 'rtAudienceScore', 'rtPictureURL'],
      dtype='object')

In [27]:
# Print a per-column summary: non-null count and dtype.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10197 entries, 0 to 10196
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   MovieID                 10197 non-null  int64 
 1   Title                   10197 non-null  object
 2   imdbID                  10197 non-null  int64 
 3   spanishTitle            10197 non-null  object
 4   imdbPictureURL          10016 non-null  object
 5   year                    10197 non-null  int64 
 6   rtID                    9886 non-null   object
 7   rtAllCriticsRating      10197 non-null  object
 8   rtAllCriticsNumReviews  10197 non-null  object
 9   rtAllCriticsNumFresh    10197 non-null  object
 10  rtAllCriticsNumRotten   10197 non-null  object
 11  rtAllCriticsScore       10197 non-null  object
 12  rtTopCriticsRating      10197 non-null  object
 13  rtTopCriticsNumReviews  10197 non-null  object
 14  rtTopCriticsNumFresh    10197 non-null  object
 15  rt

In [None]:
# Count the missing values in each column.
movies.isna().sum()

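Solo tenemos valores nulos en las columnas imdbPictureURL y rtID. (Los marcadores `\N` de las columnas de rating/score son cadenas de texto, por lo que no se cuentan aquí como nulos.)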

In [None]:
# Frequency of each distinct rtID value.
movies["rtID"].value_counts()

In [None]:
# Histogram (30 bins) with a KDE overlay of the release year.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(movies["year"], bins=30, kde=True, ax=ax)
ax.set_title("Distribución del Año de Lanzamiento")
plt.show()

In [None]:
# Distribution of the all-critics rating (Rotten Tomatoes), drawn on a wide
# figure with the x-axis tick labels tilted by 45 degrees.
fig, ax = plt.subplots(figsize=(20, 6))
sns.histplot(movies["rtAllCriticsRating"], bins=30, kde=True, ax=ax)
ax.set_title("Distribución del Rating de Todos los Críticos (Rotten Tomatoes)")
plt.xticks(rotation=45)
plt.show()

Vamos a convertir las variables de tipo rating o score a numéricas. Los valores `\N` se convierten a nulos (NaN).

In [None]:
# Show the dtype of the all-critics rating column.
print(movies.rtAllCriticsRating.dtype)

In [None]:
# Coerce the all-critics rating to a numeric dtype; entries that cannot be
# parsed as numbers become NaN (errors="coerce").
rating_as_number = pd.to_numeric(movies["rtAllCriticsRating"], errors="coerce")
movies["rtAllCriticsRating"] = rating_as_number

In [None]:
# Show the dtype of the all-critics rating column.
print(movies.rtAllCriticsRating.dtype)


In [None]:
# Print the first values of the all-critics rating column.
print(movies.rtAllCriticsRating.head())

In [None]:
# Distribution of the all-critics rating (Rotten Tomatoes): 30-bin histogram
# with a KDE overlay and the x-axis tick labels tilted by 45 degrees.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(movies["rtAllCriticsRating"], bins=30, kde=True, ax=ax)
ax.set_title("Distribución del Rating de Todos los Críticos (Rotten Tomatoes)")
plt.xticks(rotation=45)
plt.show()

In [None]:
# List the dtype of every column.
movies.dtypes

In [None]:
# Disabled draft of a working copy; while it stays commented out, no
# `movies_cleaned` variable is created by this cell.
#movies_cleaned = movies.copy()
# Coerce the all-critics rating to numeric in place on `movies`; values that
# cannot be parsed become NaN. This repeats the conversion done in an earlier
# cell, so re-running it leaves the column unchanged.
movies['rtAllCriticsRating'] = pd.to_numeric(movies['rtAllCriticsRating'], errors='coerce')



In [None]:
# Frequency of each distinct all-critics review count.
movies["rtAllCriticsNumReviews"].value_counts()

In [None]:
# Coerce the all-critics review count to a numeric dtype; entries that cannot
# be parsed as numbers become NaN (errors='coerce'). The source column is read
# from `movies` itself: no `movies_cleaned` frame is created in the visible
# cells (its assignment is commented out), so referencing it would raise
# NameError.
movies['rtAllCriticsNumReviews'] = pd.to_numeric(movies['rtAllCriticsNumReviews'], errors='coerce')

In [None]:
# List the dtype of every column.
movies.dtypes

In [None]:
# Convert the column to a numeric dtype, forcing any non-numeric value to NaN.
num_reviews_as_number = pd.to_numeric(movies["rtAllCriticsNumReviews"], errors="coerce")
movies["rtAllCriticsNumReviews"] = num_reviews_as_number

In [None]:
# Frequency of each distinct all-critics review count.
movies["rtAllCriticsNumReviews"].value_counts()

In [None]:
# Show the dtype of the all-critics review-count column.
print(movies.rtAllCriticsNumReviews.dtype)

In [None]:
# Distribution of the number of all-critics reviews (Rotten Tomatoes): 30-bin
# histogram with a KDE overlay and the x-axis tick labels tilted by 45 degrees.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(movies["rtAllCriticsNumReviews"], bins=30, kde=True, ax=ax)
ax.set_title("Distribución del Número de reviews de Todos los Críticos (Rotten Tomatoes)")
plt.xticks(rotation=45)
plt.show()