# Transformaciones a los datos

In [128]:
#Importar librerias necesarias para el proceso de transformación de datos
import pandas as pd
import numpy as np
import json
import ast

In [129]:
# Load the "movies_dataset" CSV file into a DataFrame
movies = pd.read_csv(
    "movies_dataset.csv",
    delimiter=",",
    low_memory=False,
    encoding="utf-8",
)
movies.head(3)  # preview the first rows to confirm the file was read correctly

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [130]:
movies.info() # check null counts and dtypes of every column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

# Extraer información de columnas diccionarios / listas
Se realizará la extracción de la información contenida de las siguientes columnas:
belongs_to_collection
genres
production_companies
production_countries
spoken_languages
Las cuales se encuentran anidadas, es decir, contienen un diccionario o una lista como valor en cada fila.

In [131]:
# Parse "belongs_to_collection" from its string form into a dict so its values can be
# read by key. Strings are parsed with ast.literal_eval; values that are already dicts
# are kept as they are, and anything else (e.g. missing values) becomes an empty dict.
# Passing dicts through makes the cell safe to re-run: ast.literal_eval raises when it
# is handed an already-parsed dict.
movies["belongs_to_collection"] = movies["belongs_to_collection"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else (x if isinstance(x, dict) else {})
)

In [132]:
# Inspect the keys of the belongs_to_collection dict stored in the first row
movies["belongs_to_collection"][0].keys()

dict_keys(['id', 'name', 'poster_path', 'backdrop_path'])

In [133]:
# Create one column per key of the belongs_to_collection dict.
# Each new column is named "<key>_collections" to show where it came from.
# Rows whose value is not a dict get None.
for clave in ("id", "name", "poster_path", "backdrop_path"):
    movies[f"{clave}_collections"] = movies["belongs_to_collection"].apply(
        lambda coleccion, clave=clave: coleccion.get(clave) if isinstance(coleccion, dict) else None
    )
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,id_collections,name_collections,poster_path_collections,backdrop_path_collections
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,10194.0,Toy Story Collection,/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg,/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg
1,False,{},65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,,,,
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,119050.0,Grumpy Old Men Collection,/nLvUdqgPgm3F85NMCii9gVFUcet.jpg,/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg
3,False,{},16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,,,,
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,96871.0,Father of the Bride Collection,/nts4iOmNnq7GNicycMJ9pSAn204.jpg,/7qwE57OVZmMJChBpLEbJEmzUydk.jpg


In [134]:
# Define the safe_eval helper
def safe_eval(x):
    '''
    Parse a string holding a Python literal into a list (typically a list of dicts).

    Parameters
    ----------
    x : any
        Raw cell value. Only strings are parsed.

    Returns
    -------
    list
        The parsed list. An empty list is returned when x is not a string, when the
        string cannot be parsed (the error is printed), or when the parsed value is
        not a list.
    '''
    if not isinstance(x, str):
        return []  # non-string input (e.g. NaN): nothing to parse
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
        # ast.literal_eval can raise any of these on malformed input
        print(f"Error al convertir: {x}, Error: {e}")
        return []
    # Honour the documented contract: callers always receive a list
    return parsed if isinstance(parsed, list) else []



In [135]:
# Apply safe_eval to convert the 'genres' column
movies['genres'] = movies['genres'].apply(safe_eval)

In [136]:
# Extract up to three genres per movie into genre_1_name .. genre_3_name.
# Only the 'name' value is kept; the genre ids are not relevant for the analysis.
for posicion in range(3):
    def _nombre_genero(generos, posicion=posicion):
        """Return the name of the genre at `posicion`, or None when it does not exist."""
        if not isinstance(generos, list) or len(generos) <= posicion:
            return None
        return generos[posicion]['name']

    movies[f'genre_{posicion+1}_name'] = movies['genres'].apply(_nombre_genero)

In [137]:
# Check that the new genre columns were created
movies.head() 

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,video,vote_average,vote_count,id_collections,name_collections,poster_path_collections,backdrop_path_collections,genre_1_name,genre_2_name,genre_3_name
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,False,7.7,5415.0,10194.0,Toy Story Collection,/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg,/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg,Animation,Comedy,Family
1,False,{},65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,False,6.9,2413.0,,,,,Adventure,Fantasy,Family
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,False,6.5,92.0,119050.0,Grumpy Old Men Collection,/nLvUdqgPgm3F85NMCii9gVFUcet.jpg,/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg,Romance,Comedy,
3,False,{},16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,False,6.1,34.0,,,,,Comedy,Drama,Romance
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,False,5.7,173.0,96871.0,Father of the Bride Collection,/nts4iOmNnq7GNicycMJ9pSAn204.jpg,/7qwE57OVZmMJChBpLEbJEmzUydk.jpg,Comedy,,


In [138]:
# Same procedure for production_companies: parse the column with safe_eval, then keep
# only the names of up to three production companies per movie.
companias = movies["production_companies"].apply(safe_eval)
movies["production_companies"] = companias
for idx in range(3):
    movies[f'companies_{idx+1}_name'] = companias.apply(
        lambda lista, idx=idx: lista[idx]['name'] if isinstance(lista, list) and len(lista) > idx else None
    )

In [139]:
movies[['companies_1_name', 'companies_2_name','companies_3_name']].head() # check the newly created columns

Unnamed: 0,companies_1_name,companies_2_name,companies_3_name
0,Pixar Animation Studios,,
1,TriStar Pictures,Teitler Film,Interscope Communications
2,Warner Bros.,Lancaster Gate,
3,Twentieth Century Fox Film Corporation,,
4,Sandollar Productions,Touchstone Pictures,


In [140]:
# Same procedure for production_countries: parse the column with safe_eval, then keep
# only the names of up to three producing countries per movie.
movies["production_countries"] = movies["production_countries"].apply(safe_eval)
for n in range(3):
    def _nombre_pais(paises, n=n):
        """Return the name of the country at position `n`, or None when it does not exist."""
        tiene_elemento = isinstance(paises, list) and len(paises) > n
        return paises[n]['name'] if tiene_elemento else None

    movies[f'country_{n+1}_name'] = movies["production_countries"].apply(_nombre_pais)

In [141]:
# List the column labels of the movies DataFrame
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'id_collections', 'name_collections',
       'poster_path_collections', 'backdrop_path_collections', 'genre_1_name',
       'genre_2_name', 'genre_3_name', 'companies_1_name', 'companies_2_name',
       'companies_3_name', 'country_1_name', 'country_2_name',
       'country_3_name'],
      dtype='object')

In [142]:
movies[['country_1_name', 'country_2_name','country_3_name']].head() # check the country columns

Unnamed: 0,country_1_name,country_2_name,country_3_name
0,United States of America,,
1,United States of America,,
2,United States of America,,
3,United States of America,,
4,United States of America,,


In [143]:
# Same procedure for spoken_languages: parse the column with safe_eval, then extract
# the ISO code and the name of up to three spoken languages per movie.
movies["spoken_languages"] = movies["spoken_languages"].apply(safe_eval)
for pos in range(3):
    for clave, sufijo in (("iso_639_1", "iso"), ("name", "name")):
        movies[f'language_{pos+1}_{sufijo}'] = movies["spoken_languages"].apply(
            lambda idiomas, pos=pos, clave=clave: idiomas[pos][clave] if isinstance(idiomas, list) and len(idiomas) > pos else None
        )

In [144]:
movies.columns # check the created columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'id_collections', 'name_collections',
       'poster_path_collections', 'backdrop_path_collections', 'genre_1_name',
       'genre_2_name', 'genre_3_name', 'companies_1_name', 'companies_2_name',
       'companies_3_name', 'country_1_name', 'country_2_name',
       'country_3_name', 'language_1_iso', 'language_1_name', 'language_2_iso',
       'language_2_name', 'language_3_iso', 'language_3_name'],
      dtype='object')

In [145]:
# Check the values in the new language columns
movies[['language_1_iso', 'language_1_name', 'language_2_iso','language_2_name', 'language_3_iso', 'language_3_name']].head()

Unnamed: 0,language_1_iso,language_1_name,language_2_iso,language_2_name,language_3_iso,language_3_name
0,en,English,,,,
1,en,English,fr,Français,,
2,en,English,,,,
3,en,English,,,,
4,en,English,,,,


# Rellenar valores nulos en columnas revenue y budget
Se procede a rellenar con el número 0 los campos nulos que se encuentran en las columnas revenue y budget

In [146]:
# Replace missing values with 0 in the revenue and budget columns
for columna in ("revenue", "budget"):
    movies[columna] = movies[columna].fillna(0)

In [147]:
# Check the dtypes of the revenue and budget columns
movies[["revenue", "budget"]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   revenue  45466 non-null  float64
 1   budget   45466 non-null  object 
dtypes: float64(1), object(1)
memory usage: 710.5+ KB


In [148]:
# Convert the budget column from object to float.
#
# Some rows hold non-numeric text in this column (values ending in ".jpg" were found).
# Every value that cannot be parsed as a number is coerced to NaN and then replaced
# by 0, instead of pattern-matching one specific kind of bad value. This also works
# when the column is already numeric, so the cell can be re-run safely.
movies["budget"] = pd.to_numeric(movies["budget"], errors="coerce").fillna(0).astype(float)


In [149]:
# Review the changes made to both columns with summary statistics.
# Floats are displayed with two decimals so describe() does not use scientific notation.
pd.set_option("display.float_format", "{:.2f}".format)
resumen_montos = movies[["revenue", "budget"]].describe()
resumen_montos

Unnamed: 0,revenue,budget
count,45466.0,45466.0
mean,11207869.28,4224300.06
std,64328130.52,17423591.55
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,2787965087.0,380000000.0


# Eliminar valores nulos de columna Release Date
Se procede a eliminar las filas que tienen valores nulos en la columna release date

In [150]:
# Check how many null values the release_date column has
movies["release_date"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 45466 entries, 0 to 45465
Series name: release_date
Non-Null Count  Dtype 
--------------  ----- 
45379 non-null  object
dtypes: object(1)
memory usage: 355.3+ KB


In [151]:
# Keep only the rows that have a release_date value
movies = movies.dropna(subset=["release_date"])

In [152]:
# Check the result of the previous step
movies["release_date"].info()

<class 'pandas.core.series.Series'>
Index: 45379 entries, 0 to 45465
Series name: release_date
Non-Null Count  Dtype 
--------------  ----- 
45379 non-null  object
dtypes: object(1)
memory usage: 709.0+ KB


In [153]:
# Reset the index after dropping rows (the old index is discarded)
movies = movies.reset_index(drop=True)
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45379 entries, 0 to 45378
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   adult                      45379 non-null  object 
 1   belongs_to_collection      45379 non-null  object 
 2   budget                     45379 non-null  float64
 3   genres                     45379 non-null  object 
 4   homepage                   7769 non-null   object 
 5   id                         45379 non-null  object 
 6   imdb_id                    45365 non-null  object 
 7   original_language          45368 non-null  object 
 8   original_title             45379 non-null  object 
 9   overview                   44438 non-null  object 
 10  popularity                 45377 non-null  object 
 11  poster_path                45040 non-null  object 
 12  production_companies       45379 non-null  object 
 13  production_countries       45379 non-null  obj

# Verificar el formato de las columnas que contienen fechas


In [154]:
# Check the dtype of the release_date column
movies["release_date"].dtype

dtype('O')

In [155]:
# Convert each release_date value to a pandas Timestamp, one value at a time, so that
# every value that cannot be converted is reported with its row label and the error.
# Values that fail are left unchanged, and the column keeps object dtype.
valores_convertidos = []
for ind, fecha in movies["release_date"].items():
    try:
        valores_convertidos.append(pd.to_datetime(fecha))
    except ValueError as e:
        # The value is bound to a local name so the f-string needs no nested double
        # quotes, which are a SyntaxError on Python versions older than 3.12.
        print(f"Error al convertir la fila {ind}: {fecha} {e}")
        valores_convertidos.append(fecha)
# Write the column back in one assignment instead of one .loc write per row
movies["release_date"] = pd.Series(valores_convertidos, index=movies.index, dtype=object)

Error al convertir la fila 19714: 1 Given date string "1" not likely a datetime, at position 0
Error al convertir la fila 29472: 12 Given date string "12" not likely a datetime, at position 0
Error al convertir la fila 35543: 22 Given date string "22" not likely a datetime, at position 0


In [156]:
# Drop the rows whose release_date cannot be interpreted as a date. They are treated as
# nulls, consistent with the earlier step that dropped rows with a null release_date.
# The rows are located from the data rather than by hard-coded index labels, so the cell
# keeps working if the input changes and does not raise KeyError when it is re-run.
fechas_validadas = pd.to_datetime(movies["release_date"], errors="coerce")
movies = movies.drop(index=movies.index[fechas_validadas.isna()])

In [157]:
# Convert the release_date column to datetime, using the YYYY-mm-dd format
movies["release_date"] = pd.to_datetime(movies["release_date"], format="%Y-%m-%d")

In [158]:
# Check the dtype of the column after the conversion
movies["release_date"].dtype

dtype('<M8[ns]')

# Creación de columna release_year
Se procede a crear la columna release_year con los años en la fecha de estreno (release_date)

In [159]:
# Take the year component of release_date as the new release_year column
movies["release_year"] = movies["release_date"].dt.year

In [160]:
# Check the new column
movies["release_year"].describe()

count   45376.00
mean     1991.88
std        24.06
min      1874.00
25%      1978.00
50%      2001.00
75%      2010.00
max      2020.00
Name: release_year, dtype: float64

# Crear la columna con el retorno de inversión Return
Se procede a crear la columna return usando la siguiente fórmula: revenue/budget. Cuando no hay valores para el cálculo (revenue o budget igual a 0), tomará el valor 0.

In [161]:
# Return on investment: revenue / budget, forced to 0 wherever revenue or budget is 0
sin_datos = movies["revenue"].eq(0) | movies["budget"].eq(0)
movies["return"] = (movies["revenue"] / movies["budget"]).mask(sin_datos, 0)

In [162]:
# Check the new column
movies["return"].describe()

count      45376.00
mean         660.04
std        74693.29
min            0.00
25%            0.00
50%            0.00
75%            0.00
max     12396383.00
Name: return, dtype: float64

# Eliminar columnas no deseadas
Se procede a eliminar las columnas que no serán utilizadas, video,imdb_id,adult,original_title,poster_path y homepage.

In [163]:
# Current number of columns, before dropping any
movies.shape[1]

45

In [164]:
# List the columns currently in the dataset
movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'id_collections', 'name_collections',
       'poster_path_collections', 'backdrop_path_collections', 'genre_1_name',
       'genre_2_name', 'genre_3_name', 'companies_1_name', 'companies_2_name',
       'companies_3_name', 'country_1_name', 'country_2_name',
       'country_3_name', 'language_1_iso', 'language_1_name', 'language_2_iso',
       'language_2_name', 'language_3_iso', 'language_3_name', 'release_year',
       'return'],
      dtype='object')

In [165]:
# Columns that will not be used: the ones listed in this section plus the collection
# image paths and the nested source columns that were already expanded above.
columnas_descartadas = [
    "video",
    "imdb_id",
    "adult",
    "original_title",
    "poster_path",
    "homepage",
    "poster_path_collections",
    "backdrop_path_collections",
    "belongs_to_collection",
    "genres",
    'production_companies',
    'production_countries',
]
movies = movies.drop(columns=columnas_descartadas)

In [166]:
movies.shape[1] # check the final number of columns

33

# Transformaciones a Credits
Se procede a realizar la lectura del archivo credits y a extraer los datos anidados en las columnas cast y crew

In [167]:
# Read the credits file
credits = pd.read_csv("credits.csv", sep= ",", encoding="utf-8")
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [168]:
# Rename the "id" column to "id_film"
credits = credits.rename(columns={"id":"id_film"})

In [169]:
# Preview credits after the rename
credits.head()

Unnamed: 0,cast,crew,id_film
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [170]:
# Apply safe_eval to the cast and crew columns to turn their strings into lists of dicts
for columna in ("cast", "crew"):
    credits[columna] = credits[columna].apply(safe_eval)

In [171]:
# Explode the 'cast' column so every element of each cast list gets its own row
expadir_cast = credits.explode('cast')

In [172]:
# Expand every exploded 'cast' entry into its own set of columns, and put them next to
# the remaining credits columns (the original cast and crew columns are left out).
cast_claves = expadir_cast.drop(columns=['cast', 'crew'])
cast_campos = expadir_cast['cast'].apply(pd.Series)
cast = pd.concat([cast_claves, cast_campos], axis=1)

In [173]:
# Check the result
cast.head(5)

Unnamed: 0,id_film,cast_id,character,credit_id,gender,id,name,order,profile_path,0
0,862,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg,
0,862,15.0,Buzz Lightyear (voice),52fe4284c3a36847f8024f99,2.0,12898.0,Tim Allen,1.0,/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg,
0,862,16.0,Mr. Potato Head (voice),52fe4284c3a36847f8024f9d,2.0,7167.0,Don Rickles,2.0,/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg,
0,862,17.0,Slinky Dog (voice),52fe4284c3a36847f8024fa1,2.0,12899.0,Jim Varney,3.0,/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg,
0,862,18.0,Rex (voice),52fe4284c3a36847f8024fa5,2.0,12900.0,Wallace Shawn,4.0,/oGE6JqPP2xH4tNORKNqxbNPYi7u.jpg,


In [174]:
# Remove the extra column labelled 0 that the expansion created.
# errors="ignore" keeps the cell from raising KeyError when that column is absent,
# e.g. when the cell is re-run after the column was already removed.
cast = cast.drop(columns=[0], errors="ignore")

In [175]:
# Check the cast dataset again
cast.head()

Unnamed: 0,id_film,cast_id,character,credit_id,gender,id,name,order,profile_path
0,862,14.0,Woody (voice),52fe4284c3a36847f8024f95,2.0,31.0,Tom Hanks,0.0,/pQFoyx7rp09CJTAb932F2g8Nlho.jpg
0,862,15.0,Buzz Lightyear (voice),52fe4284c3a36847f8024f99,2.0,12898.0,Tim Allen,1.0,/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg
0,862,16.0,Mr. Potato Head (voice),52fe4284c3a36847f8024f9d,2.0,7167.0,Don Rickles,2.0,/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg
0,862,17.0,Slinky Dog (voice),52fe4284c3a36847f8024fa1,2.0,12899.0,Jim Varney,3.0,/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg
0,862,18.0,Rex (voice),52fe4284c3a36847f8024fa5,2.0,12900.0,Wallace Shawn,4.0,/oGE6JqPP2xH4tNORKNqxbNPYi7u.jpg


In [176]:
# Repeat the procedure for the crew column, producing a dataset called "crew".
expadir_crew = credits.explode('crew')
# Build the per-member columns in a single DataFrame construction instead of expanding
# row by row with .apply(pd.Series), which is very slow on a frame of this size.
# Entries that are not dicts (the missing values explode() leaves for empty crew lists)
# become rows of missing values, so no stray column labelled 0 is created and there is
# nothing to drop afterwards.
crew_campos = pd.DataFrame(
    [miembro if isinstance(miembro, dict) else {} for miembro in expadir_crew['crew']],
    index=expadir_crew.index,
)
crew = pd.concat([expadir_crew.drop(columns=['cast', 'crew']), crew_campos], axis=1)
crew.head()

Unnamed: 0,id_film,credit_id,department,gender,id,job,name,profile_path
0,862,52fe4284c3a36847f8024f49,Directing,2.0,7879.0,Director,John Lasseter,/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg
0,862,52fe4284c3a36847f8024f4f,Writing,2.0,12891.0,Screenplay,Joss Whedon,/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg
0,862,52fe4284c3a36847f8024f55,Writing,2.0,7.0,Screenplay,Andrew Stanton,/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg
0,862,52fe4284c3a36847f8024f5b,Writing,2.0,12892.0,Screenplay,Joel Cohen,/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg
0,862,52fe4284c3a36847f8024f61,Writing,0.0,12893.0,Screenplay,Alec Sokolow,/v79vlRYi94BZUQnkkyznbGUZLjT.jpg


# Dataset Finales
Se procede a evaluar el contenido de los datasets finales y a realizar algunas depuraciones para el funcionamiento de la API.

In [177]:
# Column information for the movies dataset
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45376 entries, 0 to 45378
Data columns (total 33 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             45376 non-null  float64       
 1   id                 45376 non-null  object        
 2   original_language  45365 non-null  object        
 3   overview           44435 non-null  object        
 4   popularity         45376 non-null  object        
 5   release_date       45376 non-null  datetime64[ns]
 6   revenue            45376 non-null  float64       
 7   runtime            45130 non-null  float64       
 8   spoken_languages   45376 non-null  object        
 9   status             45296 non-null  object        
 10  tagline            20398 non-null  object        
 11  title              45376 non-null  object        
 12  vote_average       45376 non-null  float64       
 13  vote_count         45376 non-null  float64       
 14  id_collecti

In [178]:
movies["language_1_name"].unique() # list the distinct first spoken languages of the movies

array(['English', 'Français', '广州话 / 廣州話', '普通话', 'Pусский', 'Español',
       None, 'shqip', 'Italiano', 'Deutsch', 'فارسی', 'Nederlands',
       'Dansk', 'العربية', 'Magyar', '', 'Český', 'svenska', '日本語',
       'Português', 'Català', '한국어/조선말', 'Afrikaans', 'বাংলা', 'עִבְרִית',
       'Latin', 'Cymraeg', 'Tiếng Việt', 'Polski', 'български език',
       'ελληνικά', 'Norsk', 'Bosanski', 'Gaeilge', 'Bokmål',
       'Український', 'No Language', 'Kiswahili', 'Srpski', 'हिन्दी',
       'Azərbaycan', 'ภาษาไทย', 'Bamanankan', 'suomi', 'Română',
       'Hrvatski', 'Türkçe', 'ქართული', 'Slovenčina', 'беларуская мова',
       'Esperanto', 'Galego', 'Íslenska', 'isiZulu', 'Eesti', 'Latviešu',
       'қазақ', 'Slovenščina', 'Bahasa indonesia', 'Wolof', 'اردو',
       'Kinyarwanda', 'euskera', 'Bahasa melayu', 'தமிழ்', 'తెలుగు',
       'Lietuvi\x9akai', '?????', '??????', 'پښتو', 'ਪੰਜਾਬੀ', 'Somali',
       'Hausa', 'Fulfulde'], dtype=object)

In [179]:
# Keep only the movies whose first spoken language is one of the selected languages
lenguajes_final = ['English', 'Français', 'Español', 'Italiano']
habla_idioma = movies['language_1_name'].isin(lenguajes_final)
movies = movies[habla_idioma]
print(movies['language_1_name'].unique())
print('total de filas', len(movies))

['English' 'Français' 'Español' 'Italiano']
total de filas 31804


In [180]:
# Check the range of release years
print('año más antiguo',movies["release_year"].min())
print('año más nuevo', movies['release_year'].max())

año más antiguo 1902
año más nuevo 2020


In [181]:
# Keep only the movies released in 1980 or later
desde_1980 = movies['release_year'].ge(1980)
movies = movies[desde_1980]
print('año más antiguo', movies["release_year"].min())
print('año más nuevo', movies['release_year'].max())
print('total de filas', len(movies))

año más antiguo 1980
año más nuevo 2020
total de filas 22983


In [193]:
# Check the dtype of the "id" column.
# NOTE(review): the execution counter jumps from 181 to 193 here, and the saved output
# shows int64 although the cast to int only appears in the next cell — presumably the
# cells were run out of order; confirm with Restart & Run All.
movies["id"].dtype

dtype('int64')

In [196]:
# Restrict the "cast" dataset to the movies that remain in the final "movies" dataset.
# The id column is cast to int first so it can be compared with cast['id_film'].
# assign() builds a new DataFrame, which avoids writing a column into a filtered slice
# of movies (that would trigger pandas' SettingWithCopyWarning).
movies = movies.assign(id=movies["id"].astype(int))
cast = cast[cast['id_film'].isin(movies["id"])]
print('el total filas en el dataset cast es ', len(cast))

el total filas en el dataset cast es  313710


In [197]:
# Apply the same restriction to the crew dataset: keep only rows whose id_film is
# present in movies["id"]
pertenece_a_movies = crew['id_film'].isin(movies["id"])
crew = crew[pertenece_a_movies]
print('el total filas en el dataset crew es ', len(crew))

el total filas en el dataset crew es  300250
