# <h1 align=center>**`Movies Score - Data Engineering - PI`**</h1>

## ``Importación de librerías``

In [60]:
import pandas as pd
import numpy as np
# mostrar todas las filas y columnas
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## ``Carga de datasets``

In [61]:
amazon = pd.read_csv('../app/Datasets/amazon_prime_titles-score.csv')
disney = pd.read_csv('../app/Datasets/disney_plus_titles-score.csv')
hulu = pd.read_csv('../app/Datasets/hulu_titles-score (2).csv')
netflix = pd.read_csv('../app/Datasets/netflix_titles-score.csv')

## **Propuesta de trabajo (requerimientos de aprobación)**

**`Transformaciones`**:  El analista de datos requiere estas, ***y solo estas***, transformaciones para sus datos:


+ **Consigna 1:** Generar campo **`id`**: Cada id se compondrá de la primera letra del nombre de la plataforma, seguido del show_id ya presente en los datasets (ejemplo para títulos de Amazon = **`as123`**)

+ **Consigna 2:** Los valores nulos del campo rating deberán reemplazarse por el string “**`G`**” (corresponde al maturity rating: “general for all audiences”

+ **Consigna 3:** De haber fechas, deberán tener el formato **`AAAA-mm-dd`**

+ **Consigna 4:** Los campos de texto deberán estar en **minúsculas**, sin excepciones

+ **Consigna 5:** El campo ***duration*** debe convertirse en dos campos: **`duration_int`** y **`duration_type`**. El primero será un integer y el segundo un string indicando la unidad de medición de duración: min (minutos) o season (temporadas)

### ``Consigna 1``

In [62]:
def generate_id_show(initial_name, df):
    df['show_id'] = initial_name + df['show_id']
    return df

In [63]:
amazon = generate_id_show('a', amazon)
disney = generate_id_show('d', disney)
hulu = generate_id_show('h', hulu)
netflix = generate_id_show('n', netflix)

### ``Consigna 2``

In [64]:
# amazon[~amazon.isna()]
amazon['rating'].unique()

array([nan, '13+', 'ALL', '18+', 'R', 'TV-Y', 'TV-Y7', 'NR', '16+',
       'TV-PG', '7+', 'TV-14', 'TV-NR', 'TV-G', 'PG-13', 'TV-MA', 'G',
       'PG', 'NC-17', 'UNRATED', '16', 'AGES_16_', 'AGES_18_', 'ALL_AGES',
       'NOT_RATE'], dtype=object)

In [65]:
amazon['rating'] = amazon['rating'].fillna('g')
disney['rating'] = disney['rating'].fillna('g')
hulu['rating'] = hulu['rating'].fillna('g')
netflix['rating'] = netflix['rating'].fillna('g')

In [66]:
amazon.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'score'],
      dtype='object')

### ``Consigna 3``

In [67]:
def change_date_added_datetime(df):
    # pd.to_datetime(amazon['date_added'].str.strip(),  format='%B %d, %Y').dt.strftime('%Y-%m-%d')
    # str.strip(): Netflix tuvo datos con espacios en la columna date_added
    df['date_added'] = pd.to_datetime(df['date_added'].str.strip(),  format='%B %d, %Y').dt.strftime('%Y-%m-%d')
    return df

In [68]:
amazon = change_date_added_datetime(amazon)
disney = change_date_added_datetime(disney)
hulu = change_date_added_datetime(hulu)
netflix = change_date_added_datetime(netflix)

### ``Consigna 4``

In [69]:
def convert_to_lower(df):
    object_columns_list= df.dtypes[df.dtypes == 'object'].index.to_list()
    for col in object_columns_list:
        df[col] = df[col].str.lower()
    return df

In [70]:
amazon = convert_to_lower(amazon)
disney = convert_to_lower(disney)
hulu = convert_to_lower(hulu)
netflix = convert_to_lower(amazon)

### ``Consigna 5``

gna 5:** El campo ***duration*** debe convertirse en dos campos: **`duration_int`** y **`duration_type`**. El primero será un integer y el segundo un string indicando la unidad de medición de duración: min (minutos) o season (temporadas)

In [71]:
amazon['duration'].unique() # Todos se pueden convertir a tipo entero (int), no hay caracteres alfabéticos u otros que no sean numéricos.

array(['113 min', '110 min', '74 min', '69 min', '45 min', '52 min',
       '98 min', '131 min', '87 min', '92 min', '88 min', '93 min',
       '94 min', '46 min', '96 min', '1 season', '104 min', '62 min',
       '50 min', '3 seasons', '2 seasons', '86 min', '36 min', '37 min',
       '103 min', '9 min', '18 min', '14 min', '20 min', '19 min',
       '22 min', '60 min', '6 min', '54 min', '5 min', '84 min',
       '126 min', '125 min', '109 min', '89 min', '85 min', '56 min',
       '40 min', '111 min', '33 min', '34 min', '95 min', '99 min',
       '78 min', '4 seasons', '77 min', '55 min', '53 min', '115 min',
       '58 min', '49 min', '135 min', '91 min', '64 min', '59 min',
       '48 min', '122 min', '90 min', '102 min', '65 min', '114 min',
       '136 min', '70 min', '138 min', '100 min', '480 min', '4 min',
       '30 min', '152 min', '68 min', '57 min', '7 seasons', '31 min',
       '151 min', '149 min', '9 seasons', '141 min', '121 min', '79 min',
       '140 min', '51 min'

In [72]:
def normalize_duration(df):
    df[['duration_int', 'duration_type']] = df['duration'].str.split(' ', expand=True)
    df['duration_type'] = df['duration_type'].str.replace('seasons', 'season')
    df['duration_int'] = pd.to_numeric(df['duration_int'], downcast='integer', errors='coerce')
    return df

In [73]:
amazon = normalize_duration(amazon)
disney = normalize_duration(disney)
hulu = normalize_duration(hulu)
netflix = normalize_duration(netflix)

## ``Concatenación de dataframes``

In [74]:
movies = pd.concat([amazon, disney, hulu, netflix],axis=0)
movies.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,score,duration_int,duration_type
0,as1,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30,2014,g,113 min,"comedy, drama",a small fishing village must procure a local d...,99,113.0,min
1,as2,movie,take care good night,girish joshi,"mahesh manjrekar, abhay mahajan, sachin khedekar",india,2021-03-30,2018,13+,110 min,"drama, international",a metro family decides to fight a cyber crimin...,37,110.0,min


In [75]:
# Reordenar las columnas del dataframe
reordered_cols = ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added','release_year', 'rating', 'duration', 'duration_int', 'duration_type', 'listed_in', 'description','score']
movies[reordered_cols].head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,duration_int,duration_type,listed_in,description,score
0,as1,movie,the grand seduction,don mckellar,"brendan gleeson, taylor kitsch, gordon pinsent",canada,2021-03-30,2014,g,113 min,113.0,min,"comedy, drama",a small fishing village must procure a local d...,99
1,as2,movie,take care good night,girish joshi,"mahesh manjrekar, abhay mahajan, sachin khedekar",india,2021-03-30,2018,13+,110 min,110.0,min,"drama, international",a metro family decides to fight a cyber crimin...,37


In [76]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23859 entries, 0 to 9667
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   show_id        23859 non-null  object 
 1   type           23859 non-null  object 
 2   title          23859 non-null  object 
 3   director       16152 non-null  object 
 4   cast           18130 non-null  object 
 5   country        4195 non-null   object 
 6   date_added     4802 non-null   object 
 7   release_year   23859 non-null  int64  
 8   rating         23859 non-null  object 
 9   duration       23380 non-null  object 
 10  listed_in      23859 non-null  object 
 11  description    23855 non-null  object 
 12  score          23859 non-null  int64  
 13  duration_int   23380 non-null  float64
 14  duration_type  23380 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 2.9+ MB


In [77]:
# Exportar a JSON: convierte el DataFrame a JSON con orientación 'records' (lista de diccionarios)
movies_df = movies.to_json(orient='records')
# Guarda el JSON en un archivo .json
with open('app/src/db/movies_scores.json','w') as file:
    file.write(movies_df)

<h3 align='center'><b>Fin</b></h3>
<h3 align='center'><b>Gracias por leer</b></h3>