# Proyecto Individual 01 - Machine Learning Operations (MLOps)

Created By: Roy Quillca

Solución de problemas de modelos de recomendación en un entorno de streaming, desde la transformación de datos hasta el mantenimiento del modelo. Desarrollo de un MVP como Data Scientist en una start-up de agregación de plataformas de streaming.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Importacion de librerías
import pandas as pd
import numpy as np

# Gestionar las rutas
import pi_mlops.utils.paths as path

In [3]:
# Resolve the raw-data directory through the project's path helper.
# NOTE(review): assumes `raw_data_dir` is a pathlib.Path (it exposes .glob) - confirm.
data_dir = path.make_dir_function("data")
raw_data_dir = data_dir("raw")
# Materialise the matches as a sorted list: a bare glob() generator can be
# consumed only once (re-running the next cell would silently see no files)
# and yields entries in an arbitrary, filesystem-dependent order.
csv_file = sorted(raw_data_dir.glob("*.csv"))

In [4]:
# Map "<prefix>_v0" -> CSV path, where <prefix> is the part of the file name
# before its first underscore. A dedicated loop variable is used so that the
# `csv_file` collection is not rebound to a single path while being iterated.
csv_dict = {
    f"{csv_path.name.split('_')[0]}_v0": csv_path
    for csv_path in csv_file
}

In [5]:
# Read every CSV into a DataFrame, stored under the same key used in csv_dict.
dict_data = {}
for file_name, file_dir in csv_dict.items():
    dict_data[file_name] = pd.read_csv(file_dir)

# **Transformaciones**

_Generar campo id: Cada id se compondrá de la primera letra del nombre de la plataforma, seguido del show_id ya presente en los datasets (ejemplo para títulos de Amazon = as123)_

In [6]:
#------------------#
# Transformation 1:
#------------------#
def generate_id_show(dict_df: dict):
    """Prefix each ``show_id`` with the first character of its dictionary key.

    For every ``name -> DataFrame`` pair, ``name[0]`` is prepended to the
    ``show_id`` column. The DataFrame is skipped when that character already
    occurs as the first character of any existing ``show_id``, which keeps a
    second call from prefixing twice. Note that this same check also skips a
    DataFrame whose untouched ids happen to start with ``name[0]``.

    The DataFrames are modified in place and the same dict is returned.
    """
    for platform, frame in dict_df.items():
        prefix = platform[0]
        leading_chars = frame["show_id"].str[0].values
        if prefix in leading_chars:
            # Already carries the prefix (or collides with it): leave untouched.
            continue
        frame['show_id'] = prefix + frame['show_id']
        dict_df[platform] = frame
    return dict_df

In [7]:
# Apply transformation 1
dict_data = generate_id_show(dict_data)

## **Transformación 2**

Los valores nulos del campo rating deberán reemplazarse por el string “G” (corresponde al maturity rating: “general for all audiences”).


In [8]:
# Preview the first rows of the "amazon_v0" DataFrame.
dict_data["amazon_v0"].head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,as1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...
1,as2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",India,"March 30, 2021",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...
2,as3,Movie,Secrets of Deception,Josh Webber,"Tom Sizemore, Lorenzo Lamas, Robert LaSardo, R...",United States,"March 30, 2021",2017,,74 min,"Action, Drama, Suspense",After a man discovers his wife is cheating on ...
3,as4,Movie,Pink: Staying True,Sonia Anderson,"Interviews with: Pink, Adele, Beyoncé, Britney...",United States,"March 30, 2021",2014,,69 min,Documentary,"Pink breaks the mold once again, bringing her ..."
4,as5,Movie,Monster Maker,Giles Foster,"Harry Dean Stanton, Kieran O'Brien, George Cos...",United Kingdom,"March 30, 2021",1989,,45 min,"Drama, Fantasy",Teenage Matt Banting wants to work with a famo...


In [9]:
def fillna_on_rating(dict_data):
    """Replace missing ``rating`` values with ``'g'`` in every DataFrame.

    For each ``name -> DataFrame`` pair the nulls in ``rating`` are counted.
    When there is at least one, a message is printed and the nulls are filled
    with the string ``'g'``. A second message carrying the same pre-fill count
    is printed for every DataFrame, whether or not anything was filled.

    The DataFrames are modified in place and the same dict is returned.
    """
    for platform in dict_data:
        frame = dict_data[platform]
        # Counted once, before filling; both messages report this value.
        missing = frame['rating'].isna().sum()
        if missing > 0:
            print(f"{platform}: {missing} valores nulos/nan afectados.")
            frame['rating'] = frame['rating'].fillna('g')
            dict_data[platform] = frame
        print(f"{platform}: {missing} valores nulos.")
    return dict_data

In [10]:
# Apply transformation 2
dict_data = fillna_on_rating(dict_data)

amazon_v0: 337 valores nulos/nan afectados.
amazon_v0: 337 valores nulos.
disney_v0: 3 valores nulos/nan afectados.
disney_v0: 3 valores nulos.
hulu_v0: 520 valores nulos/nan afectados.
hulu_v0: 520 valores nulos.
netflix_v0: 4 valores nulos/nan afectados.
netflix_v0: 4 valores nulos.


In [11]:
# Count the nulls left in the "rating" column of the "amazon_v0" DataFrame.
dict_data["amazon_v0"]["rating"].isna().sum()

0

## **Transformación 3**

In [12]:
def change_date_added_datetime(dict_data):
    """Rewrite ``date_added`` as ``YYYY-MM-DD`` strings in every DataFrame.

    Surrounding whitespace is stripped from each value, the result is parsed
    with the format ``'%B %d, %Y'`` (e.g. ``March 30, 2021``), and the parsed
    dates are rendered back to text as ``'%Y-%m-%d'``.

    The DataFrames are modified in place and the same dict is returned.
    """
    for platform, frame in dict_data.items():
        trimmed = frame['date_added'].str.strip()
        parsed = pd.to_datetime(trimmed, format='%B %d, %Y')
        frame['date_added'] = parsed.dt.strftime('%Y-%m-%d')
        dict_data[platform] = frame
    return dict_data

In [13]:
# Apply transformation 3
dict_data = change_date_added_datetime(dict_data)

## **Transformación 4**

In [14]:
def convert_to_lower(dict_data):
    """Lower-case every ``object``-dtype column of every DataFrame.

    Columns are selected by comparing their dtype with ``'object'``; each
    selected column is replaced by the result of ``.str.lower()``.

    The DataFrames are modified in place and the same dict is returned.
    """
    for platform in dict_data:
        frame = dict_data[platform]
        text_columns = [
            column for column, dtype in frame.dtypes.items() if dtype == 'object'
        ]
        for column in text_columns:
            frame[column] = frame[column].str.lower()
        dict_data[platform] = frame
    return dict_data

In [15]:
# Apply transformation 4
dict_data = convert_to_lower(dict_data)

## **Transformación 5**

In [16]:
def normalize_duration(dict_data):
    """Split ``duration`` into ``duration_int`` and ``duration_type`` columns.

    Each ``duration`` value is split at its first space: the left part is
    converted to a number (values that cannot be converted become NaN) and
    stored in ``duration_int``; the right part is stored in ``duration_type``
    with ``'seasons'`` replaced by ``'season'``.

    The split is bounded to one cut and padded to two columns, so a DataFrame
    in which no value contains a space (or in which a value contains several)
    no longer raises on the two-column assignment; a missing right part is
    left as NaN.

    The DataFrames are modified in place and the same dict is returned.
    """
    for df_name, df in dict_data.items():
        # n=1 caps the result at two pieces; reindex guarantees both exist.
        pieces = df['duration'].str.split(' ', n=1, expand=True).reindex(columns=[0, 1])
        df['duration_int'] = pd.to_numeric(pieces[0], downcast='integer', errors='coerce')
        # astype(object) keeps the .str accessor usable when the padded column
        # is entirely NaN (it would otherwise be a float column).
        df['duration_type'] = pieces[1].astype(object).str.replace('seasons', 'season', regex=False)
        dict_data[df_name] = df
    return dict_data

In [17]:
# Apply transformation 5
dict_data = normalize_duration(dict_data)

In [18]:
# Print the column list of each platform DataFrame, one per line.
for platform_key in ('netflix_v0', 'hulu_v0', 'disney_v0', 'amazon_v0'):
    print(list(dict_data[platform_key].columns))

['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description', 'duration_int', 'duration_type']
['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description', 'duration_int', 'duration_type']
['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description', 'duration_int', 'duration_type']
['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'description', 'duration_int', 'duration_type']


In [19]:
def concat_dataframes(dict_data):
    """Stack the four platform DataFrames row-wise into a single DataFrame.

    The frames are taken from the keys ``amazon_v0``, ``disney_v0``,
    ``hulu_v0`` and ``netflix_v0``, in that order. Each frame keeps its own
    index values in the result.
    """
    platform_keys = ('amazon_v0', 'disney_v0', 'hulu_v0', 'netflix_v0')
    frames = [dict_data[key] for key in platform_keys]
    return pd.concat(frames, axis=0)

In [20]:
# Concatenate the DataFrames
df_movies = concat_dataframes(dict_data)

In [21]:
# Reorder the DataFrame columns
def reorder_cols(df):
    """Return a copy of ``df`` with a fixed column order and a fresh index.

    Only the fourteen listed columns are kept, in the listed order; any other
    column is dropped, and a listed column missing from ``df`` raises
    ``KeyError``. The index is reset to ``0..n-1`` and the old one discarded.
    The input DataFrame is not modified.
    """
    column_order = [
        'show_id', 'type', 'title', 'director', 'cast', 'country',
        'date_added', 'release_year', 'rating', 'duration', 'duration_int',
        'duration_type', 'listed_in', 'description',
    ]
    selected = df.loc[:, column_order]
    return selected.reset_index(drop=True)

In [22]:
# Reorder columns
df_movies = reorder_cols(df_movies)

In [33]:
# Data export
def save_as_parquet(df, file_name):
    """Write ``df`` to ``<file_name>.parquet`` via the project path helper.

    The destination is obtained by calling the helper returned from
    ``path.make_dir_function(["data", "interim"])`` with ``file_name``; the
    ``.parquet`` suffix is appended to its string form.
    NOTE(review): presumably this resolves inside the data/interim directory -
    confirm against ``pi_mlops.utils.paths``.
    """
    interim_dir = path.make_dir_function(["data", "interim"])
    target = f"{interim_dir(file_name)}.parquet"
    df.to_parquet(target)

In [34]:
# Save the combined DataFrame as "movies_v1" in Parquet format
save_as_parquet(df_movies, "movies_v1")