# ETL


In [16]:
import pandas as pd
import ast


In [18]:
movies = pd.read_csv("data/movies_dataset.csv")
movies.head(2)

  movies = pd.read_csv("data/movies_dataset.csv")


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [19]:
movies_df = movies.copy()

In [21]:
movies_df.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [173]:
movies_df.loc[:,"belongs_to_collection"].dtype

dtype('O')

In [204]:
belongs_to_collections_df = movies_df.loc[movies_df["belongs_to_collection"].isnull() == False, "belongs_to_collection"]
belongs_to_collections_df.shape

(4494,)

In [205]:
def safe_literal_eval(x):
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return [{}]

In [206]:
belongs_to_collections_df = belongs_to_collections_df.apply(safe_literal_eval)

In [207]:
belongs_to_collections_df.head()

0     {'id': 10194, 'name': 'Toy Story Collection', ...
2     {'id': 119050, 'name': 'Grumpy Old Men Collect...
4     {'id': 96871, 'name': 'Father of the Bride Col...
9     {'id': 645, 'name': 'James Bond Collection', '...
12    {'id': 117693, 'name': 'Balto Collection', 'po...
Name: belongs_to_collection, dtype: object

In [209]:
original_index = belongs_to_collections_df.index

In [210]:
belongs_to_collections_df = belongs_to_collections_df.reset_index(drop=True)

In [211]:
pd.DataFrame(belongs_to_collections_df.tolist())

AttributeError: 'float' object has no attribute 'keys'

### Data Types Consistency
Sometimes, there could be some unexpected data in a field, either a string datatype in a numeric field or a numeric dtype in a string field. It is important to check datatype consistency on each field. And the bellow ad-hoc function will help us to do that a bit more easy. 

##### What it does? 

Answere: It returns the indexes of intrusive rows in the series, which it'll be used as a mask. That will allow us to: 
 * Have a visualization of the inconsistencies and 
 * The rows position that we would like to impute them.

In [212]:
def dtype_checker(data: pd.DataFrame, column: str, data_type) -> list:
    """
    Returns an array of indexes of rows with a different data type in the specified column.

    Parameters:
        data (pd.DataFrame or pd.Series): The DataFrame or Series to check.
        column (str): The name of the column to check for data type.
        data_type: The expected data type for the values in the column.

    Returns:
        list: An array of indexes of rows where the data type in the specified column is different from the expected data type.
    """

    invalid_dtype_rows = []

    if isinstance(data, pd.DataFrame):
        for idx, value in enumerate(data[column].values):
            if not isinstance(value, data_type):
                invalid_dtype_rows.append(idx)
    elif isinstance(data, pd.Series):
        for idx, value in enumerate(data.values):
            if not isinstance(value, data_type):
                invalid_dtype_rows.append(idx)
    else:
        raise ValueError("Invalid input data type. The data must be a pandas DataFrame or Series.")

    return invalid_dtype_rows

In [213]:
dtype_checker(belongs_to_collections_df, None, dict)

[2371, 3313, 3812]

In [214]:
belongs_to_collections_df.loc[dtype_checker(belongs_to_collections_df, None, dict)]

2371    0.065736
3313    1.931659
3812    2.185485
Name: belongs_to_collection, dtype: object

In [215]:
belongs_to_collections_df.drop(dtype_checker(belongs_to_collections_df, None, dict), inplace=True)

In [216]:
belongs_to_collections_df

0       {'id': 10194, 'name': 'Toy Story Collection', ...
1       {'id': 119050, 'name': 'Grumpy Old Men Collect...
2       {'id': 96871, 'name': 'Father of the Bride Col...
3       {'id': 645, 'name': 'James Bond Collection', '...
4       {'id': 117693, 'name': 'Balto Collection', 'po...
                              ...                        
4489    {'id': 37261, 'name': 'The Carry On Collection...
4490    {'id': 37261, 'name': 'The Carry On Collection...
4491    {'id': 37261, 'name': 'The Carry On Collection...
4492    {'id': 477208, 'name': 'DC Super Hero Girls Co...
4493    {'id': 200641, 'name': 'Red Lotus Collection',...
Name: belongs_to_collection, Length: 4491, dtype: object

In [217]:
collections_df = pd.DataFrame(belongs_to_collections_df.tolist())

In [221]:
collections_df.set_index(original_index)

ValueError: Length mismatch: Expected 4491 rows, received array of length 4494

In [220]:
movies_df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
