# ETL


In [3]:
import pandas as pd
import ast

In [6]:
#movies = pd.read_csv("data/movies_dataset.csv")
#credits = pd.read_csv("data/credits.csv")
#movies["popularity"] = movies["popularity"].astype(str)
#movies.to_parquet("parquet_data/movies_parquet.parquet")
#credits.to_parquet("parquet_data/credits_parquet.parquet")

  movies = pd.read_csv("data/movies_dataset.csv")


In [13]:
movies_parquet = pd.read_parquet("parquet_data/movies_parquet.parquet")

In [14]:
movies_parquet.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,0.0,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,0.0,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,0.0,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,0.0,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,0.0,5.7,173.0


In [15]:
movies_df = movies_parquet.copy()

In [16]:
movies_df.isnull().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   0
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [233]:
movies_df.loc[:,"belongs_to_collection"].dtype

dtype('O')

In [234]:
belongs_to_collections_df = movies_df.loc[movies_df["belongs_to_collection"].isnull() == False, "belongs_to_collection"]
belongs_to_collections_df.shape

(4494,)

In [235]:
def safe_literal_eval(x):
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return [{}]

In [236]:
belongs_to_collections_df = belongs_to_collections_df.apply(safe_literal_eval)

In [237]:
belongs_to_collections_df.head()

0     {'id': 10194, 'name': 'Toy Story Collection', ...
2     {'id': 119050, 'name': 'Grumpy Old Men Collect...
4     {'id': 96871, 'name': 'Father of the Bride Col...
9     {'id': 645, 'name': 'James Bond Collection', '...
12    {'id': 117693, 'name': 'Balto Collection', 'po...
Name: belongs_to_collection, dtype: object

In [238]:
pd.DataFrame(belongs_to_collections_df.tolist())

AttributeError: 'float' object has no attribute 'keys'

### Data Types Consistency
Sometimes, there could be some unexpected data in a field, either a string datatype in a numeric field or a numeric dtype in a string field. It is important to check datatype consistency on each field. And the bellow ad-hoc function will help us to do that a bit more easy. 

##### What it does? 

Answere: It returns the indexes of intrusive rows in the series, which it'll be used as a mask. That will allow us to: 
 * Have a visualization of the inconsistencies and 
 * The rows position that we would like to impute them.

In [246]:
def dtype_checker(data: pd.DataFrame | pd.Series, column: str, data_type) -> list:
    """
    Returns an array of indexes of rows with a different data type in the specified column.

    Parameters:
        data (pd.DataFrame or pd.Series): The DataFrame or Series to check.
        column (str): The name of the column to check for data type.
        data_type: The expected data type for the values in the column.

    Returns:
        list: An array of indexes of rows where the data type in the specified column is different from the expected data type.
    """

    invalid_dtype_rows = []

    if isinstance(data, pd.DataFrame):
        for row in data[column].items():
            if not isinstance(row[1], data_type):
                invalid_dtype_rows.append(row[0])
    elif isinstance(data, pd.Series):
        for row in data.items():
            if not isinstance(row[1], data_type):
                invalid_dtype_rows.append(row[0])
    else:
        raise ValueError("Invalid input data type. The data must be a pandas DataFrame or Series.")

    return invalid_dtype_rows

In [247]:
dtype_checker(belongs_to_collections_df, None, dict)

[]

In [241]:
belongs_to_collections_df.loc[dtype_checker(belongs_to_collections_df, None, dict)]

19730    0.065736
29503    1.931659
35587    2.185485
Name: belongs_to_collection, dtype: object

In [242]:
belongs_to_collections_df.drop(dtype_checker(belongs_to_collections_df, None, dict), inplace=True)

In [243]:
belongs_to_collections_df

0        {'id': 10194, 'name': 'Toy Story Collection', ...
2        {'id': 119050, 'name': 'Grumpy Old Men Collect...
4        {'id': 96871, 'name': 'Father of the Bride Col...
9        {'id': 645, 'name': 'James Bond Collection', '...
12       {'id': 117693, 'name': 'Balto Collection', 'po...
                               ...                        
45355    {'id': 37261, 'name': 'The Carry On Collection...
45358    {'id': 37261, 'name': 'The Carry On Collection...
45369    {'id': 37261, 'name': 'The Carry On Collection...
45371    {'id': 477208, 'name': 'DC Super Hero Girls Co...
45382    {'id': 200641, 'name': 'Red Lotus Collection',...
Name: belongs_to_collection, Length: 4491, dtype: object

In [244]:
collections_df = pd.DataFrame(belongs_to_collections_df.tolist())

In [245]:
collections_df

Unnamed: 0,id,name,poster_path,backdrop_path
0,10194,Toy Story Collection,/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg,/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg
1,119050,Grumpy Old Men Collection,/nLvUdqgPgm3F85NMCii9gVFUcet.jpg,/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg
2,96871,Father of the Bride Collection,/nts4iOmNnq7GNicycMJ9pSAn204.jpg,/7qwE57OVZmMJChBpLEbJEmzUydk.jpg
3,645,James Bond Collection,/HORpg5CSkmeQlAolx3bKMrKgfi.jpg,/6VcVl48kNKvdXOZfJPdarlUGOsk.jpg
4,117693,Balto Collection,/w0ZgH6Lgxt2bQYnf1ss74UvYftm.jpg,/9VM5LiJV0bGb1st1KyHA3cVnO2G.jpg
...,...,...,...,...
4486,37261,The Carry On Collection,/2P0HNrYgKDvirV8RCdT1rBSJdbJ.jpg,/38tF1LJN7ULeZAuAfP7beaPMfcl.jpg
4487,37261,The Carry On Collection,/2P0HNrYgKDvirV8RCdT1rBSJdbJ.jpg,/38tF1LJN7ULeZAuAfP7beaPMfcl.jpg
4488,37261,The Carry On Collection,/2P0HNrYgKDvirV8RCdT1rBSJdbJ.jpg,/38tF1LJN7ULeZAuAfP7beaPMfcl.jpg
4489,477208,DC Super Hero Girls Collection,,


In [249]:
movies_df.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [None]:
movies_df.drop(["belongs_to_collection"], axis=1, inplace=True)

In [253]:
movies_df.head(2)

Unnamed: 0,adult,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [257]:
pd.merge(collections_df, movies_df, left_index=True, right_index=True, how="inner").head(2)

Unnamed: 0,id_x,name,poster_path_x,backdrop_path,adult,budget,genres,homepage,id_y,imdb_id,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,10194,Toy Story Collection,/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg,/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,119050,Grumpy Old Men Collection,/nLvUdqgPgm3F85NMCii9gVFUcet.jpg,/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [262]:
movies_df["genres"] = movies_df["genres"].apply(safe_literal_eval)

In [284]:
movies_df["genres"].head()

0    [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1    [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2    [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
3    [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4                       [{'id': 35, 'name': 'Comedy'}]
Name: genres, dtype: object

In [277]:
{'id': 16, 'name': 'Animation'} in movies_df["genres"][0]

True

In [281]:
movies_df["genres"][0][0].get("name")

'Animation'

`df_credit_copy["crew"].apply(lambda movie_crew: [member for member in movie_crew if member.get('job') == 'Director'])`

In [287]:
movies_df["genres"].apply(lambda genres: [genre for genre in genres if genre.get("name") == "Comedy"])

0        [{'id': 35, 'name': 'Comedy'}]
1                                    []
2        [{'id': 35, 'name': 'Comedy'}]
3        [{'id': 35, 'name': 'Comedy'}]
4        [{'id': 35, 'name': 'Comedy'}]
                      ...              
45461                                []
45462                                []
45463                                []
45464                                []
45465                                []
Name: genres, Length: 45466, dtype: object

In [288]:
movies_df["genres"][2]

[{'id': 10749, 'name': 'Romance'}, {'id': 35, 'name': 'Comedy'}]