In [81]:
# Importing requiered libraries
import numpy as np
import pandas as pd
import importlib
import utils.validation

# Reload the functions module
importlib.reload(utils.validation)

<module 'utils.validation' from 's:\\Courses\\Data Science\\0. Henry\\M7 - Labs - Project MLOps\\recommendation-system-movies-similarity-and-rating\\data\\utils\\validation.py'>

# Data Transformation Process

#### Importing files

In [82]:
# Importing movies_dataset.csv file
dataset_path_1 = "raw data\movies_dataset.csv"
dataset_path_2 = "raw data\credits.csv"

# Setting low_memory=False - To read the entire file so it determines the data types accurately. 
# Note: It can increase memory usage for larger datasets
info_movies = pd.read_csv(dataset_path_1, low_memory=False)
info_credits = pd.read_csv(dataset_path_2, low_memory=False)

#### Merge files in one dataset

In [83]:
# Checking number of rows for each file
print("rows in movies:", info_movies.shape[0])
print("rows in credits:", info_credits.shape[0])

rows in movies: 45466
rows in credits: 45476


In [84]:
# Checking for duplicates in id movies
duplicates_movies = utils.validation.count_duplicates(info_movies['id'])
print("Duplicated id values in movies", duplicates_movies)

Duplicated id values in movies id
141971    2
105045    1
11115     1
12600     1
10991     1
97995     1
298721    1
99080     1
159849    1
4912      1
15028     1
132641    1
42495     1
168538    1
69234     1
110428    1
5511      1
25541     1
18440     1
152795    1
119916    1
84198     1
109962    1
77221     1
13209     1
22649     1
14788     1
23305     1
265189    1
Name: count, dtype: int64


In [85]:
# Removing duplicates in id movies
info_movies_2 = utils.validation.remove_duplicates(info_movies, 'id')

In [86]:
# Checking for duplicates in id credits
duplicates_credits = utils.validation.count_duplicates(info_credits["id"])
print("Duplicated id values in movies", duplicates_credits)

Duplicated id values in movies id
141971    2
105045    1
42495     1
157301    1
9755      1
123634    1
8767      1
43629     1
187156    1
69234     1
168538    1
132641    1
11752     1
15028     1
4912      1
159849    1
99080     1
298721    1
97995     1
10991     1
12600     1
142563    1
24026     1
5511      1
24023     1
23305     1
14788     1
22649     1
13209     1
77221     1
109962    1
84198     1
119916    1
152795    1
18440     1
25541     1
110428    1
11115     1
116723    1
3057      1
125458    1
199591    1
265189    1
Name: count, dtype: int64


In [87]:
# Removing duplicates in id credits
info_credits_2 = utils.validation.remove_duplicates(info_credits, 'id')

In [88]:
# Checking data type of the values and the amount of each
value_counts_movies = info_movies_2["id"].value_counts()
print("id movies:", value_counts_movies.index.to_series().apply(type).value_counts())

value_counts_credits = info_credits_2["id"].value_counts()
print("id credits:", value_counts_credits.index.to_series().apply(type).value_counts())

id movies: id
<class 'str'>    45436
Name: count, dtype: int64
id credits: id
<class 'int'>    45432
Name: count, dtype: int64


In [89]:
# Checking for null values
null_count_movies = info_movies_2["id"].isnull().sum()
print("Null values in movies:", null_count_movies)

null_count_credits = info_credits_2["id"].isnull().sum()
print("Null values in credits:", null_count_credits)

Null values in movies: 0
Null values in credits: 0


In [90]:
# Checking for blank values
blank_count_movies = utils.validation.count_blanks(info_movies_2, "id")
print("Null values in movies:", blank_count_movies)

blank_count_credits = utils.validation.count_blanks(info_credits_2, "id")
print("Null values in credits:", blank_count_credits)

Null values in movies: 0
Null values in credits: 0


##### Working with id values from movies dataset

In [91]:
# Some str values are date time pattern
# Using data_pattern function created and stored in validation
id_invalid_values = utils.validation.date_pattern(info_movies_2["id"], 0)
id_invalid_values

['1997-08-20', '2012-09-29', '2014-01-01']

In [92]:
# Deleting str values 
# These values are removed because they are not sensitive data that affect the creation of the movie recommendation system.
info_movies_2 = info_movies_2[~info_movies_2["id"].isin(id_invalid_values)]

In [93]:
# Transforming values to numeric type using convert_to_numeric function created and stored in validation
info_movies_2 = utils.validation.convert_to_numeric(info_movies_2, "id")

In [94]:
# Checking data type of the values and the amount of each
value_counts_movies_2 = info_movies_2["id"].value_counts()
print("id movies:", value_counts_movies_2.index.to_series().apply(type).value_counts())

id movies: id
<class 'int'>    45433
Name: count, dtype: int64


##### Working with id values of credits

In [95]:
# Checking the amount of unique values
unique_values_credits = info_credits_2["id"].unique()

print("Number of rows:", info_credits_2["id"].shape[0])
print("Number of unique values:",unique_values_credits.shape[0])

Number of rows: 45432
Number of unique values: 45432


In [None]:
# These ids don't need any change

##### Merging files

In [96]:
# Merge the datasets based on the common "id" column
data_movies = pd.merge(info_movies_2, info_credits_2, on="id", how="left")

In [97]:
utils.validation.count_duplicates(data_movies["id"])

Series([], Name: count, dtype: int64)

#### Delete unimportant columns

In [98]:
# Deleting columns (video,imdb_id,adult,original_title,poster_path y homepage)
columns_to_drop = ["video", "imdb_id", "adult", "original_title", "poster_path", "homepage"]
data_movies_1 = data_movies.drop(columns=columns_to_drop)

#### Replace null values in "revenue" and "budget"

In [99]:
# Replacing null values with 0 in "revenue" and "budget"
# Later on will be created an inversion return column, thats why these values can't be null
columns_to_replace = ["revenue", "budget"]
data_movies_1[columns_to_replace] = data_movies_1[columns_to_replace].fillna(0)

#### Standardize "release date" values

In [100]:
# Check the number of rows that will be eliminated, to see if the process may affect considerably
null_count = data_movies_1["release_date"].isnull().sum()
print("Number of null values in 'release_date':", null_count)
print("Number of rows in table:", data_movies_1.shape[0])

Number of null values in 'release_date': 87
Number of rows in table: 45433


In [101]:
# Deleting rows with null values in release_date
data_movies_2 = data_movies_1.dropna(subset=["release_date"])

In [102]:
# Checking the date time type of the values from the release date column
# To verify if values are standardized or not ("object" will mean they're not)
data_type = data_movies_2["release_date"].dtype
print("Data type of 'release_date' column:", data_type)

Data type of 'release_date' column: object


Standardizing "realease_date" to date format (YYYY-mm-dd)

In [103]:
# Checking the amount of values that have unique values
unique_values = data_movies_2["release_date"].unique()

print("Number of rows:", data_movies_2["release_date"].shape[0])
print("Number of unique values:",unique_values.shape[0])

Number of rows: 45346
Number of unique values: 17333


In [104]:
# Checking data type of unique values and the amount of each
value_counts = data_movies_2["release_date"].value_counts()

print(value_counts.index.to_series().apply(type).value_counts())

release_date
<class 'str'>    17333
Name: count, dtype: int64


In [105]:
# Having only "str" type on unique values
# Identify values that do not have the format "yyyy-mm-dd", using the function date_pattern created and stored in validation 
values_without_pattern = utils.validation.date_pattern(data_movies_2["release_date"], 1)
values_without_pattern

[]

In [106]:
# All values have "yyyy-mm-dd" patter, transforming all values to datetime type

data_movies_3 = data_movies_2.copy()

# Convert "release_date" column to datetime
data_movies_3["release_date"] = pd.to_datetime(data_movies_3["release_date"]) 

In [107]:
# Checking data type of release_date
date_type = data_movies_3["release_date"].dtype
print("dates", date_type)

dates datetime64[ns]


#### Create "release_year" with year value from "release_date"

In [108]:
# Creating the column "release_year" using the year from "release_date"
data_movies_3["release_year"] = data_movies_3["release_date"].dt.year

In [109]:
# Checking data type of release_year
year_type = data_movies_3["release_year"].dtype
print("year", year_type)

year int32


#### Create "return" column

In [110]:
# Checking the type of the values from "revenue" and "budget"
print("revenue type:", data_movies_3["revenue"].dtype)
print("budget type:", data_movies_3["budget"].dtype)

revenue type: float64
budget type: object


##### Transforming budget values

In [111]:
# Checking the type of values from "budget", and how many of them are in the column
budget_data_types = data_movies_3["budget"].apply(type).value_counts()

print(budget_data_types)
print("Total of rows:", data_movies_3["budget"].shape[0])

budget
<class 'str'>    45346
Name: count, dtype: int64
Total of rows: 45346


In [112]:
# Transforming "budget" values to numeric type, using convert_to_numeric funtion created and stored in validation
data_movies_4 = utils.validation.convert_to_numeric(data_movies_3, "budget")

In [113]:
# Checking "budget" values are numeric type
print("budget type:", data_movies_4["budget"].dtype)

budget type: int64


##### Creating return column

In [114]:
# Creating the "return" column ("revenue" / "budget")
data_movies_4["return"] = data_movies_4["revenue"] / data_movies_4["budget"]
data_movies_4.loc[(data_movies_4["revenue"] == 0) | (data_movies_4["budget"] == 0), "return"] = 0

#### Unnest data from columns

##### Working with "crew" values

Analyzing values

In [115]:
# Total number of values
data_movies_4["crew"].shape[0]

45346

In [116]:
# Checking amount of data type
value_counts_crew = data_movies_4["crew"].value_counts()
print("id movies:", value_counts_crew.index.to_series().apply(type).value_counts())

id movies: crew
<class 'str'>    44623
Name: count, dtype: int64


In [117]:
# Total number of blank values
blanks_crew = utils.validation.count_blanks(data_movies_4, "crew")
print(blanks_crew)

1


In [118]:
# Total number of duplicates
duplicates_crew = utils.validation.count_duplicates(data_movies_4["crew"])
print(duplicates_crew)

crew
[]    722
Name: count, dtype: int64


In [119]:
# After checking the data types
# Replace "[]" with Empty space
data_movies_5 = data_movies_4.copy()
data_movies_5["crew"] = data_movies_5["crew"].replace("[]", "")

In [120]:
utils.validation.replace_nan_with_empty_string(data_movies_5, "crew")

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,release_year,return
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",1995,12.451801
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",1995,4.043035
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",1995,0.000000
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",1995,5.090760
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",1995,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45427,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,en,"Yet another version of the classic epic, with ...",5.683753,"[{'name': 'Westdeutscher Rundfunk (WDR)', 'id'...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1991-05-13,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Robin Hood,5.7,26.0,"[{'cast_id': 1, 'character': 'Sir Robert Hode'...","[{'credit_id': '52fe44439251416c9100a899', 'de...",1991,0.000000
45429,,0,"[{'id': 18, 'name': 'Drama'}]",111109,tl,An artist struggles to finish his work while a...,0.178241,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,...,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,9.0,3.0,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",2011,0.000000
45430,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,3.8,6.0,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",2003,0.000000
45431,,0,[],227506,en,"In a small town live two brothers, one a minis...",0.003503,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,...,[],Released,,Satan Triumphant,0.0,0.0,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",1917,0.000000


In [121]:
# Total number of blank values
utils.validation.get_column_summary(data_movies_5, "crew")

(crew
 <class 'str'>    45346
 Name: count, dtype: int64,
 'total values:',
 45346)

Checking valid python syntax

In [122]:
# Check valid python syntax for crew column
valid_crew = utils.validation.check_valid_expression(data_movies_5, "crew")
valid_crew

{'valid_count': 44622,
 'invalid_count': 724,
 'nan_count': 0,
 'blank_count': 724,
 'error_count': 0,
 'invalid_values': [],
 'error_values': []}

Extracting values

In [123]:
data_movies_6 = data_movies_5.copy()

# Ectracting nested values from crew
column_name = "crew"
keys = ["id", "name", "gender", "department", 'job']
new_columns = ["crew_id", "crew_name", "crew_gender", "crew_department", "crew_job"]

In [124]:
utils.validation.extract_values(data_movies_6, column_name, keys, new_columns)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,vote_count,cast,crew,release_year,return,crew_id,crew_name,crew_gender,crew_department,crew_job
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,...,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",1995,12.451801,"[7879, 12891, 7, 12892, 12893, 12894, 12895, 1...","[John Lasseter, Joss Whedon, Andrew Stanton, J...","[2, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 2, 0, 0, 0, ...","[Directing, Writing, Writing, Writing, Writing...","[Director, Screenplay, Screenplay, Screenplay,..."
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,...,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",1995,4.043035,"[511, 876, 1729, 4945, 4951, 4952, 8023, 9967,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...","[2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2]","[Production, Writing, Sound, Directing, Editin...","[Executive Producer, Screenplay, Original Musi..."
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",1995,0.000000,"[26502, 16837, 16837, 1551320]","[Howard Deutch, Mark Steven Johnson, Mark Stev...","[2, 2, 2, 2]","[Directing, Writing, Writing, Crew]","[Director, Characters, Writer, Sound Recordist]"
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",1995,5.090760,"[2178, 5144, 5144, 21968, 70592, 111118, 11111...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...","[2, 0, 0, 2, 1, 0, 0, 0, 2, 0]","[Directing, Writing, Production, Production, P...","[Director, Screenplay, Producer, Producer, Pro..."
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,...,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",1995,0.000000,"[37, 5506, 17698, 17698, 26160, 56106, 68755]","[Alan Silvestri, Elliot Davis, Nancy Meyers, N...","[2, 2, 1, 1, 2, 2, 2]","[Sound, Camera, Writing, Production, Writing, ...","[Original Music Composer, Director of Photogra..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45427,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,en,"Yet another version of the classic epic, with ...",5.683753,"[{'name': 'Westdeutscher Rundfunk (WDR)', 'id'...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1991-05-13,...,26.0,"[{'cast_id': 1, 'character': 'Sir Robert Hode'...","[{'credit_id': '52fe44439251416c9100a899', 'de...",1991,0.000000,"[17784, 1471628, 39812, 1471628, 18073, 10716,...","[John Irvin, Sam Resnick, John McGrath, Sam Re...","[2, 0, 2, 0, 1, 0, 0, 2, 1]","[Directing, Writing, Writing, Writing, Product...","[Director, Writer, Writer, Story, Producer, Mu..."
45429,,0,"[{'id': 18, 'name': 'Drama'}]",111109,tl,An artist struggles to finish his work while a...,0.178241,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,...,3.0,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",2011,0.000000,"[1051381, 1051381, 1728582, 1051381, 1051381, ...","[Lav Diaz, Lav Diaz, Dante Perez, Lav Diaz, La...","[0, 0, 0, 0, 0, 0]","[Directing, Writing, Art, Sound, Editing, Crew]","[Director, Writer, Production Design, Music, E..."
45430,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,...,6.0,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",2003,0.000000,"[67753, 19713, 549355, 549356, 58818]","[Mark L. Lester, C. Courtney Joyner, Jeffrey G...","[2, 2, 2, 0, 2]","[Directing, Writing, Writing, Sound, Camera]","[Director, Screenplay, Screenplay, Original Mu..."
45431,,0,[],227506,en,"In a small town live two brothers, one a minis...",0.003503,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,...,0.0,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",1917,0.000000,"[1085341, 1195656]","[Yakov Protazanov, Joseph N. Ermolieff]","[0, 2]","[Directing, Production]","[Director, Producer]"


##### Working with "cast" values

Analyzing values

In [125]:
# Total number of values
data_movies_6["cast"].shape[0]

45346

In [126]:
# Checking amount of data type
value_counts_cast = data_movies_6["cast"].value_counts()
print("id movies:", value_counts_cast.index.to_series().apply(type).value_counts())

id movies: cast
<class 'str'>    42998
Name: count, dtype: int64


In [127]:
# Total number of blank values
blanks_cast = utils.validation.count_blanks(data_movies_6, "cast")
print(blanks_cast)

1


In [128]:
# Total number of duplicates
duplicates_cast = utils.validation.count_duplicates(data_movies_6["cast"])
print(duplicates_cast)

cast
[]    2347
Name: count, dtype: int64


In [131]:
# After checking the data types
# Replace "[]" with Empty space
data_movies_6["cast"] = data_movies_6["cast"].replace("[]", "")
utils.validation.replace_nan_with_empty_string(data_movies_6, "cast")

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,vote_count,cast,crew,release_year,return,crew_id,crew_name,crew_gender,crew_department,crew_job
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,...,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",1995,12.451801,"[7879, 12891, 7, 12892, 12893, 12894, 12895, 1...","[John Lasseter, Joss Whedon, Andrew Stanton, J...","[2, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 2, 0, 0, 0, ...","[Directing, Writing, Writing, Writing, Writing...","[Director, Screenplay, Screenplay, Screenplay,..."
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,...,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",1995,4.043035,"[511, 876, 1729, 4945, 4951, 4952, 8023, 9967,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...","[2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2]","[Production, Writing, Sound, Directing, Editin...","[Executive Producer, Screenplay, Original Musi..."
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",1995,0.000000,"[26502, 16837, 16837, 1551320]","[Howard Deutch, Mark Steven Johnson, Mark Stev...","[2, 2, 2, 2]","[Directing, Writing, Writing, Crew]","[Director, Characters, Writer, Sound Recordist]"
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",1995,5.090760,"[2178, 5144, 5144, 21968, 70592, 111118, 11111...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...","[2, 0, 0, 2, 1, 0, 0, 0, 2, 0]","[Directing, Writing, Production, Production, P...","[Director, Screenplay, Producer, Producer, Pro..."
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,...,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",1995,0.000000,"[37, 5506, 17698, 17698, 26160, 56106, 68755]","[Alan Silvestri, Elliot Davis, Nancy Meyers, N...","[2, 2, 1, 1, 2, 2, 2]","[Sound, Camera, Writing, Production, Writing, ...","[Original Music Composer, Director of Photogra..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45427,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,en,"Yet another version of the classic epic, with ...",5.683753,"[{'name': 'Westdeutscher Rundfunk (WDR)', 'id'...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1991-05-13,...,26.0,"[{'cast_id': 1, 'character': 'Sir Robert Hode'...","[{'credit_id': '52fe44439251416c9100a899', 'de...",1991,0.000000,"[17784, 1471628, 39812, 1471628, 18073, 10716,...","[John Irvin, Sam Resnick, John McGrath, Sam Re...","[2, 0, 2, 0, 1, 0, 0, 2, 1]","[Directing, Writing, Writing, Writing, Product...","[Director, Writer, Writer, Story, Producer, Mu..."
45429,,0,"[{'id': 18, 'name': 'Drama'}]",111109,tl,An artist struggles to finish his work while a...,0.178241,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,...,3.0,"[{'cast_id': 1002, 'character': 'Sister Angela...","[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",2011,0.000000,"[1051381, 1051381, 1728582, 1051381, 1051381, ...","[Lav Diaz, Lav Diaz, Dante Perez, Lav Diaz, La...","[0, 0, 0, 0, 0, 0]","[Directing, Writing, Art, Sound, Editing, Crew]","[Director, Writer, Production Design, Music, E..."
45430,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,...,6.0,"[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",2003,0.000000,"[67753, 19713, 549355, 549356, 58818]","[Mark L. Lester, C. Courtney Joyner, Jeffrey G...","[2, 2, 2, 0, 2]","[Directing, Writing, Writing, Sound, Camera]","[Director, Screenplay, Screenplay, Original Mu..."
45431,,0,[],227506,en,"In a small town live two brothers, one a minis...",0.003503,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,...,0.0,"[{'cast_id': 2, 'character': '', 'credit_id': ...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",1917,0.000000,"[1085341, 1195656]","[Yakov Protazanov, Joseph N. Ermolieff]","[0, 2]","[Directing, Production]","[Director, Producer]"


In [132]:
utils.validation.get_column_summary(data_movies_6, "cast")

(cast
 <class 'str'>    45346
 Name: count, dtype: int64,
 'total values:',
 45346)

Checking valid python syntax

In [133]:
# Check valid python syntax for cast column
valid_cast = utils.validation.check_valid_expression(data_movies_6, "cast")
valid_cast

{'valid_count': 42997,
 'invalid_count': 2349,
 'nan_count': 0,
 'blank_count': 2349,
 'error_count': 0,
 'invalid_values': [],
 'error_values': []}

Extracting values

In [134]:
# Ectracting nested values from cast
column_name = "cast"
keys = ["id", "name", "gender", "character"]
new_columns = ["actor_id", "actor_name", "actor_gender", "actor_character"]

In [135]:
utils.validation.extract_values(data_movies_6, column_name, keys, new_columns)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,return,crew_id,crew_name,crew_gender,crew_department,crew_job,actor_id,actor_name,actor_gender,actor_character
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,...,12.451801,"[7879, 12891, 7, 12892, 12893, 12894, 12895, 1...","[John Lasseter, Joss Whedon, Andrew Stanton, J...","[2, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 2, 0, 0, 0, ...","[Directing, Writing, Writing, Writing, Writing...","[Director, Screenplay, Screenplay, Screenplay,...","[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[2, 2, 2, 2, 2, 2, 1, 0, 2, 1, 2, 1, 2]","[Woody (voice), Buzz Lightyear (voice), Mr. Po..."
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,...,4.043035,"[511, 876, 1729, 4945, 4951, 4952, 8023, 9967,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...","[2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2]","[Production, Writing, Sound, Directing, Editin...","[Executive Producer, Screenplay, Original Musi...","[2157, 8537, 205, 145151, 5149, 10739, 58563, ...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[2, 2, 1, 0, 1, 1, 2, 1, 0, 1, 2, 1, 2, 0, 0, ...","[Alan Parrish, Samuel Alan Parrish / Van Pelt,..."
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,0.000000,"[26502, 16837, 16837, 1551320]","[Howard Deutch, Mark Steven Johnson, Mark Stev...","[2, 2, 2, 2]","[Directing, Writing, Writing, Crew]","[Director, Characters, Writer, Sound Recordist]","[6837, 3151, 13567, 16757, 589, 16523, 7166]","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[2, 2, 1, 1, 1, 2, 2]","[Max Goldman, John Gustafson, Ariel Gustafson,..."
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,5.090760,"[2178, 5144, 5144, 21968, 70592, 111118, 11111...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...","[2, 0, 0, 2, 1, 0, 0, 0, 2, 0]","[Directing, Writing, Production, Production, P...","[Director, Screenplay, Producer, Producer, Pro...","[8851, 9780, 18284, 51359, 66804, 352, 87118, ...","[Whitney Houston, Angela Bassett, Loretta Devi...","[1, 1, 1, 1, 2, 2, 2, 2, 2, 2]","[Savannah 'Vannah' Jackson, Bernadine 'Bernie'..."
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,...,0.000000,"[37, 5506, 17698, 17698, 26160, 56106, 68755]","[Alan Silvestri, Elliot Davis, Nancy Meyers, N...","[2, 2, 1, 1, 2, 2, 2]","[Sound, Camera, Writing, Production, Writing, ...","[Original Music Composer, Director of Photogra...","[67773, 3092, 519, 70696, 59222, 18793, 14592,...","[Steve Martin, Diane Keaton, Martin Short, Kim...","[2, 1, 2, 1, 2, 0, 2, 2, 1, 1, 2, 1]","[George Banks, Nina Banks, Franck Eggelhoffer,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45427,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,en,"Yet another version of the classic epic, with ...",5.683753,"[{'name': 'Westdeutscher Rundfunk (WDR)', 'id'...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1991-05-13,...,0.000000,"[17784, 1471628, 39812, 1471628, 18073, 10716,...","[John Irvin, Sam Resnick, John McGrath, Sam Re...","[2, 0, 2, 0, 1, 0, 0, 2, 1]","[Directing, Writing, Writing, Writing, Product...","[Director, Writer, Writer, Story, Producer, Mu...","[29459, 139, 18616, 920, 1924]","[Patrick Bergin, Uma Thurman, David Morrissey,...","[2, 1, 2, 2, 0]","[Sir Robert Hode, Maid Marian, Little John, Si..."
45429,,0,"[{'id': 18, 'name': 'Drama'}]",111109,tl,An artist struggles to finish his work while a...,0.178241,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,...,0.000000,"[1051381, 1051381, 1728582, 1051381, 1051381, ...","[Lav Diaz, Lav Diaz, Dante Perez, Lav Diaz, La...","[0, 0, 0, 0, 0, 0]","[Directing, Writing, Art, Sound, Editing, Crew]","[Director, Writer, Production Design, Music, E...","[1043186, 111636, 1204271, 278923, 1042953, 57...","[Angel Aquino, Perry Dizon, Hazel Orencio, Joe...","[1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0]","[Sister Angela, Homer, Crazy Woman/Virgin, Ama..."
45430,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,...,0.000000,"[67753, 19713, 549355, 549356, 58818]","[Mark L. Lester, C. Courtney Joyner, Jeffrey G...","[2, 2, 2, 0, 2]","[Directing, Writing, Writing, Sound, Camera]","[Director, Screenplay, Screenplay, Original Mu...","[23764, 2059, 46277, 1736, 58646, 54649, 55270...","[Erika Eleniak, Adam Baldwin, Julie du Page, J...","[1, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0]","[Emily Shaw, Det. Mark Winston, Jayne Ferré, A..."
45431,,0,[],227506,en,"In a small town live two brothers, one a minis...",0.003503,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,...,0.000000,"[1085341, 1195656]","[Yakov Protazanov, Joseph N. Ermolieff]","[0, 2]","[Directing, Production]","[Director, Producer]","[544742, 1090923, 1136422, 1261758, 29199]","[Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...","[2, 1, 2, 0, 1]","[, , , , ]"


##### Working with "spoken languages" values

Checking valid python syntax

In [136]:
# Check valid python syntax for spoken_languages column
valid_spoken_languages = utils.validation.check_valid_expression(data_movies_5, "spoken_languages")
valid_spoken_languages

{'valid_count': 45346,
 'invalid_count': 0,
 'nan_count': 0,
 'blank_count': 0,
 'error_count': 0,
 'invalid_values': [],
 'error_values': []}

Extracting values

In [137]:
# Ectracting nested values from spoken languages
column_name = "spoken_languages"
keys = ["iso_639_1", "name"]
new_columns = ["initial_sp_languages", "sp_languages"]

In [138]:
utils.validation.extract_values(data_movies_6, column_name, keys, new_columns)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,crew_name,crew_gender,crew_department,crew_job,actor_id,actor_name,actor_gender,actor_character,initial_sp_languages,sp_languages
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,...,"[John Lasseter, Joss Whedon, Andrew Stanton, J...","[2, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 2, 0, 0, 0, ...","[Directing, Writing, Writing, Writing, Writing...","[Director, Screenplay, Screenplay, Screenplay,...","[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[2, 2, 2, 2, 2, 2, 1, 0, 2, 1, 2, 1, 2]","[Woody (voice), Buzz Lightyear (voice), Mr. Po...",[en],[English]
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,...,"[Larry J. Franco, Jonathan Hensleigh, James Ho...","[2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2]","[Production, Writing, Sound, Directing, Editin...","[Executive Producer, Screenplay, Original Musi...","[2157, 8537, 205, 145151, 5149, 10739, 58563, ...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[2, 2, 1, 0, 1, 1, 2, 1, 0, 1, 2, 1, 2, 0, 0, ...","[Alan Parrish, Samuel Alan Parrish / Van Pelt,...","[en, fr]","[English, Français]"
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,"[Howard Deutch, Mark Steven Johnson, Mark Stev...","[2, 2, 2, 2]","[Directing, Writing, Writing, Crew]","[Director, Characters, Writer, Sound Recordist]","[6837, 3151, 13567, 16757, 589, 16523, 7166]","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[2, 2, 1, 1, 1, 2, 2]","[Max Goldman, John Gustafson, Ariel Gustafson,...",[en],[English]
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,"[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...","[2, 0, 0, 2, 1, 0, 0, 0, 2, 0]","[Directing, Writing, Production, Production, P...","[Director, Screenplay, Producer, Producer, Pro...","[8851, 9780, 18284, 51359, 66804, 352, 87118, ...","[Whitney Houston, Angela Bassett, Loretta Devi...","[1, 1, 1, 1, 2, 2, 2, 2, 2, 2]","[Savannah 'Vannah' Jackson, Bernadine 'Bernie'...",[en],[English]
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,...,"[Alan Silvestri, Elliot Davis, Nancy Meyers, N...","[2, 2, 1, 1, 2, 2, 2]","[Sound, Camera, Writing, Production, Writing, ...","[Original Music Composer, Director of Photogra...","[67773, 3092, 519, 70696, 59222, 18793, 14592,...","[Steve Martin, Diane Keaton, Martin Short, Kim...","[2, 1, 2, 1, 2, 0, 2, 2, 1, 1, 2, 1]","[George Banks, Nina Banks, Franck Eggelhoffer,...",[en],[English]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45427,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,en,"Yet another version of the classic epic, with ...",5.683753,"[{'name': 'Westdeutscher Rundfunk (WDR)', 'id'...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1991-05-13,...,"[John Irvin, Sam Resnick, John McGrath, Sam Re...","[2, 0, 2, 0, 1, 0, 0, 2, 1]","[Directing, Writing, Writing, Writing, Product...","[Director, Writer, Writer, Story, Producer, Mu...","[29459, 139, 18616, 920, 1924]","[Patrick Bergin, Uma Thurman, David Morrissey,...","[2, 1, 2, 2, 0]","[Sir Robert Hode, Maid Marian, Little John, Si...",[en],[English]
45429,,0,"[{'id': 18, 'name': 'Drama'}]",111109,tl,An artist struggles to finish his work while a...,0.178241,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,...,"[Lav Diaz, Lav Diaz, Dante Perez, Lav Diaz, La...","[0, 0, 0, 0, 0, 0]","[Directing, Writing, Art, Sound, Editing, Crew]","[Director, Writer, Production Design, Music, E...","[1043186, 111636, 1204271, 278923, 1042953, 57...","[Angel Aquino, Perry Dizon, Hazel Orencio, Joe...","[1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0]","[Sister Angela, Homer, Crazy Woman/Virgin, Ama...",[tl],[]
45430,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,...,"[Mark L. Lester, C. Courtney Joyner, Jeffrey G...","[2, 2, 2, 0, 2]","[Directing, Writing, Writing, Sound, Camera]","[Director, Screenplay, Screenplay, Original Mu...","[23764, 2059, 46277, 1736, 58646, 54649, 55270...","[Erika Eleniak, Adam Baldwin, Julie du Page, J...","[1, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0]","[Emily Shaw, Det. Mark Winston, Jayne Ferré, A...",[en],[English]
45431,,0,[],227506,en,"In a small town live two brothers, one a minis...",0.003503,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,...,"[Yakov Protazanov, Joseph N. Ermolieff]","[0, 2]","[Directing, Production]","[Director, Producer]","[544742, 1090923, 1136422, 1261758, 29199]","[Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...","[2, 1, 2, 0, 1]","[, , , , ]",[],[]


##### Working with "production countries" values

Checking valid python syntax

In [139]:
# Check valid python syntax for production_counties column
valid_product_countries = utils.validation.check_valid_expression(data_movies_5, "production_countries")
valid_product_countries

{'valid_count': 45346,
 'invalid_count': 0,
 'nan_count': 0,
 'blank_count': 0,
 'error_count': 0,
 'invalid_values': [],
 'error_values': []}

Extracting values

In [140]:
# Ectracting nested values from production countries
column_name = "production_countries"
keys = ["iso_3166_1", "name"]
new_columns = ["initial_prod_countries", "prod_countries"]

In [141]:
utils.validation.extract_values(data_movies_6, column_name, keys, new_columns)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,crew_department,crew_job,actor_id,actor_name,actor_gender,actor_character,initial_sp_languages,sp_languages,initial_prod_countries,prod_countries
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,...,"[Directing, Writing, Writing, Writing, Writing...","[Director, Screenplay, Screenplay, Screenplay,...","[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[2, 2, 2, 2, 2, 2, 1, 0, 2, 1, 2, 1, 2]","[Woody (voice), Buzz Lightyear (voice), Mr. Po...",[en],[English],[US],[United States of America]
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,...,"[Production, Writing, Sound, Directing, Editin...","[Executive Producer, Screenplay, Original Musi...","[2157, 8537, 205, 145151, 5149, 10739, 58563, ...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[2, 2, 1, 0, 1, 1, 2, 1, 0, 1, 2, 1, 2, 0, 0, ...","[Alan Parrish, Samuel Alan Parrish / Van Pelt,...","[en, fr]","[English, Français]",[US],[United States of America]
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,"[Directing, Writing, Writing, Crew]","[Director, Characters, Writer, Sound Recordist]","[6837, 3151, 13567, 16757, 589, 16523, 7166]","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[2, 2, 1, 1, 1, 2, 2]","[Max Goldman, John Gustafson, Ariel Gustafson,...",[en],[English],[US],[United States of America]
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,"[Directing, Writing, Production, Production, P...","[Director, Screenplay, Producer, Producer, Pro...","[8851, 9780, 18284, 51359, 66804, 352, 87118, ...","[Whitney Houston, Angela Bassett, Loretta Devi...","[1, 1, 1, 1, 2, 2, 2, 2, 2, 2]","[Savannah 'Vannah' Jackson, Bernadine 'Bernie'...",[en],[English],[US],[United States of America]
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,...,"[Sound, Camera, Writing, Production, Writing, ...","[Original Music Composer, Director of Photogra...","[67773, 3092, 519, 70696, 59222, 18793, 14592,...","[Steve Martin, Diane Keaton, Martin Short, Kim...","[2, 1, 2, 1, 2, 0, 2, 2, 1, 1, 2, 1]","[George Banks, Nina Banks, Franck Eggelhoffer,...",[en],[English],[US],[United States of America]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45427,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,en,"Yet another version of the classic epic, with ...",5.683753,"[{'name': 'Westdeutscher Rundfunk (WDR)', 'id'...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1991-05-13,...,"[Directing, Writing, Writing, Writing, Product...","[Director, Writer, Writer, Story, Producer, Mu...","[29459, 139, 18616, 920, 1924]","[Patrick Bergin, Uma Thurman, David Morrissey,...","[2, 1, 2, 2, 0]","[Sir Robert Hode, Maid Marian, Little John, Si...",[en],[English],"[CA, DE, GB, US]","[Canada, Germany, United Kingdom, United State..."
45429,,0,"[{'id': 18, 'name': 'Drama'}]",111109,tl,An artist struggles to finish his work while a...,0.178241,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,...,"[Directing, Writing, Art, Sound, Editing, Crew]","[Director, Writer, Production Design, Music, E...","[1043186, 111636, 1204271, 278923, 1042953, 57...","[Angel Aquino, Perry Dizon, Hazel Orencio, Joe...","[1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0]","[Sister Angela, Homer, Crazy Woman/Virgin, Ama...",[tl],[],[PH],[Philippines]
45430,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,...,"[Directing, Writing, Writing, Sound, Camera]","[Director, Screenplay, Screenplay, Original Mu...","[23764, 2059, 46277, 1736, 58646, 54649, 55270...","[Erika Eleniak, Adam Baldwin, Julie du Page, J...","[1, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0]","[Emily Shaw, Det. Mark Winston, Jayne Ferré, A...",[en],[English],[US],[United States of America]
45431,,0,[],227506,en,"In a small town live two brothers, one a minis...",0.003503,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,...,"[Directing, Production]","[Director, Producer]","[544742, 1090923, 1136422, 1261758, 29199]","[Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...","[2, 1, 2, 0, 1]","[, , , , ]",[],[],[RU],[Russia]


##### Working with "production companies" values

Checking valid python syntax

In [142]:
# Check valid python syntax for production_companies column
valid_product_companies = utils.validation.check_valid_expression(data_movies_5, "production_companies")
valid_product_companies

{'valid_count': 45346,
 'invalid_count': 0,
 'nan_count': 0,
 'blank_count': 0,
 'error_count': 0,
 'invalid_values': [],
 'error_values': []}

Extracting values

In [143]:
# Ectracting nested values from production companies
column_name = "production_companies"
keys = ["id", "name"]
new_columns = ["companies_id", "companies_name"]

In [144]:
utils.validation.extract_values(data_movies_6, column_name, keys, new_columns)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,actor_id,actor_name,actor_gender,actor_character,initial_sp_languages,sp_languages,initial_prod_countries,prod_countries,companies_id,companies_name
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,...,"[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[2, 2, 2, 2, 2, 2, 1, 0, 2, 1, 2, 1, 2]","[Woody (voice), Buzz Lightyear (voice), Mr. Po...",[en],[English],[US],[United States of America],[3],[Pixar Animation Studios]
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,...,"[2157, 8537, 205, 145151, 5149, 10739, 58563, ...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[2, 2, 1, 0, 1, 1, 2, 1, 0, 1, 2, 1, 2, 0, 0, ...","[Alan Parrish, Samuel Alan Parrish / Van Pelt,...","[en, fr]","[English, Français]",[US],[United States of America],"[559, 2550, 10201]","[TriStar Pictures, Teitler Film, Interscope Co..."
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,"[6837, 3151, 13567, 16757, 589, 16523, 7166]","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[2, 2, 1, 1, 1, 2, 2]","[Max Goldman, John Gustafson, Ariel Gustafson,...",[en],[English],[US],[United States of America],"[6194, 19464]","[Warner Bros., Lancaster Gate]"
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,"[8851, 9780, 18284, 51359, 66804, 352, 87118, ...","[Whitney Houston, Angela Bassett, Loretta Devi...","[1, 1, 1, 1, 2, 2, 2, 2, 2, 2]","[Savannah 'Vannah' Jackson, Bernadine 'Bernie'...",[en],[English],[US],[United States of America],[306],[Twentieth Century Fox Film Corporation]
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,...,"[67773, 3092, 519, 70696, 59222, 18793, 14592,...","[Steve Martin, Diane Keaton, Martin Short, Kim...","[2, 1, 2, 1, 2, 0, 2, 2, 1, 1, 2, 1]","[George Banks, Nina Banks, Franck Eggelhoffer,...",[en],[English],[US],[United States of America],"[5842, 9195]","[Sandollar Productions, Touchstone Pictures]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45427,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,en,"Yet another version of the classic epic, with ...",5.683753,"[{'name': 'Westdeutscher Rundfunk (WDR)', 'id'...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1991-05-13,...,"[29459, 139, 18616, 920, 1924]","[Patrick Bergin, Uma Thurman, David Morrissey,...","[2, 1, 2, 2, 0]","[Sir Robert Hode, Maid Marian, Little John, Si...",[en],[English],"[CA, DE, GB, US]","[Canada, Germany, United Kingdom, United State...","[7025, 10163, 16323, 38978]","[Westdeutscher Rundfunk (WDR), Working Title F..."
45429,,0,"[{'id': 18, 'name': 'Drama'}]",111109,tl,An artist struggles to finish his work while a...,0.178241,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,...,"[1043186, 111636, 1204271, 278923, 1042953, 57...","[Angel Aquino, Perry Dizon, Hazel Orencio, Joe...","[1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0]","[Sister Angela, Homer, Crazy Woman/Virgin, Ama...",[tl],[],[PH],[Philippines],[19653],[Sine Olivia]
45430,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,...,"[23764, 2059, 46277, 1736, 58646, 54649, 55270...","[Erika Eleniak, Adam Baldwin, Julie du Page, J...","[1, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0]","[Emily Shaw, Det. Mark Winston, Jayne Ferré, A...",[en],[English],[US],[United States of America],[6165],[American World Pictures]
45431,,0,[],227506,en,"In a small town live two brothers, one a minis...",0.003503,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,...,"[544742, 1090923, 1136422, 1261758, 29199]","[Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...","[2, 1, 2, 0, 1]","[, , , , ]",[],[],[RU],[Russia],[88753],[Yermoliev]


##### Working with "genres" values

Checking valid python syntax

In [145]:
# Check valid python syntax for genres column
valid_genres = utils.validation.check_valid_expression(data_movies_5, "genres")
valid_genres

{'valid_count': 45346,
 'invalid_count': 0,
 'nan_count': 0,
 'blank_count': 0,
 'error_count': 0,
 'invalid_values': [],
 'error_values': []}

Extracting values

In [146]:
# Ectracting nested values from genres
column_name = "genres"
keys = ["id", "name"]
new_columns = ["movie_genres_id", "movie_genres"]

In [147]:
utils.validation.extract_values(data_movies_6, column_name, keys, new_columns)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,actor_gender,actor_character,initial_sp_languages,sp_languages,initial_prod_countries,prod_countries,companies_id,companies_name,movie_genres_id,movie_genres
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,...,"[2, 2, 2, 2, 2, 2, 1, 0, 2, 1, 2, 1, 2]","[Woody (voice), Buzz Lightyear (voice), Mr. Po...",[en],[English],[US],[United States of America],[3],[Pixar Animation Studios],"[16, 35, 10751]","[Animation, Comedy, Family]"
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,...,"[2, 2, 1, 0, 1, 1, 2, 1, 0, 1, 2, 1, 2, 0, 0, ...","[Alan Parrish, Samuel Alan Parrish / Van Pelt,...","[en, fr]","[English, Français]",[US],[United States of America],"[559, 2550, 10201]","[TriStar Pictures, Teitler Film, Interscope Co...","[12, 14, 10751]","[Adventure, Fantasy, Family]"
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,"[2, 2, 1, 1, 1, 2, 2]","[Max Goldman, John Gustafson, Ariel Gustafson,...",[en],[English],[US],[United States of America],"[6194, 19464]","[Warner Bros., Lancaster Gate]","[10749, 35]","[Romance, Comedy]"
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,"[1, 1, 1, 1, 2, 2, 2, 2, 2, 2]","[Savannah 'Vannah' Jackson, Bernadine 'Bernie'...",[en],[English],[US],[United States of America],[306],[Twentieth Century Fox Film Corporation],"[35, 18, 10749]","[Comedy, Drama, Romance]"
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,...,"[2, 1, 2, 1, 2, 0, 2, 2, 1, 1, 2, 1]","[George Banks, Nina Banks, Franck Eggelhoffer,...",[en],[English],[US],[United States of America],"[5842, 9195]","[Sandollar Productions, Touchstone Pictures]",[35],[Comedy]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45427,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,en,"Yet another version of the classic epic, with ...",5.683753,"[{'name': 'Westdeutscher Rundfunk (WDR)', 'id'...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1991-05-13,...,"[2, 1, 2, 2, 0]","[Sir Robert Hode, Maid Marian, Little John, Si...",[en],[English],"[CA, DE, GB, US]","[Canada, Germany, United Kingdom, United State...","[7025, 10163, 16323, 38978]","[Westdeutscher Rundfunk (WDR), Working Title F...","[18, 28, 10749]","[Drama, Action, Romance]"
45429,,0,"[{'id': 18, 'name': 'Drama'}]",111109,tl,An artist struggles to finish his work while a...,0.178241,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,...,"[1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0]","[Sister Angela, Homer, Crazy Woman/Virgin, Ama...",[tl],[],[PH],[Philippines],[19653],[Sine Olivia],[18],[Drama]
45430,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,...,"[1, 2, 1, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0]","[Emily Shaw, Det. Mark Winston, Jayne Ferré, A...",[en],[English],[US],[United States of America],[6165],[American World Pictures],"[28, 18, 53]","[Action, Drama, Thriller]"
45431,,0,[],227506,en,"In a small town live two brothers, one a minis...",0.003503,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,...,"[2, 1, 2, 0, 1]","[, , , , ]",[],[],[RU],[Russia],[88753],[Yermoliev],[],[]


##### Working with "belongs to collection" values

Checking valid python syntax

In [148]:
# Check valid python syntax for belongs_to_collection column
valid_collection = utils.validation.check_valid_expression(data_movies_5, "belongs_to_collection")
valid_collection

{'valid_count': 4485,
 'invalid_count': 40861,
 'nan_count': 40861,
 'blank_count': 0,
 'error_count': 0,
 'invalid_values': [],
 'error_values': []}

Extracting values

In [149]:
# Ectracting nested values from belongs to collection
column_name = "belongs_to_collection"
keys = ["id", "name"]
new_columns = ["collection_id", "collection_name"]

In [150]:
utils.validation.extract_dict_values(data_movies_6, column_name, keys, new_columns)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,initial_sp_languages,sp_languages,initial_prod_countries,prod_countries,companies_id,companies_name,movie_genres_id,movie_genres,collection_id,collection_name
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,...,[en],[English],[US],[United States of America],[3],[Pixar Animation Studios],"[16, 35, 10751]","[Animation, Comedy, Family]",[10194],[Toy Story Collection]
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,...,"[en, fr]","[English, Français]",[US],[United States of America],"[559, 2550, 10201]","[TriStar Pictures, Teitler Film, Interscope Co...","[12, 14, 10751]","[Adventure, Fantasy, Family]",,
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,[en],[English],[US],[United States of America],"[6194, 19464]","[Warner Bros., Lancaster Gate]","[10749, 35]","[Romance, Comedy]",[119050],[Grumpy Old Men Collection]
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,[en],[English],[US],[United States of America],[306],[Twentieth Century Fox Film Corporation],"[35, 18, 10749]","[Comedy, Drama, Romance]",,
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,...,[en],[English],[US],[United States of America],"[5842, 9195]","[Sandollar Productions, Touchstone Pictures]",[35],[Comedy],[96871],[Father of the Bride Collection]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45427,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,en,"Yet another version of the classic epic, with ...",5.683753,"[{'name': 'Westdeutscher Rundfunk (WDR)', 'id'...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1991-05-13,...,[en],[English],"[CA, DE, GB, US]","[Canada, Germany, United Kingdom, United State...","[7025, 10163, 16323, 38978]","[Westdeutscher Rundfunk (WDR), Working Title F...","[18, 28, 10749]","[Drama, Action, Romance]",,
45429,,0,"[{'id': 18, 'name': 'Drama'}]",111109,tl,An artist struggles to finish his work while a...,0.178241,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,...,[tl],[],[PH],[Philippines],[19653],[Sine Olivia],[18],[Drama],,
45430,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,...,[en],[English],[US],[United States of America],[6165],[American World Pictures],"[28, 18, 53]","[Action, Drama, Thriller]",,
45431,,0,[],227506,en,"In a small town live two brothers, one a minis...",0.003503,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,...,[],[],[RU],[Russia],[88753],[Yermoliev],[],[],,


#### Check the rest of the columns

##### "original languages" column

In [151]:
utils.validation.get_column_summary(data_movies_6,"original_language")

(original_language
 <class 'str'>      45335
 <class 'float'>       11
 Name: count, dtype: int64,
 'total values:',
 45346)

In [152]:
data_movies_7 = data_movies_6.copy()
dataset = data_movies_7
column = "original_language"

In [153]:
utils.validation.replace_nan_with_empty_string(dataset, column)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,initial_sp_languages,sp_languages,initial_prod_countries,prod_countries,companies_id,companies_name,movie_genres_id,movie_genres,collection_id,collection_name
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,...,[en],[English],[US],[United States of America],[3],[Pixar Animation Studios],"[16, 35, 10751]","[Animation, Comedy, Family]",[10194],[Toy Story Collection]
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,...,"[en, fr]","[English, Français]",[US],[United States of America],"[559, 2550, 10201]","[TriStar Pictures, Teitler Film, Interscope Co...","[12, 14, 10751]","[Adventure, Fantasy, Family]",,
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,[en],[English],[US],[United States of America],"[6194, 19464]","[Warner Bros., Lancaster Gate]","[10749, 35]","[Romance, Comedy]",[119050],[Grumpy Old Men Collection]
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,[en],[English],[US],[United States of America],[306],[Twentieth Century Fox Film Corporation],"[35, 18, 10749]","[Comedy, Drama, Romance]",,
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,...,[en],[English],[US],[United States of America],"[5842, 9195]","[Sandollar Productions, Touchstone Pictures]",[35],[Comedy],[96871],[Father of the Bride Collection]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45427,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,en,"Yet another version of the classic epic, with ...",5.683753,"[{'name': 'Westdeutscher Rundfunk (WDR)', 'id'...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1991-05-13,...,[en],[English],"[CA, DE, GB, US]","[Canada, Germany, United Kingdom, United State...","[7025, 10163, 16323, 38978]","[Westdeutscher Rundfunk (WDR), Working Title F...","[18, 28, 10749]","[Drama, Action, Romance]",,
45429,,0,"[{'id': 18, 'name': 'Drama'}]",111109,tl,An artist struggles to finish his work while a...,0.178241,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,...,[tl],[],[PH],[Philippines],[19653],[Sine Olivia],[18],[Drama],,
45430,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,...,[en],[English],[US],[United States of America],[6165],[American World Pictures],"[28, 18, 53]","[Action, Drama, Thriller]",,
45431,,0,[],227506,en,"In a small town live two brothers, one a minis...",0.003503,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,...,[],[],[RU],[Russia],[88753],[Yermoliev],[],[],,


In [154]:
utils.validation.get_column_summary(data_movies_7,"original_language")

(original_language
 <class 'str'>    45346
 Name: count, dtype: int64,
 'total values:',
 45346)

##### "overview" column

In [155]:
utils.validation.get_column_summary(data_movies_7,"overview")

(overview
 <class 'str'>      44405
 <class 'float'>      941
 Name: count, dtype: int64,
 'total values:',
 45346)

In [156]:
dataset = data_movies_7
column = "overview"
utils.validation.replace_nan_with_empty_string(dataset, column)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,initial_sp_languages,sp_languages,initial_prod_countries,prod_countries,companies_id,companies_name,movie_genres_id,movie_genres,collection_id,collection_name
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,...,[en],[English],[US],[United States of America],[3],[Pixar Animation Studios],"[16, 35, 10751]","[Animation, Comedy, Family]",[10194],[Toy Story Collection]
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,...,"[en, fr]","[English, Français]",[US],[United States of America],"[559, 2550, 10201]","[TriStar Pictures, Teitler Film, Interscope Co...","[12, 14, 10751]","[Adventure, Fantasy, Family]",,
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,[en],[English],[US],[United States of America],"[6194, 19464]","[Warner Bros., Lancaster Gate]","[10749, 35]","[Romance, Comedy]",[119050],[Grumpy Old Men Collection]
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,[en],[English],[US],[United States of America],[306],[Twentieth Century Fox Film Corporation],"[35, 18, 10749]","[Comedy, Drama, Romance]",,
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,...,[en],[English],[US],[United States of America],"[5842, 9195]","[Sandollar Productions, Touchstone Pictures]",[35],[Comedy],[96871],[Father of the Bride Collection]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45427,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,en,"Yet another version of the classic epic, with ...",5.683753,"[{'name': 'Westdeutscher Rundfunk (WDR)', 'id'...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1991-05-13,...,[en],[English],"[CA, DE, GB, US]","[Canada, Germany, United Kingdom, United State...","[7025, 10163, 16323, 38978]","[Westdeutscher Rundfunk (WDR), Working Title F...","[18, 28, 10749]","[Drama, Action, Romance]",,
45429,,0,"[{'id': 18, 'name': 'Drama'}]",111109,tl,An artist struggles to finish his work while a...,0.178241,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,...,[tl],[],[PH],[Philippines],[19653],[Sine Olivia],[18],[Drama],,
45430,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,...,[en],[English],[US],[United States of America],[6165],[American World Pictures],"[28, 18, 53]","[Action, Drama, Thriller]",,
45431,,0,[],227506,en,"In a small town live two brothers, one a minis...",0.003503,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,...,[],[],[RU],[Russia],[88753],[Yermoliev],[],[],,


In [157]:
utils.validation.get_column_summary(data_movies_7,"overview")

(overview
 <class 'str'>    45346
 Name: count, dtype: int64,
 'total values:',
 45346)

##### "runtime" column

In [158]:
utils.validation.get_column_summary(data_movies_7,"runtime")

(runtime
 <class 'float'>    45346
 Name: count, dtype: int64,
 'total values:',
 45346)

##### "status" column

In [159]:
utils.validation.get_column_summary(data_movies_7,"status")

(status
 <class 'str'>      45266
 <class 'float'>       80
 Name: count, dtype: int64,
 'total values:',
 45346)

In [160]:
dataset = data_movies_7
column = "status"
utils.validation.replace_nan_with_empty_string(dataset, column)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,initial_sp_languages,sp_languages,initial_prod_countries,prod_countries,companies_id,companies_name,movie_genres_id,movie_genres,collection_id,collection_name
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,...,[en],[English],[US],[United States of America],[3],[Pixar Animation Studios],"[16, 35, 10751]","[Animation, Comedy, Family]",[10194],[Toy Story Collection]
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,...,"[en, fr]","[English, Français]",[US],[United States of America],"[559, 2550, 10201]","[TriStar Pictures, Teitler Film, Interscope Co...","[12, 14, 10751]","[Adventure, Fantasy, Family]",,
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,[en],[English],[US],[United States of America],"[6194, 19464]","[Warner Bros., Lancaster Gate]","[10749, 35]","[Romance, Comedy]",[119050],[Grumpy Old Men Collection]
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,[en],[English],[US],[United States of America],[306],[Twentieth Century Fox Film Corporation],"[35, 18, 10749]","[Comedy, Drama, Romance]",,
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,...,[en],[English],[US],[United States of America],"[5842, 9195]","[Sandollar Productions, Touchstone Pictures]",[35],[Comedy],[96871],[Father of the Bride Collection]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45427,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,en,"Yet another version of the classic epic, with ...",5.683753,"[{'name': 'Westdeutscher Rundfunk (WDR)', 'id'...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1991-05-13,...,[en],[English],"[CA, DE, GB, US]","[Canada, Germany, United Kingdom, United State...","[7025, 10163, 16323, 38978]","[Westdeutscher Rundfunk (WDR), Working Title F...","[18, 28, 10749]","[Drama, Action, Romance]",,
45429,,0,"[{'id': 18, 'name': 'Drama'}]",111109,tl,An artist struggles to finish his work while a...,0.178241,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,...,[tl],[],[PH],[Philippines],[19653],[Sine Olivia],[18],[Drama],,
45430,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,...,[en],[English],[US],[United States of America],[6165],[American World Pictures],"[28, 18, 53]","[Action, Drama, Thriller]",,
45431,,0,[],227506,en,"In a small town live two brothers, one a minis...",0.003503,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,...,[],[],[RU],[Russia],[88753],[Yermoliev],[],[],,


In [161]:
utils.validation.get_column_summary(data_movies_7,"status")

(status
 <class 'str'>    45346
 Name: count, dtype: int64,
 'total values:',
 45346)

##### "tagline" column

In [162]:
utils.validation.get_column_summary(data_movies_7,"tagline")

(tagline
 <class 'float'>    24959
 <class 'str'>      20387
 Name: count, dtype: int64,
 'total values:',
 45346)

In [163]:
dataset = data_movies_7
column = "tagline"
utils.validation.replace_nan_with_empty_string(dataset, column)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,initial_sp_languages,sp_languages,initial_prod_countries,prod_countries,companies_id,companies_name,movie_genres_id,movie_genres,collection_id,collection_name
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,...,[en],[English],[US],[United States of America],[3],[Pixar Animation Studios],"[16, 35, 10751]","[Animation, Comedy, Family]",[10194],[Toy Story Collection]
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,...,"[en, fr]","[English, Français]",[US],[United States of America],"[559, 2550, 10201]","[TriStar Pictures, Teitler Film, Interscope Co...","[12, 14, 10751]","[Adventure, Fantasy, Family]",,
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,[en],[English],[US],[United States of America],"[6194, 19464]","[Warner Bros., Lancaster Gate]","[10749, 35]","[Romance, Comedy]",[119050],[Grumpy Old Men Collection]
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,[en],[English],[US],[United States of America],[306],[Twentieth Century Fox Film Corporation],"[35, 18, 10749]","[Comedy, Drama, Romance]",,
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",11862,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,...,[en],[English],[US],[United States of America],"[5842, 9195]","[Sandollar Productions, Touchstone Pictures]",[35],[Comedy],[96871],[Father of the Bride Collection]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45427,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",30840,en,"Yet another version of the classic epic, with ...",5.683753,"[{'name': 'Westdeutscher Rundfunk (WDR)', 'id'...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1991-05-13,...,[en],[English],"[CA, DE, GB, US]","[Canada, Germany, United Kingdom, United State...","[7025, 10163, 16323, 38978]","[Westdeutscher Rundfunk (WDR), Working Title F...","[18, 28, 10749]","[Drama, Action, Romance]",,
45429,,0,"[{'id': 18, 'name': 'Drama'}]",111109,tl,An artist struggles to finish his work while a...,0.178241,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,...,[tl],[],[PH],[Philippines],[19653],[Sine Olivia],[18],[Drama],,
45430,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",67758,en,"When one of her hits goes wrong, a professiona...",0.903007,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,...,[en],[English],[US],[United States of America],[6165],[American World Pictures],"[28, 18, 53]","[Action, Drama, Thriller]",,
45431,,0,[],227506,en,"In a small town live two brothers, one a minis...",0.003503,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,...,[],[],[RU],[Russia],[88753],[Yermoliev],[],[],,


In [164]:
utils.validation.get_column_summary(data_movies_7,"tagline")

(tagline
 <class 'str'>    45346
 Name: count, dtype: int64,
 'total values:',
 45346)

##### "title" column

In [165]:
utils.validation.get_column_summary(data_movies_7,"title")

(title
 <class 'str'>    45346
 Name: count, dtype: int64,
 'total values:',
 45346)

##### "vote average" column

In [166]:
utils.validation.get_column_summary(data_movies_7,"vote_average")

(vote_average
 <class 'float'>    45346
 Name: count, dtype: int64,
 'total values:',
 45346)

##### "vote count" columm

In [167]:
utils.validation.get_column_summary(data_movies_7,"vote_count")

(vote_count
 <class 'float'>    45346
 Name: count, dtype: int64,
 'total values:',
 45346)

#### Extract necesary columns for API

In [168]:
data_movies_7.columns

Index(['belongs_to_collection', 'budget', 'genres', 'id', 'original_language',
       'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'vote_average',
       'vote_count', 'cast', 'crew', 'release_year', 'return', 'crew_id',
       'crew_name', 'crew_gender', 'crew_department', 'crew_job', 'actor_id',
       'actor_name', 'actor_gender', 'actor_character', 'initial_sp_languages',
       'sp_languages', 'initial_prod_countries', 'prod_countries',
       'companies_id', 'companies_name', 'movie_genres_id', 'movie_genres',
       'collection_id', 'collection_name'],
      dtype='object')

In [169]:
# Necessary data
# id, title, release date, release year, return , revenue, budget, vote count, vote average, popularity
# actor name, crew name, crew job, status

necessary_columns = ["id", "title", "release_date", "release_year", "status", "return", "revenue", "budget", "vote_count", "vote_average", "popularity", "actor_name", "crew_name", "crew_job"]

# Create a new DataFrame with only the selected columns
api_dataframe = data_movies_7[necessary_columns]

# Save the selected DataFrame to a CSV file
api_dataframe.to_csv('../api/data/api_movies_data.csv', index=False)