### Analyze Data 

In [62]:
import os
from urllib.parse import quote

import pandas as pd
import psycopg2
import sqlalchemy

The PostgreSQL database is hosted on GCP. Before running the next cell, set the `pwd` variable to the database password and replace the host IP in the connection string.

In [63]:
# Connect to the PostgreSQL staging database.
# The password is taken from the PGPASSWORD environment variable when it is set;
# otherwise the placeholder below is used and must be edited by hand.
try:
    pwd = os.environ.get("PGPASSWORD", "xxxxxxxxx")
    # Percent-encode the password so characters such as '@', '/' or ':' cannot
    # corrupt the connection URL.
    conn = sqlalchemy.create_engine(
        "postgresql://postgres:{}@xx.xx.xx.xx:5432/postgres".format(quote(pwd, safe=""))
    )

    # Check out ONE raw DBAPI connection and reuse it. Calling raw_connection()
    # twice produces two separate pool checkouts, so the autocommit setting is not
    # guaranteed to apply to the connection that owns the cursor.
    raw_conn = conn.raw_connection()
    raw_conn.set_session(autocommit=True)

    cur = raw_conn.cursor()

except Exception as e:
    print("Error occurred while connecting to the database!")
    print(e)

Reading the staging tables from the database and storing each one in a pandas DataFrame.

In [64]:
# Load the stg_names and stg_movies tables into DataFrames, in that order.
names, movies = (pd.read_sql(table, conn) for table in ("stg_names", "stg_movies"))

In [65]:
movies.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics,created_date
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0,2022-01-10 23:07:10.042679
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0,2022-01-10 23:07:10.042679
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0,2022-01-10 23:07:10.042679
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0,2022-01-10 23:07:10.042679
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0,2022-01-10 23:07:10.042679


Finding the percentage of null values in each field

In [66]:
# Share of missing values per column, as a percentage of all rows.
s = movies.isna().sum().div(len(movies)).mul(100)
# Keep only the columns that actually contain nulls, highest percentage first.
s[s > 0].sort_values(ascending=False)

metascore                84.502941
usa_gross_income         82.148972
budget                   72.383670
worlwide_gross_income    63.873974
reviews_from_critics     13.740609
reviews_from_users        8.848640
production_company        5.188981
description               2.463456
writer                    1.830994
language                  0.970241
director                  0.101334
actors                    0.080368
country                   0.074544
dtype: float64

Finding duplicate values in movies dataframe

In [67]:
movies.duplicated(subset=["imdb_title_id"]).sum()

0

In [68]:
movies.dtypes

imdb_title_id                    object
title                            object
original_title                   object
year                             object
date_published                   object
genre                            object
duration                         object
country                          object
language                         object
director                         object
writer                           object
production_company               object
actors                           object
description                      object
avg_vote                         object
votes                            object
budget                           object
usa_gross_income                 object
worlwide_gross_income            object
metascore                        object
reviews_from_users               object
reviews_from_critics             object
created_date             datetime64[ns]
dtype: object

In [69]:
movies[["avg_vote", "votes", "budget", "usa_gross_income", "worlwide_gross_income", "metascore"]].dropna().sample(10)

Unnamed: 0,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore
68698,6.5,301891,$ 44000000,$ 6105175,$ 11782625,52.0
42233,7.5,51433,$ 30000000,$ 42070939,$ 48260279,52.0
40718,6.6,128009,$ 46000000,$ 52802140,$ 101191884,56.0
20844,7.2,32192,$ 8000000,$ 56399659,$ 56399659,61.0
29417,5.4,29771,$ 25000000,$ 4064495,$ 4064495,46.0
35625,6.6,57265,$ 50000000,$ 44737059,$ 50054511,30.0
30485,8.1,713883,$ 15000000,$ 18034458,$ 46735469,71.0
48672,6.7,96790,$ 25000000,$ 47642963,$ 68474305,65.0
56377,6.8,159018,$ 145000000,$ 216391482,$ 746921274,60.0
62528,6.0,8605,$ 1300000,$ 4404,$ 4404,41.0


Finding out whether any fields hold multiple values in a single record. If so, those values must be split out into a separate record for each value.

In [70]:
# Print every column in which at least one value, rendered as text, contains a comma.
for column_name in movies.columns:
    as_text = movies[column_name].astype(str)
    if as_text.str.contains(',').any():
        print(column_name)

title
original_title
genre
country
language
director
writer
production_company
actors
description


In [71]:
movies['imdb_title_id'].astype(str).str.contains(',', na=False).any()

False

In [72]:
# Count the rows whose title differs from original_title.
title_differs = movies['title'] != movies['original_title']
print(len(movies[title_differs]))

23372


In [73]:
movies[movies['title'] != movies['original_title']][['title', 'original_title']]

Unnamed: 0,title,original_title
12,Il calvario di una madre,Ingeborg Holm
18,Amore di madre,"Home, Sweet Home"
19,Lo studente di Praga,Der Student von Prag
39,La madonnina del porto,Tess of the Storm Country
48,Nascita di una nazione,The Birth of a Nation
...,...,...
85791,#IoSonoQui,#jesuislà
85806,Children of the Sea,Kaijû no kodomo
85808,Dreambuilders - La fabbrica dei sogni,Drømmebyggerne
85812,Falling Inn Love - Ristrutturazione con amore,Falling Inn Love


There are records whose title does not match the original_title field. This is because the title is given in a different language from the original title.

Checking whether the date_published and year fields agree. We also check for null values and whether every date_published value is in the expected yyyy-mm-dd format.

In [74]:
movies[movies['date_published'].astype(str).str.len() != len("yyyy-mm-dd")]['date_published'].sample(10)

37362    1999
26594    1992
8965     1957
45386    2004
39908    1976
12483    1966
67411    2014
35867    1959
29426    1995
64959    2012
Name: date_published, dtype: object

In [75]:
movies['date_published'].isnull().sum()

0

In [76]:
date_check = pd.to_datetime(movies['date_published'])
date_check

0       1894-10-09
1       1906-12-26
2       1911-08-19
3       1912-11-13
4       1911-03-06
           ...    
85850   2019-09-27
85851   2020-01-29
85852   2020-02-13
85853   2019-03-08
85854   2020-02-05
Name: date_published, Length: 85855, dtype: datetime64[ns]

In [77]:
# Rows where the year parsed from date_published disagrees with the year column.
year_mismatch = date_check.dt.year.ne(movies['year'].astype(int))
movies.loc[year_mismatch, ['year', 'date_published']]

Unnamed: 0,year,date_published
5,1912,1913
12,1913,1915-10-18
22,1915,1916-05-01
29,1914,1916-09-04
67,1915,1916-06-02
...,...,...
85803,2019,2020-02-12
85804,2019,2020-02-06
85807,2019,2020-10-22
85838,2019,2020-02-14


Checking whether the budget, usa_gross_income and worlwide_gross_income fields (the last column name is misspelled in the source table) can be converted to an integer data type.

In [78]:
movies[["budget", "usa_gross_income", "worlwide_gross_income"]].dropna().sample(10)

Unnamed: 0,budget,usa_gross_income,worlwide_gross_income
26544,$ 31000000,$ 9493259,$ 9493259
39297,$ 30000000,$ 80936232,$ 102992536
49379,$ 21000000,$ 28563179,$ 55033767
58055,$ 40000000,$ 20157300,$ 24740061
79643,$ 4000000,$ 1014507,$ 33262366
57510,$ 60000000,$ 105095,$ 22187813
21232,$ 12000000,$ 14000000,$ 14000000
76748,EUR 11270000,$ 77987,$ 2842040
14057,$ 25000000,$ 400881,$ 400881
41164,$ 7000000,$ 196067,$ 282519


Listing the different currencies that appear besides USD.

In [79]:
# Budget values that are present and do not start with the '$' sign.
budget_as_text = movies['budget'].astype(str)
non_usd_currency = movies.loc[~budget_as_text.str.startswith('$'), 'budget'].dropna()

In [80]:
# Distinct three-character prefixes of the non-USD budget values.
list_of_curr = list(non_usd_currency.astype(str).str[:3].unique())

In [81]:
len(list_of_curr)

78

In [82]:
# Same lookup for usa_gross_income; append its three-character prefixes to the list.
usa_gross_as_text = movies['usa_gross_income'].astype(str)
non_usd_currency = movies.loc[~usa_gross_as_text.str.startswith('$'), 'usa_gross_income'].dropna()
list_of_curr += non_usd_currency.astype(str).str[:3].unique().tolist()

In [83]:
len(list_of_curr)

81

In [84]:
# Same lookup for worlwide_gross_income; append its three-character prefixes to the list.
worldwide_gross_as_text = movies['worlwide_gross_income'].astype(str)
non_usd_currency = movies.loc[~worldwide_gross_as_text.str.startswith('$'), 'worlwide_gross_income'].dropna()
list_of_curr += non_usd_currency.astype(str).str[:3].unique().tolist()

In [85]:
len(list_of_curr)

85

In [86]:
set_of_curr = set(list_of_curr)
print(len(set_of_curr))

78


In [87]:
names.head()

Unnamed: 0,imdb_name_id,name,birth_name,height,bio,birth_details,date_of_birth,place_of_birth,death_details,date_of_death,place_of_death,reason_of_death,spouses_string,spouses,divorces,spouses_with_children,children,created_date
0,nm0000001,Fred Astaire,Frederic Austerlitz Jr.,177.0,"Fred Astaire was born in Omaha, Nebraska, to J...","May 10, 1899 in Omaha, Nebraska, USA",1899-05-10,"Omaha, Nebraska, USA","June 22, 1987 in Los Angeles, California, USA ...",1987-06-22,"Los Angeles, California, USA",pneumonia,Robyn Smith (27 June 1980 - 22 June 1987) (hi...,2,0,1,2,2022-01-10 23:01:20.370673
1,nm0000002,Lauren Bacall,Betty Joan Perske,174.0,Lauren Bacall was born Betty Joan Perske on Se...,"September 16, 1924 in The Bronx, New York City...",1924-09-16,"The Bronx, New York City, New York, USA","August 12, 2014 in New York City, New York, US...",2014-08-12,"New York City, New York, USA",stroke,Jason Robards (4 July 1961 - 10 September 196...,2,1,2,3,2022-01-10 23:01:20.370673
2,nm0000003,Brigitte Bardot,Brigitte Bardot,166.0,"Brigitte Bardot was born on September 28, 1934...","September 28, 1934 in Paris, France",1934-09-28,"Paris, France",,,,,Bernard d'Ormale (16 August 1992 - present)\n...,4,3,1,1,2022-01-10 23:01:20.370673
3,nm0000004,John Belushi,John Adam Belushi,170.0,"John Belushi was born in Chicago, Illinois, US...","January 24, 1949 in Chicago, Illinois, USA",1949-01-24,"Chicago, Illinois, USA","March 5, 1982 in Hollywood, Los Angeles, Calif...",1982-03-05,"Hollywood, Los Angeles, California, USA",acute cocaine and heroin intoxication,Judith Belushi-Pisano (31 December 1976 - 5 M...,1,0,0,0,2022-01-10 23:01:20.370673
4,nm0000005,Ingmar Bergman,Ernst Ingmar Bergman,179.0,"Ernst Ingmar Bergman was born July 14, 1918, t...","July 14, 1918 in Uppsala, Uppsala län, Sweden",1918-07-14,"Uppsala, Uppsala län, Sweden","July 30, 2007 in Fårö, Gotlands län, Sweden (...",2007-07-30,"Fårö, Gotlands län, Sweden",natural causes,Ingrid Bergman (11 November 1971 - 20 May 199...,5,4,5,8,2022-01-10 23:01:20.370673


Checking for null values in names dataframe. 

In [88]:
# Share of missing values per column of names, as a percentage of all rows.
null_names_pct = names.isna().sum().div(len(names)).mul(100)
# Keep only the columns that actually contain nulls, highest percentage first.
null_names_pct[null_names_pct > 0].sort_values(ascending=False)

reason_of_death    92.377018
place_of_death     87.558825
death_details      86.586386
date_of_death      86.586386
height             84.991518
spouses_string     84.766128
place_of_birth     65.068776
birth_details      62.845098
date_of_birth      62.845098
bio                31.241330
dtype: float64

In [89]:
names.dtypes

imdb_name_id                     object
name                             object
birth_name                       object
height                           object
bio                              object
birth_details                    object
date_of_birth                    object
place_of_birth                   object
death_details                    object
date_of_death                    object
place_of_death                   object
reason_of_death                  object
spouses_string                   object
spouses                          object
divorces                         object
spouses_with_children            object
children                         object
created_date             datetime64[ns]
dtype: object

Finding if there are any duplicated values present. 

In [90]:
print(names.duplicated(subset=['imdb_name_id']).any())

False


Finding fields that have multiple values stored in a single record.

In [91]:
# Print every column of names in which at least one value, rendered as text, contains a comma.
for column_name in names.columns:
    as_text = names[column_name].astype(str)
    if as_text.str.contains(',').any():
        print(column_name)

name
birth_name
bio
birth_details
date_of_birth
place_of_birth
death_details
date_of_death
place_of_death
reason_of_death
spouses_string


In [92]:
names[names['date_of_birth'].astype(str).str.contains(',')][['place_of_birth','date_of_birth','place_of_death','date_of_death']].dropna().sample(20)

Unnamed: 0,place_of_birth,date_of_birth,place_of_death,date_of_death
42464,"Madrid, Spain","1885 in Madrid, Spain","Los Angeles, California, USA",1929-02-08
219053,"Broome, Western Australia, Australia","1948 in Broome, Western Australia, Australia","Broome, Western Australia, Australia",2017-06-26
128615,"Salamina, Greece","1921 in Salamina, Greece","Athens, Greece",2002-01-11
105980,"Almería, Spain","1919 in Almería, Spain","Madrid, Spain",1993-04-24
99553,"Fortaleza, Ceará, Brazil","1934 in Fortaleza, Ceará, Brazil","Rio de Janeiro, Rio de Janeiro, Brazil",2011-01-31
158477,"São José do Rio Preto, São Paulo, Brazil","1966 in São José do Rio Preto, São Paulo, Brazil","São Paulo, São Paulo, Brazil",2010-05-18
13153,"England, UK","1862 in England, UK","Kyoto, Japan",1931-01-03
25033,"Morelia, Michoacan, Mexico","1900 in Morelia, Michoacan, Mexico","Mexico, D.F., Mexico","1978 in Mexico, D.F., Mexico"
17428,"Cincinnati, Ohio, USA","1906 in Cincinnati, Ohio, USA","New York City, New York, USA",1954-09-24
125734,"Boston, Massachusetts, USA","1944 in Boston, Massachusetts, USA","Woodland Hills, California, USA",2014-12-11


Found that some date_of_birth and date_of_death values contain a ',' (for example "1885 in Madrid, Spain"). This shows that these fields need to be transformed.

In [93]:
# Collect the individual director names from movies whose full director field
# (title-cased) is not found among the title-cased, stripped names in `names`.
known_names = {person.title().strip() for person in names['name'].to_list()}
director_is_known = movies['director'].str.title().isin(known_names)

# Drop null directors explicitly instead of hiding every error behind a bare
# `except: pass`; a null value is the only case that cannot be split and iterated.
unmatched_directors = movies.loc[~director_is_known, 'director'].dropna()

names_from_comma = set()
for director_parts in unmatched_directors.str.split(','):
    # Normalise each comma-separated part the same way as the reference names.
    names_from_comma.update(part.title().strip() for part in director_parts)

Finding the list of names that are not present in the names dataframe but are present in the movies dataframe under the actors, director and writer fields.

In [94]:
# Director names that have no title-cased, stripped match in names['name'].
names_diff_dir = names_from_comma.difference(person.title().strip() for person in names['name'])

In [95]:
print(names_diff_dir)

{'Ewelina Lukaszewska'}


In [96]:
name_list = names['name'].to_list()
# Writer fields (non-null) whose title-cased text is not itself a known name.
normalized_names = [person.title().strip() for person in name_list]
writer_is_known = movies['writer'].str.title().isin(normalized_names)
writer_names = movies.loc[~writer_is_known, 'writer'].dropna()
writer_names

2        Urban Gad, Gebhard Schätzler-Perasini
6                    Norbert Falk, Hanns Kräly
7          Henryk Sienkiewicz, Enrico Guazzoni
8             Aristide Demetriade, Petre Liciu
9             James Keane, William Shakespeare
                         ...                  
85847               Michael Wright, Tom George
85851    Alexandre Coquelle, Matthieu Le Naour
85852             Radek Bajgar, Herman Finkers
85853         Vineesh Aaradya, Vineesh Aaradya
85854                     Coral Cruz, Pep Puig
Name: writer, Length: 55242, dtype: object

In [97]:
writer_name_set = set()
for idx, writer in writer_names.str.split(',').items():
    names_list = [x.title().strip() for x in writer]
    writer_name_set.update(names_list)
    
print(len(writer_name_set))

51482


In [98]:
# writer_name_set = set()
# for writer in writer_names:
#     names_list = [x.title().strip() for x in writer.split(',')]
#     writer_name_set.update(names_list)
    
# print(writer_name_set)

In [99]:
# Writer names that have no title-cased, stripped match in name_list.
name_diff_wir = writer_name_set.difference(person.title().strip() for person in name_list)

In [100]:
# Union of the unmatched director and writer names (a new set; the inputs are untouched).
name_diff = names_diff_dir | name_diff_wir
len(name_diff)

87

In [101]:
# Actor fields (non-null, stripped) whose title-cased text is not itself a known name.
normalized_names = [person.title().strip() for person in name_list]
actors_is_known = movies['actors'].str.title().isin(normalized_names)
actor_names = movies.loc[~actors_is_known, 'actors'].dropna().str.strip().to_list()
type(actor_names)

list

In [102]:
actor_names_comma = set()
for actor in actor_names:
    actor_list = [x.title().strip() for x in actor.split(',')]
    actor_names_comma.update(actor_list)
    
len(actor_names_comma)

417223

In [103]:
# Actor names with no match in name_list, then merged into the running name_diff set.
name_diff_act = actor_names_comma.difference(person.title().strip() for person in name_list)
print(len(name_diff_act))
name_diff |= name_diff_act
print(len(name_diff))

270445
270520


In [104]:
ratings = pd.read_sql("stg_ratings", conn)

In [105]:
ratings.head()

Unnamed: 0,imdb_title_id,weighted_average_vote,total_votes,mean_vote,median_vote,votes_10,votes_9,votes_8,votes_7,votes_6,...,females_30age_votes,females_45age_avg_vote,females_45age_votes,top1000_voters_rating,top1000_voters_votes,us_voters_rating,us_voters_votes,non_us_voters_rating,non_us_voters_votes,created_date
0,tt0000009,5.9,154,5.9,6.0,12,4,10,43,28,...,13.0,4.5,4.0,5.7,34.0,6.4,51.0,6.0,70.0,2022-01-10 23:03:36.188320
1,tt0000574,6.1,589,6.3,6.0,57,18,58,137,139,...,23.0,6.6,14.0,6.4,66.0,6.0,96.0,6.2,331.0,2022-01-10 23:03:36.188320
2,tt0001892,5.8,188,6.0,6.0,6,6,17,44,52,...,4.0,6.8,7.0,5.4,32.0,6.2,31.0,5.9,123.0,2022-01-10 23:03:36.188320
3,tt0002101,5.2,446,5.3,5.0,15,8,16,62,98,...,14.0,6.1,21.0,4.9,57.0,5.5,207.0,4.7,105.0,2022-01-10 23:03:36.188320
4,tt0002130,7.0,2237,6.9,7.0,210,225,436,641,344,...,82.0,7.4,77.0,6.9,139.0,7.0,488.0,7.0,1166.0,2022-01-10 23:03:36.188320


Finding null values.

In [106]:
# Share of missing values per column of ratings, as a percentage of all rows.
rating_pct = ratings.isna().sum().div(len(ratings)).mul(100)
# Keep only the columns that actually contain nulls, highest percentage first.
rating_pct[rating_pct > 0].sort_values(ascending=False)

females_0age_votes           74.239124
females_0age_avg_vote        74.239124
males_0age_votes             68.072914
males_0age_avg_vote          68.072914
allgenders_0age_votes        61.144954
allgenders_0age_avg_vote     61.144954
females_18age_votes           7.595364
females_18age_avg_vote        7.595364
females_45age_votes           3.258983
females_45age_avg_vote        3.258983
males_18age_avg_vote          1.706365
males_18age_votes             1.706365
females_30age_votes           1.099528
females_30age_avg_vote        1.099528
allgenders_18age_votes        0.822317
allgenders_18age_avg_vote     0.822317
top1000_voters_votes          0.790868
top1000_voters_rating         0.790868
us_voters_votes               0.243434
us_voters_rating              0.243434
males_45age_votes             0.117640
males_45age_avg_vote          0.117640
females_allages_votes         0.094345
females_allages_avg_vote      0.094345
allgenders_45age_votes        0.093180
allgenders_45age_avg_vote

In [107]:
ratings.dtypes

imdb_title_id                        object
weighted_average_vote                object
total_votes                          object
mean_vote                            object
median_vote                          object
votes_10                             object
votes_9                              object
votes_8                              object
votes_7                              object
votes_6                              object
votes_5                              object
votes_4                              object
votes_3                              object
votes_2                              object
votes_1                              object
allgenders_0age_avg_vote             object
allgenders_0age_votes                object
allgenders_18age_avg_vote            object
allgenders_18age_votes               object
allgenders_30age_avg_vote            object
allgenders_30age_votes               object
allgenders_45age_avg_vote            object
allgenders_45age_votes          

Finding duplicate values

In [108]:
ratings.duplicated(subset=['imdb_title_id']).sum()

0

Getting unique country names

In [109]:
movies['country'].unique()

array(['USA', 'Australia', 'Germany, Denmark', ...,
       'Iran, France, Germany, Switzerland, Luxembourg, Lebanon',
       'Spain, Portugal, Argentina', 'India, Netherlands, Singapore'],
      dtype=object)

Checking if all the votes add up to total_votes field

In [110]:
# Rows where total_votes differs from the sum of the per-score counts votes_1 .. votes_10.
vote_columns = ["votes_{}".format(score) for score in range(1, 11)]
summed_votes = ratings[vote_columns].astype(int).sum(axis=1)
ratings.loc[ratings["total_votes"].astype(int) != summed_votes]

Unnamed: 0,imdb_title_id,weighted_average_vote,total_votes,mean_vote,median_vote,votes_10,votes_9,votes_8,votes_7,votes_6,...,females_30age_votes,females_45age_avg_vote,females_45age_votes,top1000_voters_rating,top1000_voters_votes,us_voters_rating,us_voters_votes,non_us_voters_rating,non_us_voters_votes,created_date


In [111]:
title_principals = pd.read_sql('stg_title_principals', conn)

In [112]:
title_principals.head()

Unnamed: 0,imdb_title_id,ordering,imdb_name_id,category,job,characters,created_date
0,tt0000009,1,nm0063086,actress,,"[""Miss Geraldine Holbrook (Miss Jerry)""]",2022-01-10 23:04:46.555751
1,tt0000009,2,nm0183823,actor,,"[""Mr. Hamilton""]",2022-01-10 23:04:46.555751
2,tt0000009,3,nm1309758,actor,,"[""Chauncey Depew - the Director of the New Yor...",2022-01-10 23:04:46.555751
3,tt0000009,4,nm0085156,director,,,2022-01-10 23:04:46.555751
4,tt0000574,1,nm0846887,actress,,"[""Kate Kelly""]",2022-01-10 23:04:46.555751


Checking for null values.

In [113]:
# Share of missing values per column of title_principals, as a percentage of all rows.
s = title_principals.isna().sum().div(len(title_principals)).mul(100)
# Keep only the columns that actually contain nulls, highest percentage first.
s[s > 0].sort_values(ascending=False)

job           74.538876
characters    59.206380
dtype: float64

In [114]:
# Frequency of each non-null job value.
title_principals["job"].dropna().value_counts()

producer                                                     97837
screenplay                                                   23953
director of photography                                      10251
written by                                                   10234
story                                                         9511
                                                             ...  
novel "Heaven Has No Favorites"                                  1
novel "Towards Zero"                                             1
based on the play Jeanne d'Arc by                                1
from the famous story by                                         1
novel "La dame dans l'auto avec des lunettes et un fusil"        1
Name: job, Length: 8871, dtype: int64

In [115]:
# Frequency of each non-null characters value.
title_principals["characters"].dropna().value_counts()

["Self"]                        720
["Anna"]                        539
["Alex"]                        460
["David"]                       455
["Sam"]                         415
                               ... 
["Garabon","Graeme"]              1
["Amy Ryan"]                      1
["Pyotr Nikolayevich"]            1
["Brigadier Shamsher Singh"]      1
["Tadas Blinda"]                  1
Name: characters, Length: 212899, dtype: int64

Checking for the count of duplicate values

In [116]:
title_principals.duplicated(subset=["imdb_title_id", "imdb_name_id", "category", "job", "characters"]).sum()

15

Checking for the list of fields that contains multiple values

In [117]:
# Print every column of title_principals in which at least one value, rendered as text, contains a comma.
for column_name in title_principals.columns:
    as_text = title_principals[column_name].astype(str)
    if as_text.str.contains(",", na=False).any():
        print(column_name)

job
characters


In [118]:
# First few job values that contain a comma.
job_has_comma = title_principals["job"].astype(str).str.contains(",", na=False)
title_principals.loc[job_has_comma, "job"].head()

208     stories The Pit and the Pendulum, The Tell-Tal...
483                       Picturized By, i.e., screenplay
962                stories Raffles, the Amateur Cracksman
964                 play "Raffles, the Amateur Cracksman"
1418                        story "Kitty, Mind Your Feet"
Name: job, dtype: object

In [119]:
# characters values that contain a comma; preview five at random.
characters_has_comma = title_principals["characters"].astype(str).str.contains(",", na=False)
ser_characters = title_principals.loc[characters_has_comma, "characters"]
ser_characters.sample(5)

97942             ["Mr. Cogez, the Miller"]
495822     ["The Narrator","The Navy Suit"]
759060            ["Peter","serial killer"]
205717      ["Old Devil","Master Ruthless"]
106698    ["Sister Malgorzata","Margareth"]
Name: characters, dtype: object

In [120]:
# Title ids referenced by title_principals that do not exist in movies.
set(title_principals["imdb_title_id"]).difference(movies["imdb_title_id"])

{'tt1860336', 'tt2082513'}

In [121]:
# Name ids referenced by title_principals that do not exist in names.
set(title_principals["imdb_name_id"]).difference(names["imdb_name_id"])

{'nm7638113'}