# Exploratory Data Analysis

## Import

In [2]:
import pandas as pd
#import numpy as np
#import matplotlib.pyplot as plt
#import seaborn as sns
import sys
import os
#from rapidfuzz import fuzz, process
import unidecode
#import requests
import json

## Raw Data

In [3]:
# Location of the raw movie CSV, relative to this notebook.
RAW_MOVIES_CSV = "../data/raw/TMDB  IMDB Movies Dataset.csv"

# Read the whole file into memory, then print the schema
# (column names, dtypes and non-null counts).
df_movies = pd.read_csv(RAW_MOVIES_CSV)
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435632 entries, 0 to 435631
Data columns (total 29 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    435632 non-null  int64  
 1   title                 435632 non-null  object 
 2   vote_average          435632 non-null  float64
 3   vote_count            435632 non-null  int64  
 4   status                435632 non-null  object 
 5   release_date          415986 non-null  object 
 6   revenue               435632 non-null  int64  
 7   runtime               435632 non-null  int64  
 8   adult                 435632 non-null  bool   
 9   backdrop_path         184660 non-null  object 
 10  budget                435632 non-null  int64  
 11  homepage              54450 non-null   object 
 12  tconst                435632 non-null  object 
 13  original_language     435632 non-null  object 
 14  original_title        435632 non-null  object 
 15  

## Checking the relevant data

- Deletando colunas que não serão relevantes
- Convertendo release_date em datetime
- Removendo filmes não publicados e filmes adultos.
- Depois de filtrado, removendo as colunas status e adult

In [4]:
# Columns that are not needed for this analysis.
unused_columns = [
    'backdrop_path', 'homepage', 'poster_path', 'original_title',
    'overview', 'tagline', 'tconst', 'keywords',
]

# Build the cleaned frame as one non-mutating chain (no `inplace=True`), so
# every step works on a fresh object instead of modifying a filtered frame:
#   1. drop the unused columns,
#   2. parse release_date into datetime64,
#   3. keep only released, non-adult films,
#   4. drop the two columns that were only needed for step 3.
df_movies = (
    df_movies
    .drop(columns=unused_columns)
    .assign(release_date=lambda frame: pd.to_datetime(frame['release_date']))
    # `adult` is loaded as a bool column, so `~` negates it directly
    # (replaces the `== False` comparison).
    .loc[lambda frame: (frame['status'] == 'Released') & ~frame['adult']]
    .drop(columns=['status', 'adult'])
)

In [5]:
# Print the schema again (columns, dtypes, non-null counts) to check the
# result of the column drops and row filters applied in the previous cell.
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 420097 entries, 0 to 435631
Data columns (total 19 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   id                    420097 non-null  int64         
 1   title                 420097 non-null  object        
 2   vote_average          420097 non-null  float64       
 3   vote_count            420097 non-null  int64         
 4   release_date          402318 non-null  datetime64[ns]
 5   revenue               420097 non-null  int64         
 6   runtime               420097 non-null  int64         
 7   budget                420097 non-null  int64         
 8   original_language     420097 non-null  object        
 9   popularity            420097 non-null  float64       
 10  genres                350407 non-null  object        
 11  production_companies  251005 non-null  object        
 12  production_countries  310304 non-null  object        
 13  spok

## Null Values and Filtering

- Existem muitos filmes com dados incompletos
- Queremos apenas os filmes mais conhecidos para nossa amostragem
- Primeiro, irei dropar filmes que possuem mais de 4 campos nulos
- Também irei deletar entradas duplicadas, usando o 'id' como chave

In [6]:
# A row may have at most this many missing fields before it is discarded.
max_missing_fields = 4

# `thresh` is the minimum number of non-null values a row needs in order to
# be kept, so "columns - 4" keeps rows with up to 4 missing fields.
df_movies = df_movies.dropna(thresh=df_movies.shape[1] - max_missing_fields)

# Keep only the first row for each `id`.
df_movies = df_movies.drop_duplicates(subset='id', keep='first')

# The duplicate count used to be a bare mid-cell expression whose value was
# never displayed; assert the invariant so a regression cannot go unnoticed.
assert df_movies['id'].is_unique, "duplicate ids remain after drop_duplicates"

# Remaining missing values per column (displayed as the cell output).
df_movies.isnull().sum()

id                           0
title                        0
vote_average                 0
vote_count                   0
release_date              3207
revenue                      0
runtime                      0
budget                       0
original_language            0
popularity                   0
genres                   38897
production_companies    134186
production_countries     75276
spoken_languages         65773
directors                 6090
writers                  45918
averageRating                0
numVotes                     0
cast                     43722
dtype: int64

- Observações:
> - Filmes relevantes podem estar com o campo revenue e/ou budget preenchido com 0.
> - É necessário, antes de tratar os campos nulos, filtrar os dados relevantes para a análise.

- Para iniciar a filtragem, irei começar removendo filmes com 0 de popularidade, uma vez que poderão distorcer as métricas avaliadas.
- Também removerei filmes com 0 em 'vote_count'

In [7]:
# Keep only films that have a non-zero popularity AND a non-zero vote_count.
has_popularity = df_movies['popularity'].ne(0)
has_votes = df_movies['vote_count'].ne(0)

df_movies = df_movies[has_popularity & has_votes]
df_movies

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,directors,writers,averageRating,numVotes,cast
0,27205,Inception,8.364,34495,2010-07-15,825532764,148,160000000,en,83.952,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili",Christopher Nolan,Christopher Nolan,8.8,2738571,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W..."
1,157336,Interstellar,8.417,32571,2014-11-05,701729206,169,165000000,en,140.241,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,Christopher Nolan,"Jonathan Nolan, Christopher Nolan",8.7,2416234,"Matthew McConaughey, Anne Hathaway, Michael Ca..."
2,155,The Dark Knight,8.512,30619,2008-07-16,1004558444,152,185000000,en,130.643,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...",9.1,3083369,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."
3,19995,Avatar,7.573,29815,2009-12-15,2923706026,162,237000000,en,79.932,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish",James Cameron,James Cameron,7.9,1448852,"Sam Worthington, Zoe Saldaña, Sigourney Weaver..."
4,24428,The Avengers,7.710,29166,2012-04-25,1518815515,143,220000000,en,98.082,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian",Joss Whedon,"Joss Whedon, Zak Penn",8.0,1529810,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269390,420218,Notes for a Film on Jazz,8.000,1,1965-12-31,0,35,0,en,0.600,Documentary,,Italy,,Gianni Amico,Gianni Amico,7.4,6,"Annie Ross, Pony Poindexter, Johnny Griffin"
269391,350035,Closer,5.000,1,2014-01-25,0,37,0,sv,2.030,Drama,,Sweden,Swedish,Rasmus Lodenius,Rasmus Lodenius,6.6,40,"Natalie Minnevik, Anastasios Soulis, Hanna Ull..."
269392,417475,Farm Hands,6.000,1,1943-06-19,0,11,0,en,0.600,Comedy,,,,Bert Glazer,"Hal Law, Robert A. McGowan",5.2,118,
269393,349928,You Know What? It's a Secret,5.000,1,1990-07-14,0,109,0,ko,1.351,Drama,,South Korea,Korean,Geum-hwan Jo,Jeong-jin Kim,5.6,14,"Choi Soo-jong, Ha Hee-ra, Lee Kyung-young, Kim..."


- Estou cautelosamente procurando por um mínimo de votos (nas métricas do IMDb e do TMDB).

In [8]:
# Minimum-votes cut: a film is discarded only when it falls below the floor
# on BOTH counters (vote_count < 40 and numVotes < 90). A film that clears
# either floor is kept.
# A slightly higher pair of floors (45 / 95) was inspected here earlier,
# sorted by revenue, as an exploratory query.
low_vote_count = df_movies['vote_count'].lt(40)
low_num_votes = df_movies['numVotes'].lt(90)

df_movies = df_movies.loc[~(low_vote_count & low_num_votes)]
df_movies

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,directors,writers,averageRating,numVotes,cast
0,27205,Inception,8.364,34495,2010-07-15,825532764,148,160000000,en,83.952,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili",Christopher Nolan,Christopher Nolan,8.8,2738571,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W..."
1,157336,Interstellar,8.417,32571,2014-11-05,701729206,169,165000000,en,140.241,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,Christopher Nolan,"Jonathan Nolan, Christopher Nolan",8.7,2416234,"Matthew McConaughey, Anne Hathaway, Michael Ca..."
2,155,The Dark Knight,8.512,30619,2008-07-16,1004558444,152,185000000,en,130.643,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...",9.1,3083369,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."
3,19995,Avatar,7.573,29815,2009-12-15,2923706026,162,237000000,en,79.932,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish",James Cameron,James Cameron,7.9,1448852,"Sam Worthington, Zoe Saldaña, Sigourney Weaver..."
4,24428,The Avengers,7.710,29166,2012-04-25,1518815515,143,220000000,en,98.082,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian",Joss Whedon,"Joss Whedon, Zak Penn",8.0,1529810,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269374,69439,Arasatchi,6.000,1,2004-09-22,0,159,0,ta,2.160,"Action, Crime, Drama",Cee TV Entertainment,India,Tamil,N. Maharajan,Gopal Ram,6.4,94,"Arjun Sarja, Lara Dutta, Raghuvaran, Vivek, Ri..."
269385,185158,The Grace Lee Project,1.000,1,2005-03-11,5965,68,0,en,1.045,Documentary,,United States of America,English,Grace Lee,,7.3,147,
269388,185368,Hello There,7.000,1,1995-04-12,0,4,0,en,0.706,,,,,Louis C.K.,"Louis C.K., Ron Lynch",6.7,119,"Ron Lynch, Gilda Conrad, Richard Abernathy, Ca..."
269392,417475,Farm Hands,6.000,1,1943-06-19,0,11,0,en,0.600,Comedy,,,,Bert Glazer,"Hal Law, Robert A. McGowan",5.2,118,


 - Encontrei que 40 para vote_count e 90 para numVotes foi o sweet spot para o valor mínimo.
 - O próximo afunilamento removerá filmes com numVotes e popularity baixos que não possuem informações de revenue e budget.

In [9]:
# Low-visibility films: popularity below 2.75 and fewer than 120 numVotes.
low_visibility = df_movies['popularity'].lt(2.75) & df_movies['numVotes'].lt(120)

# No financial information at all: revenue and budget are both 0.
no_financials = df_movies['revenue'].eq(0) & df_movies['budget'].eq(0)

# Remove only the rows that match both conditions.
df_movies = df_movies.loc[~(low_visibility & no_financials)]
df_movies

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,directors,writers,averageRating,numVotes,cast
0,27205,Inception,8.364,34495,2010-07-15,825532764,148,160000000,en,83.952,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili",Christopher Nolan,Christopher Nolan,8.8,2738571,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W..."
1,157336,Interstellar,8.417,32571,2014-11-05,701729206,169,165000000,en,140.241,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,Christopher Nolan,"Jonathan Nolan, Christopher Nolan",8.7,2416234,"Matthew McConaughey, Anne Hathaway, Michael Ca..."
2,155,The Dark Knight,8.512,30619,2008-07-16,1004558444,152,185000000,en,130.643,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...",9.1,3083369,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."
3,19995,Avatar,7.573,29815,2009-12-15,2923706026,162,237000000,en,79.932,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish",James Cameron,James Cameron,7.9,1448852,"Sam Worthington, Zoe Saldaña, Sigourney Weaver..."
4,24428,The Avengers,7.710,29166,2012-04-25,1518815515,143,220000000,en,98.082,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian",Joss Whedon,"Joss Whedon, Zak Penn",8.0,1529810,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269365,182869,In the Money,7.000,1,1958-02-16,0,61,0,en,1.383,Comedy,Allied Artists,United States of America,English,William Beaudine,"Al Martin, Elwood Ullman",5.5,219,"Huntz Hall, Stanley Clements, Patricia Donahue..."
269366,215302,"What Next, Corporal Hargrove?",2.000,1,1945-11-21,0,95,0,en,2.166,"Comedy, War",Metro-Goldwyn-Mayer,United States of America,English,Richard Thorpe,"Marion Hargrove, Harry Kurnitz",5.6,317,"Robert Walker, Keenan Wynn, Jean Porter, Chill..."
269368,182895,Embraceable You,6.000,1,1948-08-21,0,80,0,en,1.626,"Drama, Romance, Thriller",Warner Bros. Pictures,United States of America,English,Felix Jacoves,"Edna Anhalt, Dietrich V. Hannekin, Aleck Block",6.5,321,"Dane Clark, Geraldine Brooks, S.Z. Sakall, Wal..."
269372,65484,Una notte blu cobalto,10.000,1,2010-01-01,0,0,0,it,0.627,,,,,Daniele Gangemi,"Corrado Fortuna, Daniele Gangemi, Carla Marcia...",4.9,309,"Corrado Fortuna, Regina Orioli, Valentina Carn..."


- Removendo todas as entradas antes de 1990 que não possuem informação de revenue e budget

In [10]:
# Films released before 1990-01-01. Rows whose release_date is missing
# compare as False here, so they are not removed by this step.
released_before_1990 = df_movies['release_date'] < '1990-01-01'

# No financial information at all: revenue and budget are both 0.
no_financials = df_movies['revenue'].eq(0) & df_movies['budget'].eq(0)

# Remove the old films that carry no financial data.
df_movies = df_movies.loc[~(released_before_1990 & no_financials)]
df_movies

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,directors,writers,averageRating,numVotes,cast
0,27205,Inception,8.364,34495,2010-07-15,825532764,148,160000000,en,83.952,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili",Christopher Nolan,Christopher Nolan,8.8,2738571,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W..."
1,157336,Interstellar,8.417,32571,2014-11-05,701729206,169,165000000,en,140.241,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,Christopher Nolan,"Jonathan Nolan, Christopher Nolan",8.7,2416234,"Matthew McConaughey, Anne Hathaway, Michael Ca..."
2,155,The Dark Knight,8.512,30619,2008-07-16,1004558444,152,185000000,en,130.643,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...",9.1,3083369,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."
3,19995,Avatar,7.573,29815,2009-12-15,2923706026,162,237000000,en,79.932,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish",James Cameron,James Cameron,7.9,1448852,"Sam Worthington, Zoe Saldaña, Sigourney Weaver..."
4,24428,The Avengers,7.710,29166,2012-04-25,1518815515,143,220000000,en,98.082,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian",Joss Whedon,"Joss Whedon, Zak Penn",8.0,1529810,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269342,420506,Shourya,6.000,1,2016-03-04,0,122,0,te,1.400,"Thriller, Romance",Suraksh Entertainments,India,Telugu,Dasaradh,"Dasaradh, Gopimohan, Kishore Gopu",6.3,789,"Manchu Manoj, Regina Cassandra, Prakash Raj, B..."
269353,66014,Devil's Vendetta,6.000,1,1991-04-12,0,96,0,zh,3.561,"Horror, Comedy, Fantasy",Rising Sun Films Company,Hong Kong,Cantonese,Hoi-Ching Cheung,"Man-Wah Cheng, James Fung, Chi-Leung Shum, Ant...",6.4,103,"Stanley Sui-Fan Fung, Sharla Cheung, Vivian Ch..."
269358,65944,Beyond Suspicion,10.000,1,1993-11-22,0,91,0,en,1.266,"Thriller, Drama, TV Movie","Von Zerneck Sertner Films, NBC, Patricia K. Me...",United States of America,English,William A. Graham,"Susan Crain Bakos, Karen Clark",5.2,226,"Markie Post, Corbin Bernsen, Don Swayze, Jeann..."
269372,65484,Una notte blu cobalto,10.000,1,2010-01-01,0,0,0,it,0.627,,,,,Daniele Gangemi,"Corrado Fortuna, Daniele Gangemi, Carla Marcia...",4.9,309,"Corrado Fortuna, Regina Orioli, Valentina Carn..."


- Removendo filmes menos populares que não possuem informações de revenue e budget

In [11]:
# Less popular films: below every one of the three thresholds
# (popularity < 27.5, numVotes < 435, vote_count < 55).
less_popular = (
    df_movies['popularity'].lt(27.5)
    & df_movies['numVotes'].lt(435)
    & df_movies['vote_count'].lt(55)
)

# No financial information at all: revenue and budget are both 0.
no_financials = df_movies['revenue'].eq(0) & df_movies['budget'].eq(0)

# Remove only the rows that are both less popular and without financials.
df_movies = df_movies.loc[~(less_popular & no_financials)]
df_movies

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,directors,writers,averageRating,numVotes,cast
0,27205,Inception,8.364,34495,2010-07-15,825532764,148,160000000,en,83.952,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili",Christopher Nolan,Christopher Nolan,8.8,2738571,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W..."
1,157336,Interstellar,8.417,32571,2014-11-05,701729206,169,165000000,en,140.241,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,Christopher Nolan,"Jonathan Nolan, Christopher Nolan",8.7,2416234,"Matthew McConaughey, Anne Hathaway, Michael Ca..."
2,155,The Dark Knight,8.512,30619,2008-07-16,1004558444,152,185000000,en,130.643,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...",9.1,3083369,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."
3,19995,Avatar,7.573,29815,2009-12-15,2923706026,162,237000000,en,79.932,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish",James Cameron,James Cameron,7.9,1448852,"Sam Worthington, Zoe Saldaña, Sigourney Weaver..."
4,24428,The Avengers,7.710,29166,2012-04-25,1518815515,143,220000000,en,98.082,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian",Joss Whedon,"Joss Whedon, Zak Penn",8.0,1529810,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269048,215111,Man with a Plan,10.000,1,1996-10-31,0,0,100000,en,0.600,Comedy,,United States of America,English,John O'Brien,John O'Brien,8.2,156,"Fred Tuttle, Bill Blachly"
269106,417649,Bartholomew's Song,2.000,1,2006-04-11,0,11,2500,en,0.711,"Science Fiction, Music",Flagpop,United States of America,English,"Destin Daniel Cretton, Lowell Frank","Destin Daniel Cretton, Lowell Frank",7.2,184,"Brent Simmons, Jon Arnold, Matthew Boselly, Ph..."
269137,215335,Raja Ki Ayegi Baraat,6.000,1,1997-08-18,0,158,0,hi,1.627,"Drama, Romance",,India,Hindi,Ashok Gaekwad,Santosh Saroj,3.6,644,"Rani Mukerji, Shadaab Khan, Divya Dutta, Saeed..."
269342,420506,Shourya,6.000,1,2016-03-04,0,122,0,te,1.400,"Thriller, Romance",Suraksh Entertainments,India,Telugu,Dasaradh,"Dasaradh, Gopimohan, Kishore Gopu",6.3,789,"Manchu Manoj, Regina Cassandra, Prakash Raj, B..."


In [38]:
# Second pass with different thresholds: popularity < 10, numVotes < 1000
# and vote_count < 2000 must all hold for a film to count as low-profile.
popularity_cap, num_votes_cap, vote_count_cap = 10, 1000, 2000
low_profile = (
    (df_movies['popularity'] < popularity_cap)
    & (df_movies['numVotes'] < num_votes_cap)
    & (df_movies['vote_count'] < vote_count_cap)
)

# No financial information at all: revenue and budget are both 0.
no_financials = (df_movies['revenue'] == 0) & (df_movies['budget'] == 0)

rows_to_remove = low_profile & no_financials
df_movies = df_movies.loc[~rows_to_remove]
df_movies

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,directors,writers,averageRating,numVotes,cast
0,27205,Inception,8.364,34495,2010-07-15,825532764,148,160000000,en,83.952,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili",Christopher Nolan,Christopher Nolan,8.8,2738571,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W..."
1,157336,Interstellar,8.417,32571,2014-11-05,701729206,169,165000000,en,140.241,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,Christopher Nolan,"Jonathan Nolan, Christopher Nolan",8.7,2416234,"Matthew McConaughey, Anne Hathaway, Michael Ca..."
2,155,The Dark Knight,8.512,30619,2008-07-16,1004558444,152,185000000,en,130.643,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...",9.1,3083369,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."
3,19995,Avatar,7.573,29815,2009-12-15,2923706026,162,237000000,en,79.932,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish",James Cameron,James Cameron,7.9,1448852,"Sam Worthington, Zoe Saldaña, Sigourney Weaver..."
4,24428,The Avengers,7.710,29166,2012-04-25,1518815515,143,220000000,en,98.082,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian",Joss Whedon,"Joss Whedon, Zak Penn",8.0,1529810,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268957,273922,Deadline,6.000,1,2013-05-16,0,83,500000,en,0.966,"Thriller, Mystery",Tasara Films,United States of America,English,Chris Tasara,Chris Tasara,6.0,660,"Gary Hudson, Jules Willcox, Paul Francis, Roma..."
269046,273923,Prison of the Psychotic Damned,1.000,1,2006-06-06,0,100,350000,en,0.627,Horror,,,,D.W. Kann,David R. Williams,2.3,198,"Melantha Blackthorne, Demona Bast, Susan Adrie..."
269048,215111,Man with a Plan,10.000,1,1996-10-31,0,0,100000,en,0.600,Comedy,,United States of America,English,John O'Brien,John O'Brien,8.2,156,"Fred Tuttle, Bill Blachly"
269106,417649,Bartholomew's Song,2.000,1,2006-04-11,0,11,2500,en,0.711,"Science Fiction, Music",Flagpop,United States of America,English,"Destin Daniel Cretton, Lowell Frank","Destin Daniel Cretton, Lowell Frank",7.2,184,"Brent Simmons, Jon Arnold, Matthew Boselly, Ph..."


In [None]:
# df_movies is the product of several chained `.loc` filters; take an
# explicit copy before assigning into it so the write below targets an
# independent frame.
df_movies = df_movies.copy()

# Manually fill in the release date for this one title. A real Timestamp is
# written instead of a string, so the datetime64 column does not depend on
# implicit string parsing.
is_acolyte = df_movies['title'] == 'Star Wars: The Acolyte'
df_movies.loc[is_acolyte, 'release_date'] = pd.Timestamp('2024-07-16')

# Every film still lacking a release date is removed.
df_movies = df_movies.dropna(subset=['release_date'])

# Display the patched row(s) to confirm the fix.
df_movies.loc[df_movies['title'] == 'Star Wars: The Acolyte']

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,directors,writers,averageRating,numVotes,cast
232956,1328024,Star Wars: The Acolyte,5.0,1,2024-07-16,0,0,250,pt,1.4,"Science Fiction, Drama, Action",Lucasfilm Ltd.,,,"Hanelle M. Culpepper, Alex Garcia Lopez, Lesly...","Jasmyne Flournoy, Leslye Headland, George Luca...",4.3,138605,


In [None]:
# Low-profile films: popularity < 10, numVotes < 500 and vote_count < 2000.
low_profile = (
    df_movies['popularity'].lt(10)
    & df_movies['numVotes'].lt(500)
    & df_movies['vote_count'].lt(2000)
)

# This pass uses OR: a film qualifies when EITHER revenue or budget is 0.
# NOTE(review): the earlier filters required both to be 0 - confirm that
# the stricter OR is intended here.
missing_any_financial = df_movies['revenue'].eq(0) | df_movies['budget'].eq(0)

df_movies = df_movies.loc[~(low_profile & missing_any_financial)]

# Rows without genres or without directors are discarded as well.
df_movies = df_movies.dropna(subset=['genres', 'directors'])
df_movies

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,directors,writers,averageRating,numVotes,cast
0,27205,Inception,8.364,34495,2010-07-15,825532764,148,160000000,en,83.952,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili",Christopher Nolan,Christopher Nolan,8.8,2738571,"Leonardo DiCaprio, Joseph Gordon-Levitt, Ken W..."
1,157336,Interstellar,8.417,32571,2014-11-05,701729206,169,165000000,en,140.241,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,Christopher Nolan,"Jonathan Nolan, Christopher Nolan",8.7,2416234,"Matthew McConaughey, Anne Hathaway, Michael Ca..."
2,155,The Dark Knight,8.512,30619,2008-07-16,1004558444,152,185000000,en,130.643,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin",Christopher Nolan,"Jonathan Nolan, Christopher Nolan, David S. Go...",9.1,3083369,"Christian Bale, Heath Ledger, Aaron Eckhart, M..."
3,19995,Avatar,7.573,29815,2009-12-15,2923706026,162,237000000,en,79.932,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish",James Cameron,James Cameron,7.9,1448852,"Sam Worthington, Zoe Saldaña, Sigourney Weaver..."
4,24428,The Avengers,7.710,29166,2012-04-25,1518815515,143,220000000,en,98.082,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian",Joss Whedon,"Joss Whedon, Zak Penn",8.0,1529810,"Robert Downey Jr., Chris Evans, Mark Ruffalo, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267601,331062,Oğlum Bak Git,1.000,1,2012-10-19,0,0,0,tr,1.373,Comedy,,Turkey,Turkish,Kamil Çetin,Seyda Delibasi,2.0,2063,"Yavuz Seçkin, Metin Yıldız, Orhan Aydın, Esra ..."
267656,330755,Sri,6.000,1,2005-12-03,0,135,0,te,3.086,"Action, Romance",Sree Lakshmi Prasanna Pictures,,Telugu,Dasaradh,"Dasaradh, Gopimohan, Kona Venkat, Satyanand",4.2,1155,"Manchu Manoj, Tamannaah Bhatia, Sukanya, Raghu..."
268374,290823,Speed Dragon,10.000,1,2014-05-17,500000,90,0,en,1.920,Drama,URD Pictures,United States of America,English,Dan Frank,"Daniel Celestina, Bai Ling",3.8,683,"Bai Ling, Burgandi Phoenix, Carlos Ramirez, Ma..."
268609,63651,Sendhoorapandi,5.000,1,1993-12-01,0,137,0,ta,1.740,"Family, Drama",,India,Tamil,S.A. Chandrashekhar,"Shoba Chandrasekhar, S.A. Chandrashekhar",7.7,1436,"Vijayakanth, Vijay, Yuvarani, Vijayakumar, Man..."


In [79]:
# Inspection only (df_movies is not modified): remaining films whose revenue
# AND budget are both 0, ordered from most to fewest numVotes.
no_financials = df_movies['revenue'].eq(0) & df_movies['budget'].eq(0)
df_movies[no_financials].sort_values("numVotes", ascending=False)

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,directors,writers,averageRating,numVotes,cast
29299,900783,The Kashmir Files,7.109,46,2022-03-11,0,170,0,hi,5.805,"History, Drama","Abhishek Agarwal Arts, Zee Studios",India,Hindi,Vivek Agnihotri,"Vivek Agnihotri, Saurabh M. Pandey",8.5,577486,"Mithun Chakraborty, Anupam Kher, Darshan Kumaa..."
184014,1382319,The Vampire Diaries,10.000,2,2009-09-10,0,0,0,en,0.600,"Drama, Science Fiction, Fantasy","CBS Studios, Warner Bros. Entertainment, Outer...",United States of America,English,"Chris Grismer, Joshua Butler, Michael A. Allow...","Julie Plec, Kevin Williamson, L.J. Smith, Bria...",7.7,373506,
944,559969,El Camino: A Breaking Bad Movie,6.948,4454,2019-10-11,0,123,0,en,23.311,"Crime, Drama, Thriller, Action","Sony Pictures Television Studios, Gran Via Pro...",United States of America,English,Vince Gilligan,Vince Gilligan,7.3,326419,"Aaron Paul, Jesse Plemons, Charles Baker, Matt..."
1401,1992,Planet Terror,6.653,3190,2007-04-06,0,105,0,en,27.425,"Horror, Action, Thriller",Dimension Films,United States of America,English,Robert Rodriguez,Robert Rodriguez,7.0,230576,"Rose McGowan, Freddy Rodríguez, Marley Shelton..."
17784,855400,Jai Bhim,7.451,103,2021-11-02,0,164,0,ta,8.841,"Crime, Drama, Mystery",2D Entertainment,India,Tamil,T.J. Gnanavel,"T.J. Gnanavel, Rajendra Sapre",8.6,229944,"Suriya, Lijomol Jose, Baby Joshika Maya, Rajis..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24436,835786,Giant Spider,7.200,62,2021-05-30,0,84,0,zh,10.752,Thriller,,China,Mandarin,Li Yadong,"He Xiao, Zha Xiong, Mo Yan",3.3,71,"Yu Rongguang, Zhou Hengyuan, Liu Weizhou, Su Y..."
21852,991833,Cat Pack: A PAW Patrol Exclusive Event,7.345,74,2022-06-24,0,68,0,en,28.397,"Animation, Family",Spin Master,Canada,English,Charles E. Bastien,,6.2,46,"Kyle Hodgson, Wyatt White, Tianna SwamiNathan,..."
19415,1008779,The Princess,7.406,90,2022-02-01,0,0,0,es,29.011,Action,Cine Latino,Mexico,Spanish,Alonso O. Lara,"Eden Echeverria, Erick Hernandez",8.2,43,"Fernando Ciangherotti, Palmeira Cruz, Alfredo ..."
20426,790409,The Last Heretic,5.193,83,2022-11-09,0,80,0,es,24.414,"Thriller, Horror",Furia Films,Argentina,Spanish,Daniel de la Vega,Sergio Esquenazi,5.9,32,"Germán Palacios, Victoria Almeida, Gloria Carr..."


In [77]:
# Inspection only (df_movies is not modified): films where exactly one of
# revenue / budget is 0. "Either is zero, but not both" is an exclusive OR.
zero_revenue = df_movies['revenue'].eq(0)
zero_budget = df_movies['budget'].eq(0)

df_movies[zero_revenue ^ zero_budget].sort_values("numVotes", ascending=False)

Unnamed: 0,id,title,vote_average,vote_count,release_date,revenue,runtime,budget,original_language,popularity,genres,production_companies,production_countries,spoken_languages,directors,writers,averageRating,numVotes,cast
328,791373,Zack Snyder's Justice League,8.190,9202,2021-03-18,0,242,70000000,en,130.690,"Action, Adventure, Fantasy","Warner Bros. Pictures, The Stone Quarry, Atlas...",United States of America,English,Zack Snyder,"Jerry Siegel, Joe Shuster, Zack Snyder, Chris ...",7.9,464874,"Ben Affleck, Henry Cavill, Gal Gadot, Ray Fish..."
325,405774,Bird Box,6.854,9227,2018-12-13,0,124,19800000,en,27.913,"Horror, Thriller, Drama","Bluegrass Films, Chris Morgan Productions",United States of America,English,Susanne Bier,"Eric Heisserer, Josh Malerman",6.6,412777,"Sandra Bullock, Trevante Rhodes, John Malkovic..."
258,372058,Your Name.,8.514,10303,2016-08-26,357986087,106,0,ja,68.999,"Romance, Animation, Drama","CoMix Wave Films, TOHO, KADOKAWA, East Japan M...",Japan,Japanese,Makoto Shinkai,Makoto Shinkai,8.4,372122,"Ryunosuke Kamiki, Mone Kamishiraishi, Ryo Nari..."
1426,49046,All Quiet on the Western Front,7.749,3127,2022-10-07,0,147,20000000,de,67.015,"Drama, War","RocketScience, Amusement Park Films, Sliding D...","Germany, United Kingdom, United States of America","French, English, German",Edward Berger,"Edward Berger, Lesley Paterson, Ian Stokell, E...",7.8,284231,"Felix Kammerer, Albrecht Schuch, Aaron Hilmer,..."
728,545609,Extraction,7.356,5516,2020-04-24,0,116,65000000,en,46.071,"Action, Thriller","AGBO, Thematic Entertainment, TGIM Films",United States of America,"Bengali, English, Hindi",Sam Hargrave,"Joe Russo, Ande Parks, Anthony Russo, Fernando...",6.8,283384,"Chris Hemsworth, Rudhraksh Jaiswal, Randeep Ho..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23762,965839,Lord of the Streets,5.238,65,2022-04-22,0,85,1000000,en,23.303,Action,Mutiny Films,United States of America,English,Jared Cohn,Jared Cohn,4.3,162,"Treach, Quinton 'Rampage' Jackson, Khalil Roun..."
8878,525686,Mi prima la sexóloga,6.984,306,2016-07-21,0,104,15000,es,13.924,Comedy,,Bolivia,Spanish,Miguel Chavez,Miguel Chavez,3.7,162,"Stephanie Herela, Andrés Salvatierra, Majelo Q..."
24998,928773,Whisper,6.400,60,2022-01-23,0,76,20000,en,33.816,"Horror, Thriller",Greenway Entertainment,United Kingdom,English,Christopher Jolley,"Robert Dunn, Christopher Jolley",3.7,125,"Linda Louise Duan, Penelope Read, Arron Blake,..."
36059,764517,Wheatfield,6.939,33,2022-10-25,0,116,800000,es,23.725,Drama,Home Films,Mexico,Spanish,Anabel Caso,Anabel Caso,6.3,113,"Emilia Berjón, Abril Michel, Alberto Guerra, N..."


In [73]:
# Missing values per column in the filtered frame (the leftover
# commented-out scratch queries that used to sit in this cell were removed).
df_movies.isna().sum()


id                         0
title                      0
vote_average               0
vote_count                 0
release_date               0
revenue                    0
runtime                    0
budget                     0
original_language          0
popularity                 0
genres                     0
production_companies    3950
production_countries    1487
spoken_languages         430
directors                  0
writers                 1235
averageRating              0
numVotes                   0
cast                     421
dtype: int64