In [5]:
import pandas as pd
import ast
import numpy as np

# --- 1. CARREGAR E PREPARAR O DATAFRAME ---
# 'low_memory=False' √© usado para evitar problemas de tipo de dados mistos (DtypeWarning).
print("--- Carregando o Dataset ---")
try:
    df = pd.read_csv('archive/movies_metadata.csv', low_memory=False)
except FileNotFoundError:
    print("ERRO: Arquivo 'archive/movies_metadata.csv' n√£o encontrado. Verifique o caminho.")
    exit()

# Definir todas as colunas JSON que potencialmente existem no arquivo
ALL_JSON_COLS = ['genres', 'belongs_to_collection', 'production_companies', 
                 'production_countries', 'spoken_languages', 'cast', 'crew']

# Lista final das colunas que realmente existem no seu DataFrame
JSON_COLS_TO_PROCESS = [col for col in ALL_JSON_COLS if col in df.columns]
print(f"Colunas JSON que ser√£o processadas: {JSON_COLS_TO_PROCESS}")

# --- 2. FUN√á√ÉO DE DESSERIALIZA√á√ÉO SEGURA ---
def safe_literal_eval(val):
    """Converte strings literais (JSON/Python) em objetos Python, tratando nulos e erros."""
    if pd.isna(val) or val in ['', 'None']:
        return None
    try:
        # ast.literal_eval √© mais seguro que eval() e trata strings literais de Python
        return ast.literal_eval(val)
    except:
        return None

# --- 3. APLICAR DESSERIALIZA√á√ÉO (CRIA AS COLUNAS '_obj') ---
print("--- Aplicando Desserializa√ß√£o ---")
for col in JSON_COLS_TO_PROCESS:
    # 3.1 Cria a coluna tempor√°ria de objeto (ex: df['genres_obj'])
    df[f'{col}_obj'] = df[col].apply(safe_literal_eval)
    
    # 3.2 Preenchimento de nulos para garantir que o objeto seja Lista ou Dicion√°rio
    # (Para evitar erros nos passos de concatena√ß√£o/contagem)
    if col == 'belongs_to_collection':
        # Deve ser Dicion√°rio {} para filmes sem cole√ß√£o
        df[f'{col}_obj'] = df[f'{col}_obj'].apply(lambda x: x if isinstance(x, dict) else {})
    elif isinstance(df[f'{col}_obj'].iloc[0], list) or col not in ['belongs_to_collection']: 
        # Deve ser Lista [] para colunas como genres, companies, etc.
        df[f'{col}_obj'] = df[f'{col}_obj'].apply(lambda x: x if isinstance(x, list) else [])


# --- 4. TRATAMENTO DA COLUNA 'belongs_to_collection' (Dicion√°rio √önico) ---

if 'belongs_to_collection' in JSON_COLS_TO_PROCESS:
    # Extrai o nome da cole√ß√£o e coloca em uma coluna simples
    df['collection_name'] = df['belongs_to_collection_obj'].apply(
        lambda x: x.get('name') if isinstance(x, dict) else None
    )


# --- 5. NORMALIZA√á√ÉO DE LISTAS (Contagem e Lista Simples de Nomes) ---

# Colunas que s√£o listas e que foram criadas no passo 3 (excluindo o dicion√°rio 'belongs_to_collection')
LIST_COLS_OBJ = [col for col in JSON_COLS_TO_PROCESS if col != 'belongs_to_collection']

print("--- Criando Contagens e Listas Simples ---")

for col in LIST_COLS_OBJ:
    obj_col = f'{col}_obj' 
    
    # 5.1. Extrai o n√∫mero de itens na lista (ex: genres_count)
    df[f'{col}_count'] = df[obj_col].apply(lambda x: len(x) if isinstance(x, list) else 0)
    
    # 5.2. Cria uma coluna de texto simples (ex: "Action|Comedy")
    def extract_names(lista):
        if isinstance(lista, list):
            # Extrai o nome de cada dicion√°rio na lista (com tratamento seguro)
            names = [d.get('name') for d in lista if isinstance(d, dict) and d.get('name')]
            return '|'.join(names)
        return ''

    df[f'{col}_list'] = df[obj_col].apply(extract_names)


# --- 6. REMO√á√ÉO FINAL DAS COLUNAS JSON ORIGINAIS E OBJETOS TEMPOR√ÅRIOS ---

# Colunas originais com JSON/Python object:
cols_to_drop = JSON_COLS_TO_PROCESS

# Colunas tempor√°rias (Python objects) criadas na Etapa 3:
cols_to_drop.extend([f'{col}_obj' for col in JSON_COLS_TO_PROCESS])

# Remove as colunas complexas, preservando apenas as colunas limpas (e as contagens/listas)
df = df.drop(columns=cols_to_drop, errors='ignore')


# --- 7. EXIBIR O RESULTADO FINAL ---
print("\n" + "="*50)
print("‚úÖ LIMPEZA E NORMALIZA√á√ÉO B√ÅSICA CONCLU√çDAS.")
print("="*50)

print("\nColunas Restantes no DataFrame Limpo:")
print(df.columns.tolist())

print("\nExemplo de Dados Limpos:")
# Exibe as novas colunas limpas: o nome da cole√ß√£o e as contagens/listas
print(df[['title', 'collection_name', 'genres_count', 'genres_list', 'production_companies_count']].head())

--- Carregando o Dataset ---
Colunas JSON que ser√£o processadas: ['genres', 'belongs_to_collection', 'production_companies', 'production_countries', 'spoken_languages']
--- Aplicando Desserializa√ß√£o ---
--- Criando Contagens e Listas Simples ---

‚úÖ LIMPEZA E NORMALIZA√á√ÉO B√ÅSICA CONCLU√çDAS.

Colunas Restantes no DataFrame Limpo:
['adult', 'budget', 'homepage', 'id', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'release_date', 'revenue', 'runtime', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count', 'collection_name', 'genres_count', 'genres_list', 'production_companies_count', 'production_companies_list', 'production_countries_count', 'production_countries_list', 'spoken_languages_count', 'spoken_languages_list']

Exemplo de Dados Limpos:
                         title                 collection_name  genres_count  \
0                    Toy Story            Toy Story Collection             3   
1                   

In [7]:
df_filtrado = df

In [8]:
df_filtrado.to_csv('Metadados Filme.csv')

In [9]:
df_filtrado.drop(columns=['homepage', 'imdb_id', 'budget', 'poster_path', 'collection_name', 'production_companies_count', 'production_companies_list', 'production_countries_count'], inplace=True)

In [10]:
df_filtrado.rename(columns={'adult': 'Adulto', 'original_language': 'Idioma Original', 'original_title': 'T√≠tulo Original', 'overview': "Descri√ß√£o", 'popularity': 'Popularidade', 'release_date': 'Data de Lan√ßamento', 'revenue': 'Receita', 'runtime': 'Dura√ß√£o', 'status': 'Status', 'production_countries_list': 'Pa√≠s de Origem', 'spoken_languages_count': 'Idiomas falados no filme', 'spoken_languages_list': 'Lista de Idiomas', 'title': 'T√≠tulo em ingl√™s'}, inplace=True)

In [11]:
df_brazil = df_filtrado[df_filtrado['Pa√≠s de Origem'] == 'Brazil']

In [12]:
df_brazil.sort_values('Data de Lan√ßamento',ascending=False)

Unnamed: 0,Adulto,id,Idioma Original,T√≠tulo Original,Descri√ß√£o,Popularidade,Data de Lan√ßamento,Receita,Dura√ß√£o,Status,tagline,T√≠tulo em ingl√™s,video,vote_average,vote_count,genres_count,genres_list,Pa√≠s de Origem,Idiomas falados no filme,Lista de Idiomas
44591,False,448763,pt,Amor.com,It's a love story between a fashion blogger an...,1.957136,2017-06-01,0.0,92.0,Released,,Amor.com,False,6.8,17.0,1,Romance,Brazil,1,Portugu√™s
42827,False,430128,pt,Internet - O Filme,"In a convention of youtubers, the characters e...",2.75527,2017-02-23,0.0,0.0,Released,,Internet - O Filme,False,4.2,52.0,1,Comedy,Brazil,1,Portugu√™s
42149,False,428645,pt,Eu Fico Loko,,1.948382,2017-01-12,0.0,,Released,,Eu Fico Loko,False,8.3,22.0,1,Comedy,Brazil,1,Portugu√™s
42157,False,227932,pt,Minha M√£e √© Uma Pe√ßa 2,"Dona Herm√≠nia is back, but now rich and famous...",4.117801,2016-12-22,0.0,96.0,Released,,My Mom Is a Character 2,False,7.8,100.0,1,Comedy,Brazil,1,Portugu√™s
41720,False,296288,pt,Tamo Junto,"Guy ends his relationship, re-encounter his be...",0.227697,2016-12-08,0.0,100.0,Released,,Tamo Junto,False,3.5,2.0,1,Comedy,Brazil,1,Portugu√™s
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16757,False,59990,pt,O Pagador de Promessas,Z√© is a very poor man from the Brazilian count...,0.904016,1962-04-17,0.0,98.0,Released,The story of a vow that a woman broke and a ma...,The Given Word,False,7.2,9.0,1,Drama,Brazil,1,Portugu√™s
36814,False,28525,pt,Os Cafajestes,This film captures the criminal behavior of tw...,0.107987,1962-03-24,0.0,100.0,Released,,The Unscrupulous Ones,False,7.0,3.0,1,Drama,Brazil,1,Portugu√™s
38027,False,146904,pt,"Rio, Zona Norte",Setting up the gracefully jarring dichotomies ...,0.072383,1957-08-26,0.0,90.0,Released,,"Rio, Zona Norte",False,5.0,1.0,0,,Brazil,1,Portugu√™s
38028,False,146075,pt,"Rio, 40 graus",Banned by Brazil‚Äôs Federal Department of Publi...,0.192623,1955-08-24,0.0,100.0,Released,,Rio 100 Degrees F.,False,3.0,2.0,1,Drama,Brazil,1,Portugu√™s


In [52]:
df_filtrado = df_filtrado[['id', 'T√≠tulo Original', 'T√≠tulo em ingl√™s', 'Pa√≠s de Origem', 'Idioma Original', 'Idiomas falados no filme', 'Lista de Idiomas', 'genres_list', 'vote_count', 'vote_average', 'Popularidade', 'Dura√ß√£o', 'Data de Lan√ßamento', 'Adulto', 'Status']]

In [14]:
df_analise = df_filtrado[['T√≠tulo Original', 'T√≠tulo em ingl√™s', 'Pa√≠s de Origem', 'Idioma Original']].copy() 

# 2. Cria a coluna booleana 'Analise'
# Verifica se os valores s√£o iguais e armazena o resultado (True/False)
df_analise['Analise'] = df_filtrado['T√≠tulo Original'] == df_filtrado['T√≠tulo em ingl√™s']

df_analise.loc[df_analise['Analise'] == False]

Unnamed: 0,T√≠tulo Original,T√≠tulo em ingl√™s,Pa√≠s de Origem,Idioma Original,Analise
28,La Cit√© des Enfants Perdus,The City of Lost Children,France|Germany|Spain,fr,False
29,ÊëáÂïäÊëáÔºåÊëáÂà∞Â§ñÂ©ÜÊ°•,Shanghai Triad,China|France,zh,False
32,"Guillaumet, les ailes du courage",Wings of Courage,France|United States of America,fr,False
57,Il postino,The Postman,Belgium|France|Italy,it,False
58,Le confessionnal,The Confessional,Canada,fr,False
...,...,...,...,...,...
45453,Maa,Mom,India,hi,False
45455,San Michele aveva un gallo,St. Michael Had a Rooster,,it,False
45461,ÿ±⁄Ø ÿÆŸàÿßÿ®,Subdue,Iran,fa,False
45462,Siglo ng Pagluluwal,Century of Birthing,Philippines,tl,False


In [15]:
df_analise

Unnamed: 0,T√≠tulo Original,T√≠tulo em ingl√™s,Pa√≠s de Origem,Idioma Original,Analise
0,Toy Story,Toy Story,United States of America,en,True
1,Jumanji,Jumanji,United States of America,en,True
2,Grumpier Old Men,Grumpier Old Men,United States of America,en,True
3,Waiting to Exhale,Waiting to Exhale,United States of America,en,True
4,Father of the Bride Part II,Father of the Bride Part II,United States of America,en,True
...,...,...,...,...,...
45461,ÿ±⁄Ø ÿÆŸàÿßÿ®,Subdue,Iran,fa,False
45462,Siglo ng Pagluluwal,Century of Birthing,Philippines,tl,False
45463,Betrayal,Betrayal,United States of America,en,True
45464,Satana likuyushchiy,Satan Triumphant,Russia,en,False


Analisando o DataSet Keywords

In [16]:
import pandas as pd
import ast
import numpy as np

# --- 1. CARREGAR O DATAFRAME ---
# Assumindo que o arquivo keywords.csv est√° na mesma pasta raiz
try:
    df_keywords = pd.read_csv('archive/keywords.csv')
except FileNotFoundError:
    print("ERRO: Arquivo 'keywords.csv' n√£o encontrado. Verifique o caminho.")
    exit()

# --- 2. FUN√á√ÉO DE DESSERIALIZA√á√ÉO SEGURA ---
def safe_literal_eval(val):
    """Converte strings literais (como JSON) em objetos Python, tratando nulos e erros."""
    if pd.isna(val) or val in ['', 'None', '[]']:
        return []
    try:
        return ast.literal_eval(val)
    except:
        return []

# --- 3. APLICAR DESSERIALIZA√á√ÉO ---
df_keywords['keywords_obj'] = df_keywords['keywords'].apply(safe_literal_eval)

# --- 4. NORMALIZA√á√ÉO: Achatando a Lista de Dicion√°rios ---

# Achata a coluna 'keywords_obj', criando uma linha para cada palavra-chave por filme.
keywords_normalized = pd.json_normalize(
    df_keywords.to_dict('records'),  # Converte o DF para o formato que json_normalize espera
    record_path='keywords_obj',      # Onde est√° a lista que queremos achatar
    meta=['id'],                     # Mant√©m a coluna 'id' do filme como metadado
    record_prefix='keyword_'         # Prefixo para as colunas extra√≠das (ex: keyword_id, keyword_name)
)

# --- 5. LIMPEZA FINAL ---

# Remove a coluna original e a coluna tempor√°ria de objetos
keywords_normalized = keywords_normalized.drop(columns=['keywords_obj'], errors='ignore')


# --- 6. EXIBIR O RESULTADO ---
print("\n" + "="*50)
print("‚úÖ NORMALIZA√á√ÉO DE KEYWORDS CONCLU√çDA.")
print("="*50)

print("\nExemplo de Dados Normalizados (Palavra-Chave por Linha):")
# Exibe as colunas: ID do filme, ID da keyword e Nome da keyword
print(keywords_normalized[['id', 'keyword_id', 'keyword_name']].head(10))


‚úÖ NORMALIZA√á√ÉO DE KEYWORDS CONCLU√çDA.

Exemplo de Dados Normalizados (Palavra-Chave por Linha):
     id  keyword_id       keyword_name
0   862         931           jealousy
1   862        4290                toy
2   862        5202                boy
3   862        6054         friendship
4   862        9713            friends
5   862        9823            rivalry
6   862      165503      boy next door
7   862      170722            new toy
8   862      187065  toy comes to life
9  8844       10090         board game


In [17]:
keywords_normalized

Unnamed: 0,keyword_id,keyword_name,id
0,931,jealousy,862
1,4290,toy,862
2,5202,boy,862
3,6054,friendship,862
4,9713,friends,862
...,...,...,...
158675,11800,mockumentary,289923
158676,10703,tragic love,439050
158677,2679,artist,111109
158678,14531,play,111109


In [21]:
df_filtrado.head(5)

Unnamed: 0,Adulto,id,Idioma Original,T√≠tulo Original,Descri√ß√£o,Popularidade,Data de Lan√ßamento,Receita,Dura√ß√£o,Status,tagline,T√≠tulo em ingl√™s,video,vote_average,vote_count,genres_count,genres_list,Pa√≠s de Origem,Idiomas falados no filme,Lista de Idiomas
0,False,862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,81.0,Released,,Toy Story,False,7.7,5415.0,3,Animation|Comedy|Family,United States of America,1,English
1,False,8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,104.0,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,3,Adventure|Fantasy|Family,United States of America,2,English|Fran√ßais
2,False,15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,0.0,101.0,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,2,Romance|Comedy,United States of America,1,English
3,False,31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,81452156.0,127.0,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,3,Comedy|Drama|Romance,United States of America,1,English
4,False,11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,1995-02-10,76578911.0,106.0,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1,Comedy,United States of America,1,English


In [None]:
keywords_normalized.head(5)

Unnamed: 0,keyword_id,keyword_name,id
0,931,jealousy,862
1,4290,toy,862
2,5202,boy,862
3,6054,friendship,862
4,9713,friends,862


In [None]:
keywords_normalized.groupby(by= ['id'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E9889C5970>

In [24]:
import pandas as pd

# 1. Agrupar as palavras-chave por ID de filme
# Usamos o ID do filme ('id') para agrupar, e a fun√ß√£o .agg() para juntar
# todos os nomes das palavras-chave ('keyword_name') em uma √∫nica string,
# separada por um pipe '|'.

keywords_agregadas = keywords_normalized.groupby('id')['keyword_name'].agg(lambda x: '|'.join(x)).reset_index()

# 2. Renomear a coluna agregada
# A coluna resultante da agrega√ß√£o √© nomeada 'keyword_name', renomeamos para ser descritiva
keywords_agregadas.rename(columns={'keyword_name': 'keywords_list'}, inplace=True)

# 3. Juntar (Merge) ao DataFrame principal (df_filtrado)
# O merge √© feito usando a coluna 'id', que √© comum a ambos os DataFrames.

# A coluna de ID no df_filtrado pode ter sido convertida para string ou float. 
# Para evitar problemas, garantimos que ambas as colunas 'id' sejam tratadas como n√∫meros inteiros,
# caso ainda n√£o estejam (o que √© comum neste dataset de filmes).
try:
    df_filtrado['id'] = pd.to_numeric(df_filtrado['id'], errors='coerce').astype('Int64')
    keywords_agregadas['id'] = pd.to_numeric(keywords_agregadas['id'], errors='coerce').astype('Int64')
except:
    print("Aviso: Falha na convers√£o de ID para inteiro, usando tipo existente.")


df_filtrado = pd.merge(
    df_filtrado, 
    keywords_agregadas, 
    on='id', 
    how='left' # Usamos 'left' para manter todos os filmes em df_filtrado
)

# 4. Visualizar o Resultado
print("Agrega√ß√£o de Keywords Conclu√≠da. Novas colunas:")
print(df_filtrado[['T√≠tulo Original', 'keywords_list']].head())

Agrega√ß√£o de Keywords Conclu√≠da. Novas colunas:
               T√≠tulo Original  \
0                    Toy Story   
1                      Jumanji   
2             Grumpier Old Men   
3            Waiting to Exhale   
4  Father of the Bride Part II   

                                       keywords_list  
0  jealousy|toy|boy|friendship|friends|rivalry|bo...  
1  board game|disappearance|based on children's b...  
2   fishing|best friend|duringcreditsstinger|old men  
3  based on novel|interracial relationship|single...  
4  baby|midlife crisis|confidence|aging|daughter|...  


In [30]:
df_filtrado.iloc[0]

Adulto                                                                  False
id                                                                        862
Idioma Original                                                            en
T√≠tulo Original                                                     Toy Story
Descri√ß√£o                   Led by Woody, Andy's toys live happily in his ...
Popularidade                                                        21.946943
Data de Lan√ßamento                                                 1995-10-30
Receita                                                           373554033.0
Dura√ß√£o                                                                  81.0
Status                                                               Released
tagline                                                                   NaN
T√≠tulo em ingl√™s                                                    Toy Story
video                                                   

In [35]:
df_filtrado.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Adulto                    45466 non-null  object 
 1   id                        45463 non-null  Int64  
 2   Idioma Original           45455 non-null  object 
 3   T√≠tulo Original           45466 non-null  object 
 4   Descri√ß√£o                 44512 non-null  object 
 5   Popularidade              45461 non-null  object 
 6   Data de Lan√ßamento        45379 non-null  object 
 7   Receita                   45460 non-null  float64
 8   Dura√ß√£o                   45203 non-null  float64
 9   Status                    45379 non-null  object 
 10  tagline                   20412 non-null  object 
 11  T√≠tulo em ingl√™s          45460 non-null  object 
 12  video                     45460 non-null  object 
 13  vote_average              45460 non-null  float64
 14

In [34]:
df_filtrado.sort_values('Data de Lan√ßamento')

Unnamed: 0,Adulto,id,Idioma Original,T√≠tulo Original,Descri√ß√£o,Popularidade,Data de Lan√ßamento,Receita,Dura√ß√£o,Status,...,T√≠tulo em ingl√™s,video,vote_average,vote_count,genres_count,genres_list,Pa√≠s de Origem,Idiomas falados no filme,Lista de Idiomas,keywords_list
19730,- Written by √òrn√•s,,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,1,,,,...,,,,,3,Carousel Productions|Vision View Entertainment...,,0,,
29503,Rune Balot goes to a casino connected to the ...,,68.0,"[{'iso_639_1': 'ja', 'name': 'Êó•Êú¨Ë™û'}]",Released,,12,,,,...,,,,,5,Aniplex|GoHands|BROSTA TV|Mardock Scramble Pro...,,0,,
34940,False,315946,xx,Passage de Venus,Photo sequence of the rare transit of Venus ov...,0.480371,1874-12-09,0.0,1.0,Released,...,Passage of Venus,False,6.0,19.0,1,Documentary,France,1,No Language,silent film|science|astronomy|venus the planet...
34937,False,194079,en,Sallie Gardner at a Gallop,Sallie Gardner at a Gallop was one of the earl...,0.327841,1878-06-14,0.0,1.0,Released,...,Sallie Gardner at a Gallop,False,6.2,25.0,1,Documentary,United States of America,1,No Language,horse|stop motion|animation|black and white|si...
41602,False,426903,en,Buffalo Running,Individual photographs of the running of a buf...,0.229221,1883-11-19,0.0,1.0,Released,...,Buffalo Running,False,5.4,7.0,1,Documentary,United States of America,1,No Language,running|buffalo|photography|black and white|short
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45148,False,438910,ru,Konstruktor krasnogo tsveta -1993,Engineering Red - 1993 Dir: Andrey I. Y. Petr...,0.001586,,0.0,76.0,Released,...,Engineering Red,False,6.0,2.0,0,,,0,,
45203,False,433711,en,All Superheroes Must Die 2: The Last Superhero,"In a no holds barred documentary, acclaimed jo...",0.00022,,0.0,74.0,Released,...,All Superheroes Must Die 2: The Last Superhero,False,4.0,1.0,2,Mystery|Science Fiction,,1,English,
45338,False,335251,en,The Land Where the Blues Began,An exploration of the musical and social origi...,0.0,,0.0,0.0,Released,...,The Land Where the Blues Began,False,0.0,0.0,0,,,0,,
45410,False,449131,ru,Aprel,,0.008903,,0.0,,Released,...,Aprel,False,6.0,1.0,2,Drama|Crime,Russia,0,,


In [60]:
df_filtrado.dropna(inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtrado.dropna(inplace= True)


In [70]:
df_filtrado.loc[(df_filtrado['Pa√≠s de Origem'] == "") & (df_filtrado['Idioma Original'] == "")]

Unnamed: 0,id,T√≠tulo Original,T√≠tulo em ingl√™s,Pa√≠s de Origem,Idioma Original,Idiomas falados no filme,Lista de Idiomas,genres_list,vote_count,vote_average,Popularidade,Dura√ß√£o,Data de Lan√ßamento,Adulto,Status


In [None]:
df_filtrado.loc[(df_filtrado['Pa√≠s de Origem'] == "")].groupby('Idioma Original').count()

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E955ADFDD0>

In [76]:
contagem_por_idioma = df_filtrado.loc[df_filtrado['Pa√≠s de Origem'] == ""].groupby('Idioma Original').size()

print("Contagem de Filmes (sem Pa√≠s de Origem) por Idioma:")
print(contagem_por_idioma)

Contagem de Filmes (sem Pa√≠s de Origem) por Idioma:
Idioma Original
ab       1
ar       3
bn       1
cn      10
cs       6
cy       1
da      13
de      67
el      18
en    5124
es      69
et       2
eu       1
fa       9
fi      47
fr     136
fy       1
he       4
hi      63
hu       8
is       3
it     205
ja      34
ka       1
kn       1
ko      16
lv       1
ml       1
mr       6
nb       1
nl      29
no       6
pl      15
pt      16
ru      47
sq       1
sv      33
ta       1
te      10
tr      26
uk       4
ur       1
uz       1
vi       3
xx       2
zh      16
dtype: int64


In [78]:
import pandas as pd

# üó∫Ô∏è Dicion√°rio de Mapeamento Idioma -> Pa√≠s (Simplificado para o Exemplo)
country_map = {
    'en': 'USA',         
    'it': 'Italy',      
    'fr': 'France',     
    'de': 'Germany',   
    'es': 'Spain',     
    'hi': 'India',      
    'ru': 'Russia',    
    'fi': 'Finland',   
    'tr': 'Turkey',    
    'nl': 'Netherlands',
    'ja': 'Japan',      
    'sv': 'Sweden',     
    'pt': 'Brazil',     
    'ko': 'South Korea',
    'zh': 'China',      
    'cn': 'China',      
    'el': 'Greece',     
    'pl': 'Poland',     
    'da': 'Denmark',    
    'ar': 'Egypt',      
    'fa': 'Iran',       
    'hu': 'Hungary',    
    'no': 'Norway',     
    'cs': 'Czechia',    
    'te': 'India',      
    'uk': 'Ukraine',    
    'he': 'Israel',     
    'vi': 'Vietnam',    
    'is': 'Iceland',    
    'et': 'Estonia',    
    'cy': 'UK',         
    'sq': 'Albania',    
    'ml': 'India',      
    'mr': 'India',      
    'bn': 'Bangladesh', 
    'ur': 'Pakistan',   
    'uz': 'Uzbekistan', 
    'ab': 'Georgia',    
    'ka': 'Georgia',    
    'eu': 'Spain',      
    'kn': 'India',      
    'ta': 'India',      
    'fy': 'Netherlands',
    'nb': 'Norway',     
    'xx': 'N/A'         
}

# --- 1. Criar o filtro booleano para as linhas sem Pa√≠s de Origem ---
filtro_sem_pais = df_filtrado['Pa√≠s de Origem'] == ""

# --- 2. Aplicar o mapeamento (map) somente √†s linhas filtradas ---

# 2.1. Seleciona a coluna 'Idioma Original' SOMENTE para as linhas filtradas.
# 2.2. Aplica o dicion√°rio country_map a esses valores de idioma.
# 2.3. O resultado √© a nova S√©rie de pa√≠ses (ex: 'USA', 'France', 'Brazil').
novos_paises = df_filtrado.loc[filtro_sem_pais, 'Idioma Original'].map(country_map)

# --- 3. Atribuir os novos valores de volta √† coluna 'Pa√≠s de Origem' ---

# Usamos .loc novamente para ATRIBUIR os novos valores APENAS √†s linhas filtradas
df_filtrado.loc[filtro_sem_pais, 'Pa√≠s de Origem'] = novos_paises

# 4. Visualiza√ß√£o de uma amostra para confirmar a imputa√ß√£o
print("‚úÖ Imputa√ß√£o de Pa√≠s de Origem com base no Idioma conclu√≠da.")
print("\nExemplo de Filmes Onde o Pa√≠s foi Preenchido:")
# Filtra novamente as linhas que estavam vazias e que agora foram preenchidas
# (Note que algumas podem continuar vazias se o idioma original n√£o estava no dicion√°rio)
print(df_filtrado.loc[df_filtrado['Pa√≠s de Origem'] != ""].head(10))

‚úÖ Imputa√ß√£o de Pa√≠s de Origem com base no Idioma conclu√≠da.

Exemplo de Filmes Onde o Pa√≠s foi Preenchido:
      id              T√≠tulo Original             T√≠tulo em ingl√™s  \
0    862                    Toy Story                    Toy Story   
1   8844                      Jumanji                      Jumanji   
2  15602             Grumpier Old Men             Grumpier Old Men   
3  31357            Waiting to Exhale            Waiting to Exhale   
4  11862  Father of the Bride Part II  Father of the Bride Part II   
5    949                         Heat                         Heat   
6  11860                      Sabrina                      Sabrina   
7  45325                 Tom and Huck                 Tom and Huck   
8   9091                 Sudden Death                 Sudden Death   
9    710                    GoldenEye                    GoldenEye   

                            Pa√≠s de Origem Idioma Original  \
0                 United States of America       

In [79]:
df_filtrado.sort_values('Dura√ß√£o')

Unnamed: 0,id,T√≠tulo Original,T√≠tulo em ingl√™s,Pa√≠s de Origem,Idioma Original,Idiomas falados no filme,Lista de Idiomas,genres_list,vote_count,vote_average,Popularidade,Dura√ß√£o,Data de Lan√ßamento,Adulto,Status
36620,49496,Aashiq Banaya Aapne,Aashiq Banaya Aapne,USA,en,0,,Drama|Horror|Thriller|Science Fiction|Romance|...,6.0,3.3,0.802008,0.0,2005-09-02,False,Released
14402,106537,Michael Jackson: Life of a Superstar,Michael Jackson: Life of a Superstar,USA,en,0,,Documentary|TV Movie,0.0,0.0,0.004706,0.0,2008-12-31,False,Released
45306,317736,Pani z przedszkola,Pani z przedszkola,Poland,pl,1,Polski,Comedy,0.0,0.0,0.171539,0.0,2014-12-25,False,Released
45308,108331,Las buenas hierbas,The Good Herbs,Spain,es,1,Espa√±ol,Drama|Foreign,0.0,0.0,0.567931,0.0,2010-08-20,False,Released
45312,38876,"W≈Çatcy m√≥ch. ƒÜmoki, Czopki i Mondzio≈Çy","W≈Çatcy m√≥ch. ƒÜmoki, Czopki i Mondzio≈Çy",Poland,en,1,Polski,Comedy|Animation,0.0,0.0,0.008064,0.0,2009-02-13,False,Released
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13953,67463,Heimat: Eine deutsche Chronik,Heimat: A Chronicle of Germany,Germany,de,0,,Drama|History,1.0,9.0,0.016204,925.0,1984-09-16,False,Released
13767,45560,Berlin Alexanderplatz,Berlin Alexanderplatz,Germany|Italy,de,1,Deutsch,Drama,5.0,8.4,2.255785,931.0,1980-08-28,False,Released
19965,142051,Jazz,Jazz,USA,en,0,,Documentary,3.0,5.7,1.507756,1140.0,2001-01-09,False,Released
40938,126820,Baseball,Baseball,USA,en,1,English,Documentary,4.0,8.5,0.145073,1140.0,1994-09-18,False,Released


In [81]:
df_filtrado

Unnamed: 0,id,T√≠tulo Original,T√≠tulo em ingl√™s,Pa√≠s de Origem,Idioma Original,Idiomas falados no filme,Lista de Idiomas,genres_list,vote_count,vote_average,Popularidade,Dura√ß√£o,Data de Lan√ßamento,Adulto,Status
0,862,Toy Story,Toy Story,United States of America,en,1,English,Animation|Comedy|Family,5415.0,7.7,21.946943,81.0,1995-10-30,False,Released
1,8844,Jumanji,Jumanji,United States of America,en,2,English|Fran√ßais,Adventure|Fantasy|Family,2413.0,6.9,17.015539,104.0,1995-12-15,False,Released
2,15602,Grumpier Old Men,Grumpier Old Men,United States of America,en,1,English,Romance|Comedy,92.0,6.5,11.7129,101.0,1995-12-22,False,Released
3,31357,Waiting to Exhale,Waiting to Exhale,United States of America,en,1,English,Comedy|Drama|Romance,34.0,6.1,3.859495,127.0,1995-12-22,False,Released
4,11862,Father of the Bride Part II,Father of the Bride Part II,United States of America,en,1,English,Comedy,173.0,5.7,8.387519,106.0,1995-02-10,False,Released
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45460,30840,Robin Hood,Robin Hood,Canada|Germany|United Kingdom|United States of...,en,1,English,Drama|Action|Romance,26.0,5.7,5.683753,104.0,1991-05-13,False,Released
45462,111109,Siglo ng Pagluluwal,Century of Birthing,Philippines,tl,1,,Drama,3.0,9.0,0.178241,360.0,2011-11-17,False,Released
45463,67758,Betrayal,Betrayal,United States of America,en,1,English,Action|Drama|Thriller,6.0,3.8,0.903007,90.0,2003-08-01,False,Released
45464,227506,Satana likuyushchiy,Satan Triumphant,Russia,en,0,,,0.0,0.0,0.003503,87.0,1917-10-21,False,Released


In [80]:
df_filtrado.to_csv('df_consolidado.csv')

In [None]:
def avaliar_duracao(duracao):
    if duracao > 180:
        conclusao = 'Very Long'
    elif 120 <= duracao <= 180:
         conclusao = 'Long'
    elif 60 <= duracao < 120:
         conclusao = 'Medium'
    elif 30 <= duracao < 60:
         conclusao = 'Short'
    elif duracao < 30:
         conclusao = 'Very Short'

    return conclusao

In [84]:
df_filtrado['Categoria de Tempo'] = df_filtrado['Dura√ß√£o'].apply(avaliar_duracao)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtrado['Categoria de Tempo'] = df_filtrado['Dura√ß√£o'].apply(avaliar_duracao)


In [86]:
df_filtrado.to_csv('df_consolidado.csv')