# Importações 

In [101]:
#pip install pandas matplotlib scikit-learn

In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer

In [103]:
df =  pd.read_csv("movies.csv")

# Filtros

## Filmes ja lançados (status = Released)
Filmes que ainda não foram lançados não possuem dados importantes para a análise

In [104]:
registros_antes = len(df)
df = df[df['status'] == 'Released']
registros_depois = len(df)
print(f"Registros perdidos com o filtro (status): {registros_antes - registros_depois}")

Registros perdidos com o filtro (status): 5223


## Registros duplicados
Não faz sentido ter o mesmo filme mais de uma vez

In [105]:
registros_antes = len(df)
df = df.drop_duplicates()
registros_depois = len(df)
print(f"Registros perdidos com o filtro (duplicados gerais): {registros_antes - registros_depois}")

Registros perdidos com o filtro (duplicados gerais): 0


### Duplicados por titulo e data de lançamento

In [106]:
registros_antes = len(df)
df = df.drop_duplicates(subset=['title', 'release_date'])
registros_depois = len(df)
print(f"Registros perdidos com o filtro (titulo + data): {registros_antes - registros_depois}")

Registros perdidos com o filtro (titulo + data): 61422


In [107]:
registros_antes = len(df)
df = df.drop_duplicates(subset=['title', 'release_date'])
registros_depois = len(df)
print(f"Registros perdidos com o filtro: {registros_antes - registros_depois}")

Registros perdidos com o filtro: 0


In [108]:
exemplo = df[df['title'].isin(['The Eighth Clause', 'Battle: Freestyle'])].sort_values(by='title')
exemplo.head()

Unnamed: 0,id,title,genres,original_language,overview,popularity,production_companies,release_date,budget,revenue,runtime,status,tagline,vote_average,vote_count,credits,keywords,poster_path,backdrop_path,recommendations
126,785521,Battle: Freestyle,Romance-Drama,no,Amalie and Mikael lead their street dance team...,333.723,Friland Produksjon AS,2022-04-01,0.0,0.0,88.0,Released,,5.1,28.0,Lisa Teige-Fabian Svegaard Tapia-Ellen Dorrit ...,paris france-based on novel or book-norway-dan...,/6D6QumiHEhnpZG12Ibjy2BxA6n4.jpg,/8WpRRiz3qQqzBiifCTm1dW0b6bs.jpg,818750-790525-800407-739993-946726-682344-9532...
13,956101,The Eighth Clause,Thriller,la,Kat and Borja appear to be a perfect couple bu...,2259.303,SDB Films-El Hombre Orquesta,2022-04-29,0.0,0.0,0.0,Released,,4.6,10.0,Maite Perroni-Manuel Vega-Óscar Jaenada-Jessic...,,/8tc8eMFAX2SDC1TRu987qFQy8Cl.jpg,/kLnqNE9Af5QHyvUxw8cDGhF1ilv.jpg,


## Budget > 0
Filmes com budget igual a 0 ou negativo, representam inconssitência com as informações, trazendo erros para a analise

In [109]:
registros_antes = len(df)
df = df[df['budget'] > 0]
registros_depois = len(df)
print(f"Registros perdidos com o filtro (budget > 0): {registros_antes - registros_depois}")

Registros perdidos com o filtro (budget > 0): 622512


## Runtime > 70
Filmes com runtime menores que 70 não são considerados longa-metragem, que é nosso foco

In [110]:
registros_antes = len(df)
df = df[df['runtime'] > 70]
registros_depois = len(df)
print(f"Registros perdidos com o filtro (runtime > 70): {registros_antes - registros_depois}")

Registros perdidos com o filtro (runtime > 70): 12383


## Removendo colunas que não serão utilizadas na análise
- Title -> não usaremos processamento textual
- Overview -> não usaremos processamento textual
- Production Companies -> Vale com o One Hot Encoding? 
- Revenue -> Vazamento de dados.
- Status -> registro unico agr, ent inutil
- Tagline -> não usaremos processamento textual e 85% dos registros são NaN
- vote_avarege -> TARGET 
- vote_count -> vazamento de dados.
- Credits -> Vale com o One Hot Encoding?
- Keywords -> não usaremos processamento textual e 71% dos registros são NaN
- Poster_path e Backdrop_path -> não faz sentido para a análise
- Recommendations -> não faz sentido para a análise, 95% dos valores são NaN, vazamento de dados

In [111]:
df = df.drop(columns=[
    'id','title', 'overview', 'revenue', 'status', 'tagline', 
    'vote_count', 'poster_path', 'backdrop_path', 'recommendations', 'keywords',
    'credits'
])

# Colunas que serão utilizadas na análise
print("Colunas que serão utilizadas na análise:")
print(df.columns)

Colunas que serão utilizadas na análise:
Index(['genres', 'original_language', 'popularity', 'production_companies',
       'release_date', 'budget', 'runtime', 'vote_average'],
      dtype='object')


## Limpando todos os valores nulos

In [112]:
registros_antes = len(df)
df = df.dropna()
registros_depois = len(df)
print(f"Registros perdidos com o filtro: {registros_antes - registros_depois}")

Registros perdidos com o filtro: 3447


# Multi-hot encoding para colunas multivaloradas

In [113]:
def multi_hot_top_n(df, col, sep='-', top_n=20):
    exploded = df[col].str.split(sep).explode().str.strip()
    top = exploded.value_counts().nlargest(top_n).index

    def filter_top(vals):
        return [v.strip() if v.strip() in top else 'Outros' for v in vals.split(sep)]
    
    filtered = df[col].apply(filter_top)
    
    mlb = MultiLabelBinarizer()
    result = pd.DataFrame(mlb.fit_transform(filtered), columns=mlb.classes_, index=df.index)
    return result

## Data de lançamento
- Transformando em duas colunas, uma responsável pelo ano de lançamento e outra pelo mês de lançamento

In [114]:
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
df['release_year'] = df['release_date'].dt.year
df['release_month'] = df['release_date'].dt.month
df['release_decade'] = (df['release_year'] // 10) * 10
df['is_holiday_release'] = df['release_month'].isin([6, 11, 12]).astype(int)
df.drop(columns=['release_date'], inplace=True)

## Ajustando coluna da production_companies

In [115]:
teste = df['production_companies'].apply(lambda x: x.split('-')).copy()
teste = teste.explode().reset_index(drop=True)
production_companies_counts = teste.value_counts() # Apply value_counts directly to the Series
print(production_companies_counts.head(20))
print(len(production_companies_counts))

production_companies
Warner Bros. Pictures        686
Universal Pictures           646
20th Century Fox             504
Columbia Pictures            482
Paramount                    477
Goldwyn                      412
Metro                        410
Mayer                        397
New Line Cinema              253
Canal+                       250
Walt Disney Pictures         205
United Artists               188
Lionsgate                    173
Miramax                      170
Touchstone Pictures          164
StudioCanal                  148
TriStar Pictures             129
Relativity Media             126
DreamWorks Pictures          120
Village Roadshow Pictures    105
Name: count, dtype: int64
18559


In [116]:
companies_encoded = multi_hot_top_n(df, 'production_companies', top_n=10)

In [117]:
df= pd.concat([df.drop(['production_companies'], axis=1),
                companies_encoded], axis=1)

# Tratando genres

In [118]:
genres_encoded = multi_hot_top_n(df, 'genres', top_n=10)
df = pd.concat([df.drop(['genres'], axis=1), genres_encoded], axis=1)

# Log-transformação para colunas contínuas

In [119]:
df['budget_log'] = np.log1p(df['budget'])
df['runtime_log'] = np.log1p(df['runtime'])

df.drop(['budget', 'runtime'], axis=1, inplace=True)

# Normalização para modelos como SVM

In [120]:
from sklearn.preprocessing import StandardScaler

features_num = df.select_dtypes(include='number').drop(columns=['vote_average'])

scaler = StandardScaler()
scaled_array = scaler.fit_transform(features_num)

for i, col in enumerate(features_num.columns):
    df[col] = scaled_array[:, i]

# To CSV
informações do dataset filtrado

In [121]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17330 entries, 0 to 722237
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   original_language      17330 non-null  object 
 1   popularity             17330 non-null  float64
 2   vote_average           17330 non-null  float64
 3   release_year           17330 non-null  float64
 4   release_month          17330 non-null  float64
 5   release_decade         17330 non-null  float64
 6   is_holiday_release     17330 non-null  float64
 7   20th Century Fox       17330 non-null  float64
 8   Canal+                 17330 non-null  float64
 9   Columbia Pictures      17330 non-null  float64
 10  Goldwyn                17330 non-null  float64
 11  Mayer                  17330 non-null  float64
 12  Metro                  17330 non-null  float64
 13  New Line Cinema        17330 non-null  float64
 14  Outros                 17330 non-null  float64
 15  Paramo

In [122]:
df.describe()

Unnamed: 0,popularity,vote_average,release_year,release_month,release_decade,is_holiday_release,20th Century Fox,Canal+,Columbia Pictures,Goldwyn,...,Crime,Drama,Family,Horror,Outros,Romance,Science Fiction,Thriller,budget_log,runtime_log
count,17330.0,17330.0,17330.0,17330.0,17330.0,17330.0,17330.0,17330.0,17330.0,17330.0,...,17330.0,17330.0,17330.0,17330.0,17330.0,17330.0,17330.0,17330.0,17330.0,17330.0
mean,-1.9680350000000002e-17,5.595537,-3.604785e-15,1.11522e-16,5.720423e-15,-5.781104000000001e-17,-3.936071e-17,2.624047e-17,-7.872141e-17,-4.920088e-18,...,-1.476026e-17,2.7060490000000004e-17,2.952053e-17,-8.36415e-17,-8.077145e-17,8.036144000000001e-17,3.936071e-17,4.9200880000000005e-17,-2.361642e-16,-2.820851e-16
std,1.000029,2.148131,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,...,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029
min,-0.1401021,0.0,-4.968881,-1.660868,-4.86866,-0.580593,-0.1730712,-0.1209835,-0.1691412,-0.1554707,...,-0.381322,-0.9068034,-0.2885488,-0.4437408,-0.6873045,-0.4202266,-0.3263431,-0.5479691,-4.38408,-1.885347
25%,-0.1246123,5.1,-0.3168615,-0.7999223,-0.4997295,-0.580593,-0.1730712,-0.1209835,-0.1691412,-0.1554707,...,-0.381322,-0.9068034,-0.2885488,-0.4437408,-0.6873045,-0.4202266,-0.3263431,-0.5479691,-0.4072321,-0.6810562
50%,-0.08766287,6.1,0.2923315,0.06102295,0.0463868,-0.580593,-0.1730712,-0.1209835,-0.1691412,-0.1554707,...,-0.381322,-0.9068034,-0.2885488,-0.4437408,-0.6873045,-0.4202266,-0.3263431,-0.5479691,0.21538,-0.1449157
75%,-0.03143198,6.9,0.6799997,0.9219682,0.5925031,1.722377,-0.1730712,-0.1209835,-0.1691412,-0.1554707,...,-0.381322,1.102775,-0.2885488,-0.4437408,1.454959,-0.4202266,-0.3263431,-0.5479691,0.681013,0.5671356
max,61.14074,10.0,1.123049,1.495932,1.138619,1.722377,5.777969,8.265591,5.912221,6.432079,...,2.622455,1.102775,3.465618,2.253568,1.454959,2.379668,3.06426,1.824921,1.80297,12.85735


In [None]:
#df.to_csv('filmes_luan.csv', index=False)