In [1]:
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import categorical_embedder as ce

In [2]:
###############################################################################
##                          Data Preprocessing                               ##
###############################################################################

# Load the raw movie dataset
filmes = pd.read_csv("imdb_movies.csv")

# Discard the first column by position (the original note calls it an
# enumeration column)
filmes = filmes.drop(columns=filmes.columns[0])

# Discard the title and overview columns when they are present
filmes = filmes.drop(columns=["Series_Title", "Overview"], errors='ignore')

# One entry holds the string 'PG' instead of a year (the original note
# attributes it to Apollo 13); substitute 1995 so the column can become int32
filmes['Released_Year'] = filmes['Released_Year'].replace('PG', 1995).astype('int32')

# Strip the " min" suffix from Runtime and store the values as int32
filmes['Runtime'] = (
    filmes['Runtime']
    .str.replace(' min', '', regex=False)
    .astype('int32')
)

# Keep only rows without any missing value
filmes = filmes.dropna()

# Gross is text with comma separators: remove the commas, parse as float,
# then narrow to int32
filmes['Gross'] = (
    filmes['Gross']
    .str.replace(',', '', regex=False)
    .astype(float)
    .astype('int32')
)

In [3]:
# Sanity check: (rows, columns) of the cleaned frame
print(filmes.shape)

(713, 13)


In [4]:
# Separate the target (IMDB_Rating) from the remaining feature columns
y = filmes['IMDB_Rating']
X = filmes.drop(columns=['IMDB_Rating'])

In [5]:
# ce.get_embedding_info identifies the categorical variables in X.
# It returns a dictionary that maps each categorical column to a tuple of
# (number of categories, embedding size).
# Note: by default the embedding size is half the number of categories.
# max_dim=2 is passed here (presumably an upper bound on that size; the
# displayed result uses 2 for every column).
# The default can also be overridden by handcrafting the dictionary.
embedding_info = ce.get_embedding_info(X, max_dim=2)
embedding_info

{'Certificate': (12, 2),
 'Genre': (172, 2),
 'Director': (402, 2),
 'Star1': (471, 2),
 'Star2': (598, 2),
 'Star3': (625, 2),
 'Star4': (670, 2)}

In [6]:
# ce.get_label_encoded_data integer-encodes the categorical variables
# so the data can be fed to a neural network. It returns the encoded
# frame together with a dictionary of encoders, one per categorical column.
X_encoded, encoders = ce.get_label_encoded_data(X)
X_encoded.head()

Unnamed: 0,Released_Year,Certificate,Runtime,Genre,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,1972,0,175,105,100.0,100,305,4,237,147,1620367,134966411
1,2008,11,152,17,84.0,59,89,195,0,451,2303232,534858444
2,1974,0,202,105,90.0,100,5,461,496,147,1129952,57300000
3,1957,9,96,105,96.0,338,184,327,382,297,689845,4360000
4,2003,9,201,2,94.0,286,132,566,227,493,1642758,377845905


In [7]:
# Inspect the dtype of every column of the cleaned frame.
# NOTE(review): this looks at `filmes`, not `X_encoded` — confirm which was intended.
print(filmes.dtypes)

Released_Year      int32
Certificate       object
Runtime            int32
Genre             object
IMDB_Rating      float64
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross              int32
dtype: object


In [8]:
# Cast every label-encoded categorical column to int32
X_encoded = X_encoded.astype({
    column: 'int32'
    for column in ('Certificate', 'Genre', 'Director', 'Star1', 'Star2', 'Star3', 'Star4')
})

In [9]:
# Display the dictionary of per-column encoders returned by ce.get_label_encoded_data
encoders

{'Certificate': __LabelEncoder__(),
 'Genre': __LabelEncoder__(),
 'Director': __LabelEncoder__(),
 'Star1': __LabelEncoder__(),
 'Star2': __LabelEncoder__(),
 'Star3': __LabelEncoder__(),
 'Star4': __LabelEncoder__()}

In [10]:
# Split the encoded features and the target into train and test subsets.
# A fixed random_state makes the shuffle repeatable, so the rows used to
# train the embeddings are the same after every kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=42)

In [11]:
# ce.get_embeddings trains a neural network model on the training split,
# extracts the embeddings and returns them as a dictionary.
embeddings = ce.get_embeddings(
    X_train,
    y_train,
    # (number of categories, embedding size) for each categorical column
    categorical_embedding_info=embedding_info,
    # The target (IMDB_Rating) is continuous, so this is not classification
    is_classification=False,
    # Training length and batch size
    epochs=100,
    batch_size=64,
)

In [12]:
# Print the shape of each embedding matrix, in column order
for column in ('Certificate', 'Genre', 'Director', 'Star1', 'Star2', 'Star3', 'Star4'):
    print(embeddings[column].shape)

(12, 2)
(172, 2)
(402, 2)
(471, 2)
(598, 2)
(625, 2)
(670, 2)


In [13]:
# The dictionary format is hard to read; convert the embeddings to
# DataFrames (looked up by column name) for easier readability
dfs = ce.get_embeddings_in_dataframe(embeddings=embeddings, encoders=encoders)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




In [14]:
# Embedding table for Certificate: one row per category, one column per embedding dimension
dfs['Certificate']

Unnamed: 0,Certificate_embedding_0,Certificate_embedding_1
A,-0.005763,-0.014747
Approved,0.011607,0.026137
G,0.026348,-0.036225
GP,-0.041526,0.003082
PG,-0.007111,0.005485
PG-13,0.002239,0.018442
Passed,-0.036147,-0.029816
R,0.020216,-0.034564
TV-PG,0.028098,-0.041443
U,0.051538,0.0274


In [15]:
# Embedding table for Genre: one row per genre combination, one column per embedding dimension
dfs['Genre']

Unnamed: 0,Genre_embedding_0,Genre_embedding_1
"Action, Adventure",0.044860,0.015232
"Action, Adventure, Comedy",-0.052118,-0.025447
"Action, Adventure, Drama",-0.025979,0.019198
"Action, Adventure, Family",0.029135,0.001530
"Action, Adventure, Fantasy",-0.021565,0.046754
...,...,...
"Horror, Thriller",0.039764,0.031390
"Mystery, Romance, Thriller",0.009302,0.046010
"Mystery, Sci-Fi, Thriller",-0.009497,-0.000561
"Mystery, Thriller",0.010581,-0.023662


In [16]:
# Embedding table for Director: one row per director, one column per embedding dimension
dfs['Director']

Unnamed: 0,Director_embedding_0,Director_embedding_1
Abdellatif Kechiche,-0.030311,-0.018196
Abhishek Kapoor,0.012521,-0.012443
Adam McKay,0.017111,-0.024681
Akira Kurosawa,0.023039,-0.026744
Alan J. Pakula,0.015665,-0.033776
...,...,...
Yimou Zhang,0.026712,-0.003596
Yoshiaki Kawajiri,-0.021936,-0.040016
Yôjirô Takita,-0.000049,0.028062
Zack Snyder,0.008077,-0.044882


In [17]:
# Embedding table for Star1: one row per name, one column per embedding dimension
dfs['Star1']

Unnamed: 0,Star1_embedding_0,Star1_embedding_1
Aamir Khan,0.031553,0.019293
Aaron Taylor-Johnson,0.012296,-0.015192
Adam Driver,-0.011836,-0.047283
Adrian Molina,-0.020937,0.010543
Adrien Brody,0.045698,-0.029806
...,...,...
Zach Galifianakis,0.012850,-0.020064
Zain Al Rafeea,-0.036875,-0.048977
Zbigniew Zamachowski,0.014045,0.016784
Zooey Deschanel,0.040601,-0.013200


In [None]:
# Show the embedding table for Star2
dfs['Star2']

In [None]:
# Show the embedding table for Star3
dfs['Star3']

In [None]:
# Show the embedding table for Star4
dfs['Star4']

In [None]:
# Add the learned embeddings to the feature frame; drop_categorical_vars=True
# removes the original categorical variables from the result
movies_embed = ce.fit_transform(
    X, embeddings=embeddings, encoders=encoders, drop_categorical_vars=True
)
movies_embed.head()