In [1]:
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import categorical_embedder as ce

In [2]:
###############################################################################
##                          Data Preprocessing                               ##
###############################################################################

# Build the cleaned movie table in a single pipeline, starting from the raw CSV.
filmes = (
    pd.read_csv("imdb_movies.csv")
    # Drop the first column (the row-enumeration column), selected by position.
    .pipe(lambda df: df.drop(columns=df.columns[0]))
    # Drop the title and overview columns; errors="ignore" tolerates their absence.
    .drop(columns=["Series_Title", "Overview"], errors="ignore")
    .assign(
        # The movie Apollo 13 carries "PG" in Released_Year instead of a year;
        # substitute 1995 so the column can be converted from object to int32.
        Released_Year=lambda df: df["Released_Year"].replace("PG", 1995).astype("int32"),
        # Strip the " min" suffix from Runtime and convert from object to int32.
        Runtime=lambda df: df["Runtime"].str.replace(" min", "", regex=False).astype("int32"),
    )
    # Discard every observation that has a missing value in any column.
    .dropna()
    .assign(
        # Remove the comma separators from Gross, then convert object -> float -> int32.
        Gross=lambda df: (
            df["Gross"].str.replace(",", "", regex=False).astype(float).astype("int32")
        ),
    )
)

In [3]:
# Report the (rows, columns) shape of the frame produced by the preprocessing cell.
print(filmes.shape)

(713, 13)


In [4]:
# Separate the target column (IMDB_Rating) from the remaining feature columns.
y = filmes["IMDB_Rating"]
X = filmes.drop(columns=["IMDB_Rating"])

In [5]:
# ce.get_embedding_info identifies the categorical variables in X.
# It returns a dictionary mapping each categorical column to a tuple of
# (number of categories, embedding size).
# Note: by default the embedding size is half the number of categories;
# max_dim=2 is passed here, and every embedding size in the result is 2.
# The default can also be overridden by handcrafting the dictionary.
embedding_info = ce.get_embedding_info(X, max_dim=2)
embedding_info

{'Certificate': (12, 2),
 'Genre': (172, 2),
 'Director': (402, 2),
 'Star1': (471, 2),
 'Star2': (598, 2),
 'Star3': (625, 2),
 'Star4': (670, 2)}

In [6]:
# ce.get_label_encoded_data integer-encodes the categorical variables
# to prepare the data for the neural network. It returns the encoded
# frame together with a dictionary of encoders (one per categorical column).
X_encoded, encoders = ce.get_label_encoded_data(X)
X_encoded.head()

Unnamed: 0,Released_Year,Certificate,Runtime,Genre,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,1972,0,175,105,100.0,100,305,4,237,147,1620367,134966411
1,2008,11,152,17,84.0,59,89,195,0,451,2303232,534858444
2,1974,0,202,105,90.0,100,5,461,496,147,1129952,57300000
3,1957,9,96,105,96.0,338,184,327,382,297,689845,4360000
4,2003,9,201,2,94.0,286,132,566,227,493,1642758,377845905


In [7]:
# Inspect the column dtypes of the cleaned (not label-encoded) frame `filmes`;
# the categorical columns are object-typed there.
print(filmes.dtypes)

Released_Year      int32
Certificate       object
Runtime            int32
Genre             object
IMDB_Rating      float64
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross              int32
dtype: object


In [8]:
# Cast the label-encoded categorical columns to int32, leaving the
# numerical columns untouched.
X_encoded = X_encoded.astype(
    {
        column: "int32"
        for column in ("Certificate", "Genre", "Director", "Star1", "Star2", "Star3", "Star4")
    }
)

In [9]:
# Display the encoders dictionary: one label encoder per categorical column.
encoders

{'Certificate': __LabelEncoder__(),
 'Genre': __LabelEncoder__(),
 'Director': __LabelEncoder__(),
 'Star1': __LabelEncoder__(),
 'Star2': __LabelEncoder__(),
 'Star3': __LabelEncoder__(),
 'Star4': __LabelEncoder__()}

In [10]:
# Split the encoded features and the target into train and test sets.
# random_state pins the shuffle so the same rows land in train/test on
# every Restart & Run All; without it the split changes on each run.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=42)

In [11]:
# ce.get_embeddings trains a neural network model on the training split,
# extracts the learned embeddings and returns them in a dictionary keyed
# by categorical column name.
embeddings = ce.get_embeddings(
    # Training features and target
    X_train,
    y_train,
    # (number of categories, embedding size) for each categorical column
    categorical_embedding_info=embedding_info,
    # The target is the continuous IMDB_Rating, so this is not a classification task
    is_classification=False,
    # Training epochs and batch size
    epochs=100,
    batch_size=64,
)

In [12]:
# Print the shape of each learned embedding matrix, one per line,
# in the same column order as embedding_info.
print(
    *(
        embeddings[column].shape
        for column in ("Certificate", "Genre", "Director", "Star1", "Star2", "Star3", "Star4")
    ),
    sep="\n",
)

(12, 2)
(172, 2)
(402, 2)
(471, 2)
(598, 2)
(625, 2)
(670, 2)


In [13]:
# The embeddings are returned as a dictionary; for easier readability,
# convert them into DataFrames (one per categorical column, stored in `dfs`).
dfs = ce.get_embeddings_in_dataframe(
  embeddings = embeddings, 
  encoders = encoders)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




In [14]:
# Learned embedding coordinates for each Certificate category (rows are category labels).
dfs['Certificate']

Unnamed: 0,Certificate_embedding_0,Certificate_embedding_1
A,-0.066196,-0.006344
Approved,0.007004,-0.010761
G,-0.011463,0.026718
GP,0.030247,0.048802
PG,-0.06289,-0.004037
PG-13,0.037824,-0.035583
Passed,-0.057594,0.050534
R,-0.054851,-0.013522
TV-PG,0.010718,0.028047
U,0.003899,0.024966


In [15]:
# Learned embedding coordinates for each Genre combination (rows are genre strings).
dfs['Genre']

Unnamed: 0,Genre_embedding_0,Genre_embedding_1
"Action, Adventure",0.019626,0.040153
"Action, Adventure, Comedy",-0.004113,0.031713
"Action, Adventure, Drama",0.018900,0.014724
"Action, Adventure, Family",-0.007485,0.018435
"Action, Adventure, Fantasy",-0.027277,-0.016341
...,...,...
"Horror, Thriller",0.014564,-0.004238
"Mystery, Romance, Thriller",0.010984,-0.029266
"Mystery, Sci-Fi, Thriller",0.022988,0.001832
"Mystery, Thriller",0.003673,0.004694


In [16]:
# Learned embedding coordinates for each Director (rows are director names).
dfs['Director']

Unnamed: 0,Director_embedding_0,Director_embedding_1
Abdellatif Kechiche,-0.103982,0.018897
Abhishek Kapoor,0.013386,-0.029665
Adam McKay,0.031693,0.029764
Akira Kurosawa,-0.019524,0.021987
Alan J. Pakula,0.029835,-0.029560
...,...,...
Yimou Zhang,0.014566,0.031873
Yoshiaki Kawajiri,0.018218,-0.004942
Yôjirô Takita,-0.016880,0.004028
Zack Snyder,0.018069,0.014046


In [17]:
# Learned embedding coordinates for each Star1 value (rows are actor names).
dfs['Star1']

Unnamed: 0,Star1_embedding_0,Star1_embedding_1
Aamir Khan,0.013565,0.029552
Aaron Taylor-Johnson,-0.041444,0.033201
Adam Driver,0.048541,0.007756
Adrian Molina,-0.001065,0.043436
Adrien Brody,0.037642,-0.042272
...,...,...
Zach Galifianakis,-0.022611,0.032381
Zain Al Rafeea,0.041549,-0.001310
Zbigniew Zamachowski,-0.038195,-0.035085
Zooey Deschanel,0.020662,-0.039516


In [18]:
# Learned embedding coordinates for each Star2 value (rows are actor names).
dfs['Star2']

Unnamed: 0,Star2_embedding_0,Star2_embedding_1
Adèle Exarchopoulos,-0.107897,-0.027616
Adèle Haenel,0.026335,0.009043
Aidan Gillen,0.020144,0.016151
Akira Terao,-0.008832,-0.025274
Al Pacino,0.025046,0.036333
...,...,...
Zach Galifianakis,-0.027046,-0.033834
Zachary Quinto,0.017971,-0.037913
Zack Gottsagen,-0.028099,-0.023365
Zbigniew Zamachowski,-0.020188,0.021096


In [19]:
# Learned embedding coordinates for each Star3 value (rows are actor names).
dfs['Star3']

Unnamed: 0,Star3_embedding_0,Star3_embedding_1
Aaron Eckhart,0.032401,-0.035829
Adel Bencherif,0.040332,0.030287
Adil Hussain,-0.027419,-0.032654
Aitana Sánchez-Gijón,-0.027923,0.048618
Akio Ôtsuka,-0.048915,0.022348
...,...,...
Zachary Levi,0.009999,-0.026059
Zamira Saunders,-0.045556,-0.009791
Zazie Beetz,-0.036997,0.022295
Ziyi Zhang,0.012431,-0.013831


In [20]:
# Learned embedding coordinates for each Star4 value (rows are actor names).
dfs['Star4']

Unnamed: 0,Star4_embedding_0,Star4_embedding_1
Abigail Breslin,-0.033020,-0.024023
Adam Baldwin,0.025626,-0.021902
Adrien Brody,0.016626,0.031669
Agnes Moorehead,0.041037,0.042836
Ahna Capri,-0.047504,-0.031627
...,...,...
Zach Grenier,-0.010219,-0.049952
Ziyi Zhang,-0.000539,-0.053101
Zoe Saldana,-0.015875,0.009781
Zoë Kravitz,-0.028258,0.045188


In [21]:
# Replace the categorical columns of X with their learned embedding columns.
# NOTE(review): the embeddings were trained on X_train only but are applied
# here to the full X; categories that appear only in the test rows presumably
# keep untrained embedding values -- confirm this is intended.
movies_embed = ce.fit_transform(
  X, 
  embeddings = embeddings, 
  encoders = encoders, 
  # Remove the original categorical variables
  drop_categorical_vars = True)
# Preview the first rows: numerical columns followed by the embedding columns.
movies_embed.head()

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




Unnamed: 0,Released_Year,Runtime,Meta_score,No_of_Votes,Gross,Certificate_embedding_0,Certificate_embedding_1,Genre_embedding_0,Genre_embedding_1,Director_embedding_0,Director_embedding_1,Star1_embedding_0,Star1_embedding_1,Star2_embedding_0,Star2_embedding_1,Star3_embedding_0,Star3_embedding_1,Star4_embedding_0,Star4_embedding_1
0,1972,175,100.0,1620367,134966411,-0.066196,-0.006344,-0.047951,-0.023779,-0.012316,0.033737,-0.023318,-0.028596,0.025046,0.036333,-0.035108,-0.046502,0.020537,-0.041953
1,2008,152,84.0,2303232,534858444,-0.071479,0.006907,0.033201,-0.050324,-0.036907,-0.027483,0.034885,0.013179,0.033447,0.04356,0.032401,-0.035829,0.017372,0.026719
2,1974,202,90.0,1129952,57300000,-0.066196,-0.006344,-0.047951,-0.023779,-0.012316,0.033737,-0.04928,0.015007,0.042775,0.038431,0.044296,0.010789,0.020537,-0.041953
3,1957,96,96.0,689845,4360000,0.003899,0.024966,-0.047951,-0.023779,0.013971,-0.004433,0.062743,0.025548,-0.057803,-0.046061,0.024719,0.038813,-0.023901,0.034952
4,2003,201,94.0,1642758,377845905,0.003899,0.024966,0.0189,0.014724,-0.013382,-0.046693,0.034203,0.028107,0.011014,0.033265,0.040446,0.036982,-0.016073,-0.018696
