In [1]:
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import categorical_embedder as ce

In [2]:
###############################################################################
##                          Data Preprocessing                               ##
###############################################################################

# Load the raw dataset
filmes = pd.read_csv("imdb_movies.csv")

filmes = (
    filmes
    # Drop the first column (described by the original author as an
    # enumeration column)
    .drop(filmes.columns[0], axis=1)
    # Drop the title and overview columns when they are present
    .drop(columns=["Series_Title", "Overview"], errors='ignore')
    .assign(
        # The value 'PG' appears where a year is expected (per the original
        # note, this is the movie Apollo 13); patch it to 1995 and cast the
        # column from object to int32
        Released_Year=lambda df: df['Released_Year'].replace('PG', 1995).astype('int32'),
        # Strip the " min" suffix and cast the column from object to int32
        Runtime=lambda df: df['Runtime'].str.replace(' min', '', regex=False).astype('int32'),
    )
    # Discard every observation that has at least one missing value
    .dropna()
    .assign(
        # Remove the commas, then convert object -> float -> int32.
        # This runs after dropna(), so no missing value reaches the integer cast.
        Gross=lambda df: df['Gross'].str.replace(',', '', regex=False).astype(float).astype('int32'),
    )
)

In [3]:
# Sanity check: (rows, columns) of the cleaned frame
frame_shape = filmes.shape
print(frame_shape)

(713, 13)


In [4]:
# Separate the target (IMDB_Rating) from the feature columns
y = filmes['IMDB_Rating']
X = filmes.drop(columns=['IMDB_Rating'])

In [5]:
# ce.get_embedding_info identifies the categorical variables.
# It returns a dictionary mapping each categorical column to a tuple of
# (number of categories, embedding size).
# Note: the library default is described as an embedding size of half the
# number of categories. Here max_dim=2 is passed and every column in the
# displayed result has an embedding size of 2 (presumably max_dim is an upper
# bound on the embedding size — TODO confirm against the installed version).
# The default can also be overridden by handcrafting the dictionary.
embedding_info = ce.get_embedding_info(X, max_dim=2)
embedding_info

{'Certificate': (12, 2),
 'Genre': (172, 2),
 'Director': (402, 2),
 'Star1': (471, 2),
 'Star2': (598, 2),
 'Star3': (625, 2),
 'Star4': (670, 2)}

In [6]:
# ce.get_label_encoded_data integer-encodes the categorical variables
# and prepares the data to be fed to the neural network.
# It returns two values: the encoded frame and the per-column encoders.
X_encoded, encoders = ce.get_label_encoded_data(X)
# Preview the first rows of the encoded features
X_encoded.head()

Unnamed: 0,Released_Year,Certificate,Runtime,Genre,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,1972,0,175,105,100.0,100,305,4,237,147,1620367,134966411
1,2008,11,152,17,84.0,59,89,195,0,451,2303232,534858444
2,1974,0,202,105,90.0,100,5,461,496,147,1129952,57300000
3,1957,9,96,105,96.0,338,184,327,382,297,689845,4360000
4,2003,9,201,2,94.0,286,132,566,227,493,1642758,377845905


In [7]:
# Inspect the dtype of every column in the cleaned frame
column_dtypes = filmes.dtypes
print(column_dtypes)

Released_Year      int32
Certificate       object
Runtime            int32
Genre             object
IMDB_Rating      float64
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross              int32
dtype: object


In [8]:
# Cast the label-encoded categorical columns to int32
categorical_cols = ['Certificate', 'Genre', 'Director', 'Star1', 'Star2', 'Star3', 'Star4']
X_encoded[categorical_cols] = X_encoded[categorical_cols].astype('int32')

In [9]:
# Show the encoders: a dictionary holding one label encoder per categorical column
encoders

{'Certificate': __LabelEncoder__(),
 'Genre': __LabelEncoder__(),
 'Director': __LabelEncoder__(),
 'Star1': __LabelEncoder__(),
 'Star2': __LabelEncoder__(),
 'Star3': __LabelEncoder__(),
 'Star4': __LabelEncoder__()}

In [10]:
# Split the data into train and test sets.
# A fixed random_state makes the shuffle reproducible, so the split (and every
# result trained on it) is the same after a kernel restart. The split
# proportions are left at the scikit-learn default.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=42)

In [12]:
# ce.get_embeddings trains a neural network model on the training split,
# extracts the embeddings and returns them in a dictionary.
embedding_training_options = dict(
    # Per-column embedding info computed earlier
    categorical_embedding_info=embedding_info,
    # The target (IMDB_Rating) is continuous, so this is not a classification task
    is_classification=False,
    # Number of epochs and batch size
    epochs=100,
    batch_size=64,
)
embeddings = ce.get_embeddings(X_train, y_train, **embedding_training_options)

In [13]:
# Take a look at the learned embeddings: a dictionary mapping each categorical
# column name to a float32 array of embedding vectors
embeddings

{'Certificate': array([[-0.01168322,  0.01153188],
        [ 0.03398109,  0.0375202 ],
        [ 0.00881301, -0.01228234],
        [ 0.01483977,  0.00203636],
        [ 0.02337763,  0.0136626 ],
        [ 0.03184886,  0.01127376],
        [-0.02717971, -0.00060316],
        [-0.06663465,  0.030811  ],
        [ 0.12823392, -0.04122384],
        [ 0.05621505, -0.02406096],
        [-0.00317019, -0.03296407],
        [-0.03809735, -0.04376633]], dtype=float32),
 'Genre': array([[-3.53397280e-02, -3.67787704e-02],
        [-4.17142361e-02,  2.52861679e-02],
        [-8.18392076e-03, -5.26264459e-02],
        [-3.11977626e-03, -4.01142277e-02],
        [ 4.02357467e-02,  1.90802235e-02],
        [-4.74154577e-03, -1.67453717e-02],
        [ 5.43264262e-02,  1.08744828e-02],
        [ 1.56115647e-02, -1.46925952e-02],
        [-4.39511314e-02,  4.37313039e-03],
        [-9.74355545e-03, -3.68148126e-02],
        [-4.51692194e-02,  4.49456237e-02],
        [-3.15899625e-02,  8.04911926e-03],

In [15]:
# Print the shape of each embedding matrix, one categorical column per line
for column_name in ['Certificate', 'Genre', 'Director', 'Star1', 'Star2', 'Star3', 'Star4']:
    print(embeddings[column_name].shape)

(12, 2)
(172, 2)
(402, 2)
(471, 2)
(598, 2)
(625, 2)
(670, 2)


In [16]:
# The dictionary format can be hard to read, so convert the embeddings
# into dataframes (indexed by column name) for easier readability
dfs = ce.get_embeddings_in_dataframe(encoders=encoders, embeddings=embeddings)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




In [17]:
# Embeddings for Certificate: one row per certificate label,
# one column per embedding dimension
dfs['Certificate']

Unnamed: 0,Certificate_embedding_0,Certificate_embedding_1
A,-0.011683,0.011532
Approved,0.033981,0.03752
G,0.008813,-0.012282
GP,0.01484,0.002036
PG,0.023378,0.013663
PG-13,0.031849,0.011274
Passed,-0.02718,-0.000603
R,-0.066635,0.030811
TV-PG,0.128234,-0.041224
U,0.056215,-0.024061


In [18]:
# Embeddings for Genre: one row per genre combination,
# one column per embedding dimension
dfs['Genre']

Unnamed: 0,Genre_embedding_0,Genre_embedding_1
"Action, Adventure",-0.035340,-0.036779
"Action, Adventure, Comedy",-0.041714,0.025286
"Action, Adventure, Drama",-0.008184,-0.052626
"Action, Adventure, Family",-0.003120,-0.040114
"Action, Adventure, Fantasy",0.040236,0.019080
...,...,...
"Horror, Thriller",-0.044237,0.013583
"Mystery, Romance, Thriller",-0.002156,0.034747
"Mystery, Sci-Fi, Thriller",0.015211,-0.032931
"Mystery, Thriller",0.044466,0.002899


In [19]:
# Embeddings for Director: one row per director name,
# one column per embedding dimension
dfs['Director']

Unnamed: 0,Director_embedding_0,Director_embedding_1
Abdellatif Kechiche,0.062080,-0.020307
Abhishek Kapoor,0.048798,0.023349
Adam McKay,0.043696,0.015445
Akira Kurosawa,0.029761,0.025679
Alan J. Pakula,-0.033915,-0.028757
...,...,...
Yimou Zhang,0.038455,-0.010821
Yoshiaki Kawajiri,-0.004409,0.027277
Yôjirô Takita,-0.023794,-0.049014
Zack Snyder,0.008616,0.019507


In [20]:
# Embeddings for Star1: one row per name found in the Star1 column,
# one column per embedding dimension
dfs['Star1']

Unnamed: 0,Star1_embedding_0,Star1_embedding_1
Aamir Khan,0.001817,-0.022587
Aaron Taylor-Johnson,0.026787,-0.000301
Adam Driver,0.066226,0.036399
Adrian Molina,-0.007035,-0.012548
Adrien Brody,0.050915,0.042571
...,...,...
Zach Galifianakis,-0.033900,0.046279
Zain Al Rafeea,0.053124,-0.027170
Zbigniew Zamachowski,0.043168,-0.007086
Zooey Deschanel,0.034319,0.043279


In [21]:
# Embeddings for Star2: one row per name found in the Star2 column,
# one column per embedding dimension
dfs['Star2']

Unnamed: 0,Star2_embedding_0,Star2_embedding_1
Adèle Exarchopoulos,-0.035818,0.044491
Adèle Haenel,-0.031762,0.003712
Aidan Gillen,0.026908,0.015372
Akira Terao,0.019177,-0.026948
Al Pacino,-0.034912,-0.028706
...,...,...
Zach Galifianakis,0.035121,0.020503
Zachary Quinto,0.018827,0.014971
Zack Gottsagen,-0.036628,-0.024835
Zbigniew Zamachowski,0.010739,0.039335


In [22]:
# Embeddings for Star3: one row per name found in the Star3 column,
# one column per embedding dimension
dfs['Star3']

Unnamed: 0,Star3_embedding_0,Star3_embedding_1
Aaron Eckhart,0.055237,0.045083
Adel Bencherif,0.013835,-0.023419
Adil Hussain,-0.033843,0.043203
Aitana Sánchez-Gijón,0.032532,0.020851
Akio Ôtsuka,-0.032060,-0.011171
...,...,...
Zachary Levi,0.019460,0.053438
Zamira Saunders,-0.037754,-0.075471
Zazie Beetz,0.003873,-0.048903
Ziyi Zhang,0.044914,0.013565


In [23]:
# Embeddings for Star4: one row per name found in the Star4 column,
# one column per embedding dimension
dfs['Star4']

Unnamed: 0,Star4_embedding_0,Star4_embedding_1
Abigail Breslin,0.048465,0.032460
Adam Baldwin,-0.050857,0.027962
Adrien Brody,-0.013267,0.020293
Agnes Moorehead,0.007057,0.004591
Ahna Capri,0.020232,0.026350
...,...,...
Zach Grenier,0.034983,0.053222
Ziyi Zhang,0.017361,-0.043472
Zoe Saldana,-0.007189,-0.058438
Zoë Kravitz,0.028348,0.015381


In [25]:
# Add the learned embeddings to the feature set.
# drop_categorical_vars=True removes the original categorical variables,
# leaving the numeric columns plus the *_embedding_* columns.
embedding_sources = dict(embeddings=embeddings, encoders=encoders)
movies_embed = ce.fit_transform(X, drop_categorical_vars=True, **embedding_sources)
movies_embed.head()

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




Unnamed: 0,Released_Year,Runtime,Meta_score,No_of_Votes,Gross,Certificate_embedding_0,Certificate_embedding_1,Genre_embedding_0,Genre_embedding_1,Director_embedding_0,Director_embedding_1,Star1_embedding_0,Star1_embedding_1,Star2_embedding_0,Star2_embedding_1,Star3_embedding_0,Star3_embedding_1,Star4_embedding_0,Star4_embedding_1
0,1972,175,100.0,1620367,134966411,-0.011683,0.011532,0.007072,0.025929,-0.027675,-0.014829,0.022552,0.033144,-0.034912,-0.028706,-0.011019,0.024897,-0.050023,0.017888
1,2008,152,84.0,2303232,534858444,-0.038097,-0.043766,0.033611,-0.009199,-0.029031,-0.014087,-0.023811,0.051136,0.03643,-0.011816,0.055237,0.045083,0.018731,-0.033088
2,1974,202,90.0,1129952,57300000,-0.011683,0.011532,0.007072,0.025929,-0.027675,-0.014829,0.028424,0.014098,0.035929,0.002952,0.047186,0.031672,-0.050023,0.017888
3,1957,96,96.0,689845,4360000,0.056215,-0.024061,0.007072,0.025929,0.020711,-0.040781,0.069311,-0.011561,0.040635,0.048134,-0.033445,-0.048813,-0.008237,-0.004212
4,2003,201,94.0,1642758,377845905,0.056215,-0.024061,-0.008184,-0.052626,0.026749,0.003347,-0.0224,-0.015912,0.001906,0.032799,-0.045789,0.005273,-0.005704,-0.044684
