In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
import seaborn as sns
import matplotlib.pyplot as plt
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, GRU, Input, Dense, Dropout, Attention, TimeDistributed, Embedding, Concatenate, Reshape, Bidirectional
from keras.callbacks import EarlyStopping

In [None]:
# Load the Wikipedia movie plots dataset from the (relative) input directory.
plots_csv_path = "../input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv"
df = pd.read_csv(plots_csv_path)

In [None]:
# Bar chart of the number of films per 'Origin/Ethnicity' value.
fig, ax1 = plt.subplots(figsize=(24, 9))

sns.countplot(data=df, x='Origin/Ethnicity', ax=ax1)

In [None]:
# Keep only the films whose origin is one of the selected groups.
# NOTE(review): the original filter compared against 'Malyalam', which looks
# like a misspelling of 'Malayalam'; an `==` test on a misspelled label matches
# no rows, so that group would be silently dropped. Confirm the exact labels
# with df['Origin/Ethnicity'].unique().
selected_origins = ['American', 'Telugu', 'Bollywood', 'Tamil', 'Malayalam']
train_df = df[df['Origin/Ethnicity'].isin(selected_origins)]

In [None]:
# Take the first half of the rows *by position*.
# `.loc[:len(train_df)/2]` was a label-based slice (inclusive of the end label)
# with a float bound; because train_df is a filtered frame its index labels are
# not 0..n-1, so that slice did not reliably select half of the rows.
df_red = train_df.iloc[: len(train_df) // 2]

In [None]:
# Fit a tokenizer on the plot texts and encode each plot as a list of word ids.
plot_texts = df_red['Plot']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(plot_texts)
X_seq = tokenizer.texts_to_sequences(plot_texts)

# Length (in tokens) of the longest encoded plot.
max_sequence_len = max(map(len, X_seq))

# No maxlen is given, so every sequence is padded to the longest one.
X_pad = pad_sequences(X_seq)

In [None]:
# Sanity check: longest plot length and the shape of the padded plot matrix.
(max_sequence_len, X_pad.shape)

In [None]:
# One slot per known plot word, plus one extra for index 0 (used as padding).
vocab_size_plot = 1 + len(tokenizer.word_index)

In [None]:
# Separate tokenizer for the titles (the decoder-side vocabulary).
title_texts = df_red['Title']
tokenizer2 = Tokenizer()
tokenizer2.fit_on_texts(title_texts)
Y_seq = tokenizer2.texts_to_sequences(title_texts)

# Length (in tokens) of the longest encoded title.
max_sequence_len_y = max(map(len, Y_seq))

# Pad every title to that longest length.
Y_pad = pad_sequences(Y_seq, maxlen=max_sequence_len_y)

In [None]:
# Sanity check: longest title length and the shape of the padded title matrix.
(max_sequence_len_y, Y_pad.shape)

In [None]:
# One slot per known title word, plus one extra for index 0 (used as padding).
vocab_size_title = 1 + len(tokenizer2.word_index)

In [None]:
from keras import backend as K

# Reset Keras' global state before (re)building the model.
K.clear_session()

# Number of units in every LSTM layer below.
latent_dim = 120

# ---------------------------------------------------------------- Encoder ---
# Padded plot token ids -> 40-dimensional trainable embeddings.
encoder_inputs = Input(shape=(max_sequence_len,))
enc_emb = Embedding(input_dim=vocab_size_plot, output_dim=40, trainable=True)(encoder_inputs)

# Three stacked LSTMs; each one returns the full output sequence plus its
# final hidden and cell states.
encoder_lstm1 = LSTM(units=latent_dim, return_sequences=True, return_state=True)
encoder_output1, state_h1, state_c1 = encoder_lstm1(enc_emb)

encoder_lstm2 = LSTM(units=latent_dim, return_sequences=True, return_state=True)
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1)

encoder_lstm3 = LSTM(units=latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm3(encoder_output2)

# Only the last encoder layer's final states are handed to the decoder.
encoder_states = [state_h, state_c]

# ---------------------------------------------------------------- Decoder ---
# Variable-length title token ids -> 20-dimensional trainable embeddings.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(input_dim=vocab_size_title, output_dim=20, trainable=True)
dec_emb = dec_emb_layer(decoder_inputs)

# Decoder LSTM, initialised with the encoder's final states. Despite their
# names, the two extra return values are the LSTM's final hidden state and
# final cell state (this layer is not bidirectional).
decoder_lstm = LSTM(units=latent_dim, return_sequences=True, return_state=True)
decoder_outputs, decoder_fwd_state, decoder_back_state = decoder_lstm(
    dec_emb, initial_state=encoder_states
)

# Disabled attention variant, kept for reference (not part of the model):
#   attn_layer = Attention()([encoder_outputs, decoder_outputs])
#   decoder_concat_input = Concatenate(axis=-1, name='concat_layer')([decoder_outputs, attn_layer])
#   decoder_dense = TimeDistributed(Dense(vocab_size_title, activation='softmax'))
#   decoder_outputs = decoder_dense(decoder_concat_input)

# Per-timestep softmax over the title vocabulary.
decoder_outputs = Dense(units=vocab_size_title, activation='softmax')(decoder_outputs)

# ------------------------------------------------------------------ Model ---
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Where the best model seen during training is written.
checkpoint_filepath = 'Movie_title_predictor'

# Save the full model (not just weights) whenever the monitored metric improves.
# The checkpoint now tracks 'val_loss' instead of the training 'loss': with
# save_best_only=True, "best" should be judged on held-out data, which is also
# the metric EarlyStopping below watches. This requires validation data to be
# supplied to fit() (e.g. via validation_split).
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Stop training once the validation loss stops decreasing.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

In [None]:
# Shapes of the encoder input, the row-reversed titles, and the titles with a
# trailing singleton axis added.
(X_pad.shape, Y_pad[::-1].shape, Y_pad[..., np.newaxis].shape)

In [None]:
# Teacher forcing: at step t the decoder is fed the target token from step t-1
# and must predict the token at step t. Index 0 (the padding id) serves as the
# start token at step 0.
# The previous decoder input, Y_pad[::-1], reversed axis 0 -- i.e. the order of
# the *samples* -- so each plot was paired with a different movie's title as
# decoder input while being scored against its own title.
decoder_input_data = np.zeros_like(Y_pad)
decoder_input_data[:, 1:] = Y_pad[:, :-1]

# Targets with a trailing singleton axis: (n_samples, title_len, 1).
decoder_target_data = Y_pad.reshape(Y_pad.shape[0], Y_pad.shape[1], 1)

history = model.fit(
    [X_pad, decoder_input_data],
    decoder_target_data,
    epochs=10,
    batch_size=16,
    callbacks=[model_checkpoint_callback, es],
    validation_split=0.2,
)

In [None]:
# Write the trained model to disk.
final_model_path = 'movie_title_generator'
model.save(final_model_path)