In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('./datasets/wiki_movie_plots_deduped.csv', encoding='utf-8')
display(df.head())
print(df.shape)

Unnamed: 0,Release Year,Title,Origin_Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


(25533, 8)


# Text standardization

In [2]:
import swifter
import spacy
nlp = spacy.load('en_core_web_md', disable=["tagger", "parser", "ner"])

In [3]:
def standardize_texts(text:str):
    doc = nlp(text)
    lemmatized_words = []
    for token in doc:
        if not token.is_stop:
            lemmatized_words.append(token.lemma_)
    return ' '.join(lemmatized_words)

In [4]:
df['lemmatized_text'] = df.Plot.swifter.apply(standardize_texts)
df.head(10)

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=25533.0, style=ProgressStyle(descripti…




Unnamed: 0,Release Year,Title,Origin_Ethnicity,Director,Cast,Genre,Wiki Page,Plot,lemmatized_text
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...","bartender work saloon , serve drink customer ...."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...","moon , paint smile face hang park night . youn..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...","film , minute long , compose shot . , girl sit..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,"Lasting 61 second consist shot , shoot set woo..."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,"early know adaptation classic fairytale , film..."
5,1903,Alice in Wonderland,American,Cecil Hepworth,May Clark,unknown,https://en.wikipedia.org/wiki/Alice_in_Wonderl...,"Alice follows a large white rabbit down a ""Rab...","Alice follow large white rabbit "" Rabbit - hol..."
6,1903,The Great Train Robbery,American,Edwin S. Porter,,western,https://en.wikipedia.org/wiki/The_Great_Train_...,The film opens with two bandits breaking into ...,film open bandit break railroad telegraph offi...
7,1904,The Suburbanite,American,Wallace McCutcheon,,comedy,https://en.wikipedia.org/wiki/The_Suburbanite,The film is about a family who move to the sub...,"film family suburb , hope quiet life . Things ..."
8,1905,The Little Train Robbery,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Little_Train...,The opening scene shows the interior of the ro...,open scene show interior robber ' den . wall d...
9,1905,The Night Before Christmas,American,Edwin Stanton Porter,,unknown,https://en.wikipedia.org/wiki/The_Night_Before...,Scenes are introduced using lines of the poem....,"Scenes introduce line poem.[2 ] Santa Claus , ..."


# Text preprocessing

In [5]:
import tensorflow as tf

## Keras tokenizers

In [6]:
tok = tf.keras.preprocessing.text.Tokenizer()
tok.fit_on_texts(df.lemmatized_text.to_list())
tok.word_index.keys()



# Use Glove Word Embedding matrix

In [7]:
import numpy as np
from typing import Tuple, Dict

def load_glove_vectors(glove_path:str,
                       word_index: dict
                       ) -> Tuple[Dict, np.ndarray, int]:
    '''
    

    Parameters
    ----------
    glove_path : str
        The path of glove vector file.
    word_index : dict
        The word index of words in vocabulary.
        Format: Ideally Keras Tokenizer generates word indices by ranking them
        in descending order of counts in corpus
        {'word1':rank_word1,
         'word2':rank_word2
         }

    Returns
    -------
    embeddings_index : Dict
        Dictionary in the form {'word1':np.ndarray(embedding_vector1)}.
    embedding_matrix : np.ndarray
        Numpy array with shape (v+1,e)
        where v = number of words in vocabulary
        e = embedding dimension
    embedding_dim : int
        The embedding dimension of the word vectors

    '''
    # to return embeddings_index, embedding_matrix
    embeddings_index = {}
    f = open(glove_path, encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    embedding_dim = list(embeddings_index.values())[0].shape[0]
    embedding_matrix = np.zeros((len(word_index)+1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    
    print('Info: Dimensionality of word vector: {}'.format(embedding_dim))
    return embeddings_index, embedding_matrix, embedding_dim

In [9]:
glove_path = r'C:\Users\sandipto.sanyal\Documents\AMPBA others\glove.6B.50d.txt'
word_index = tok.word_index
embeddings_index, embedding_matrix, embedding_dim = load_glove_vectors(glove_path, word_index)
print(embedding_matrix[2])
print(embeddings_index.get('the'))

Info: Dimensionality of word vector: 50
[ 1.08019996  0.085736    0.28167    -0.19272999  0.93193001 -0.10568
 -0.80243999  0.42669001  0.22081     0.12251    -0.31736001  0.55053002
 -0.061096    0.13448     0.74414003  0.55046999  0.71789998 -0.14793999
  0.47325    -0.90377998 -0.14574     0.52746999  0.057465    0.63846999
  0.65556002 -1.59249997 -0.87665999 -0.081343    0.87799001 -0.68603998
  3.12809992  0.30949    -0.42348    -0.74618     0.24501     0.30605
 -0.30989999  0.16848999  0.18700001 -0.60935998 -0.058143   -0.034153
  0.34744999  0.49639001  0.53131002  0.11259     0.030348    0.062295
  0.16278     0.17376   ]
[ 4.1800e-01  2.4968e-01 -4.1242e-01  1.2170e-01  3.4527e-01 -4.4457e-02
 -4.9688e-01 -1.7862e-01 -6.6023e-04 -6.5660e-01  2.7843e-01 -1.4767e-01
 -5.5677e-01  1.4658e-01 -9.5095e-03  1.1658e-02  1.0204e-01 -1.2792e-01
 -8.4430e-01 -1.2181e-01 -1.6801e-02 -3.3279e-01 -1.5520e-01 -2.3131e-01
 -1.9181e-01 -1.8823e+00 -7.6746e-01  9.9051e-02 -4.2125e-01 -1.9526

# Create the X matrix

In [11]:
max_sequence_length = 50
sequences = tok.texts_to_sequences(df.Plot.to_list())
print('Some sequences:::')
print(sequences[0:2])
X = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_sequence_length)
print('Sequences padded:::')
print(X[0])

Some sequences:::
[[1015, 3783, 40468, 6668, 36602, 1015, 2145, 36243, 41799, 5579, 65733, 33712, 29475, 1015, 26876, 2333, 62334, 5855, 44057, 2886, 2324, 2898, 13949, 10222, 69194, 1716, 236, 59359, 881, 17248, 2333, 6, 22079, 61408, 2133, 31287, 61408, 6526, 13949, 61174, 56956, 17248, 2886, 31287, 61408, 81, 17248, 96, 61174, 29, 36681, 17248, 385, 45190, 17248, 37931, 13949, 14990, 17248, 959, 4368, 17248, 3783, 61174, 25431, 300, 30664, 169, 49208, 1015, 96, 57412, 26879, 146, 13949, 68, 2899, 5579, 3, 492], [17248, 1293, 30556, 44057, 1015, 12288, 169, 31287, 1015, 439, 36602, 50, 1015, 46, 225, 8745, 331, 1015, 2349, 66, 31124, 1015, 33744, 13949, 104, 6635, 17248, 1293, 34825, 59359, 1151, 13949, 17248, 1588, 48230, 29220, 59359, 61174, 626, 6760, 31124, 1015, 5023, 45151, 1015, 598, 17248, 1050, 40468, 57766, 58444, 7989, 5579, 17232, 30664, 17248, 6211, 170, 17248, 6, 40105, 17248, 44, 44057, 61408, 2133, 24625, 17248, 1293, 62659, 7065, 17248, 1680, 13949, 40468, 47041, 312

# Create the y vector

In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df.Origin_Ethnicity)
y = y.reshape(len(y),1)
le.classes_

array(['American', 'Assamese', 'Australian', 'Bangladeshi', 'Bengali',
       'Bollywood', 'British', 'Canadian', 'Chinese', 'Egyptian',
       'Filipino', 'Hong Kong'], dtype=object)

# Create the model pieces

## Convolution layers

In [18]:
from tensorflow.keras.layers import Embedding, Input, Conv1D, MaxPooling1D, Flatten, Dense
embedding_layer = Embedding(len(word_index) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_sequence_length,
                            trainable=False)
sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, 5, activation='relu')(embedded_sequences)
x = MaxPooling1D(5)(x)
x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(le.classes_), activation='softmax')(x)

## RNN

In [14]:
from tensorflow.keras.layers import Embedding, Input, SimpleRNN, Dense
embedding_layer = Embedding(len(word_index) + 1,
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_sequence_length,
                            trainable=False)
sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = SimpleRNN(64,return_sequences=True)(embedded_sequences)
x = SimpleRNN(64)(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(le.classes_), activation='softmax')(x)

# Compile the model

In [19]:
from tensorflow.keras.models import Model
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 50, 50)            4760400   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 46, 128)           32128     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 9, 128)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 5, 128)            82048     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 1, 128)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 128)               0   

# Train the model

In [23]:
model.fit(X, 
          tf.keras.utils.to_categorical(y),
          batch_size=20,epochs=100,validation_split=0.3)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
124/894 [===>..........................] - ETA: 3s - loss: 0.0028 - acc: 0.9984

KeyboardInterrupt: 