In [17]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import functools
import string
import joblib
import warnings
import itertools
import scipy.stats as st
from math import sqrt
import spacy
import string
from scipy.special import expit, logit

import sklearn.preprocessing           as pre
import sklearn.pipeline                as pipe
import sklearn.impute                  as imp
import sklearn.compose                 as pipe2
import sklearn.dummy                   as dum
import sklearn.metrics                 as metr
import sklearn.linear_model            as lm
import sklearn.model_selection         as cv
import sklearn.tree                    as tree
import sklearn.ensemble                as ensem
import sklearn.base                    as base
# import sklearn.feature_extraction.text as text
import sklearn.decomposition           as decomp
import sklearn.naive_bayes             as bayes
import sklearn.svm                     as svm

%matplotlib inline
plt.ioff()
sns.set_style('darkgrid')
sns.set_context('talk')

from utilities import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dense, LSTM, Embedding
from keras.layers import Dropout, Activation, Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.preprocessing import text, sequence

In [None]:
# Load the Data
def col_rename(df):
    """Return a new DataFrame whose column labels are lower-cased with spaces replaced by underscores."""
    def normalise(label):
        return label.lower().replace(' ', '_')

    return df.rename(columns=normalise)

# Read the train and test CSVs (first column is the index) and normalise
# their column names with col_rename.
data, out_of_sample = (
    col_rename(pd.read_csv(csv_path, index_col=0))
    for csv_path in ('movie_train.csv', 'movie_test.csv')
)
oos = out_of_sample  # alias

In [20]:
# Prepare the target
# Encode the genre column as an indicator matrix (one column per class);
# the column order follows label_binarizer.classes_.
label_binarizer = pre.LabelBinarizer()
y_all = label_binarizer.fit_transform(data.genre)

# Display the class order: the output recorded for this cell is this array,
# which an assignment alone cannot produce.
label_binarizer.classes_

array(['action', 'adventure', 'comedy', 'crime', 'drama', 'horror',
       'romance', 'thriller', 'western'], dtype='<U9')

In [105]:
# Prepare the inputs
seq_len = 512       # every plot is padded/truncated to this many token ids
vocab_size = 20000  # passed to the tokenizer as num_words and to the Embedding layer

# NOTE(review): the tokenizer is fitted on every plot before the train/test
# split further down, so its vocabulary also reflects the held-out rows.
tokenizer = text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(data['plot'])
temp   = tokenizer.texts_to_sequences(data['plot'])
# pad_sequences already returns a NumPy array, so wrapping it in np.array()
# only made an extra copy.
X_all  = sequence.pad_sequences(temp, maxlen=seq_len)

In [106]:
# Split inputs and targets with a fixed seed so the partition is reproducible.
(X_train, X_test,
 y_train, y_test) = cv.train_test_split(X_all, y_all, random_state=42)

# Short aliases for the training portion
X, y = X_train, y_train

In [107]:
from keras.models import Sequential

In [108]:
from keras.layers import SpatialDropout1D

In [109]:
embedding_size = 128
# Size the output layer from the fitted label encoder instead of hard-coding 9,
# so the network stays in step with the classes actually present in the data.
n_classes = len(label_binarizer.classes_)

# Embedding -> LSTM -> dense head; the softmax layer emits one probability per class.
model = Sequential([
    Embedding(vocab_size, embedding_size, input_length=seq_len),
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(n_classes, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 512, 128)          2560000   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 512, 128)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_5 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 9)                 585       
Total params: 2,614,153
Trainable params: 2,614,153
Non-trainable params: 0
____________________________________________

In [110]:
# NOTE(review): training is commented out, so a fresh "Restart & Run All"
# leaves `model` untrained. The log recorded under this cell (10 epochs,
# 3009.618 seconds) comes from an earlier execution of these lines.
# Timer.start()
# model.fit(X, y, epochs=10, batch_size=64, validation_split=0.1, )
# Timer.end()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 7209 samples, validate on 802 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
3009.618 seconds elapsed


In [111]:
# Persist the model to disk. `dump` is not imported by name in this notebook;
# presumably it arrives via `from utilities import *` — TODO confirm.
dump(model, './models/lstm-10.joblib')

['./models/lstm-10.joblib']

In [112]:
# Predict on the held-out split; the softmax output layer yields one
# probability per class for every row of X_test.
y_pred = model.predict(X_test)

In [113]:
# Inspect the predicted probabilities (one row per test sample, one column per class).
y_pred

array([[4.7190595e-05, 6.3265958e-07, 9.1674458e-04, ..., 6.1362755e-04,
        1.2023610e-04, 8.4919208e-08],
       [9.0283269e-05, 7.1108201e-07, 6.9276168e-04, ..., 1.3426220e-03,
        9.4089482e-05, 7.6597161e-08],
       [6.3704669e-05, 6.9787801e-04, 8.6028689e-01, ..., 1.3770338e-03,
        4.3710200e-03, 3.3507506e-05],
       ...,
       [3.9452785e-03, 1.2819510e-04, 1.7320499e-02, ..., 9.5915750e-02,
        1.1666388e-03, 1.8871109e-05],
       [6.5834211e-05, 2.0830344e-06, 2.5563759e-03, ..., 1.8407014e-03,
        2.1673480e-04, 3.2040691e-07],
       [3.4165234e-04, 4.3226642e-06, 7.5082318e-04, ..., 2.2828295e-03,
        4.8908531e-03, 1.2573242e-06]], dtype=float32)

In [114]:
# Inspect the true targets in their binarized (indicator-matrix) form.
y_test

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [115]:
# Map the indicator rows back to their genre names.
label_binarizer.inverse_transform(y_test)

array(['comedy', 'drama', 'comedy', ..., 'romance', 'comedy', 'drama'],
      dtype='<U9')

In [116]:
# Multiclass log loss on the held-out split. Passing `labels` pins the column
# order of y_pred to the binarizer's classes and keeps the call valid even if
# some class happens to be absent from y_test.
metr.log_loss(
    label_binarizer.inverse_transform(y_test),
    y_pred,
    labels=label_binarizer.classes_,
)

3.2799840709960058

In [117]:
def hard_predict(label_binarizer, probs):
    """Turn per-class scores into hard class labels.

    Parameters
    ----------
    label_binarizer : object exposing a ``classes_`` sequence
        Column ``j`` of ``probs`` is taken to correspond to ``classes_[j]``.
    probs : array-like, indexed as (sample, class)
        Scores or probabilities, one row per sample.

    Returns
    -------
    numpy.ndarray
        For each row, the class whose column holds the largest value
        (the first such column on ties).
    """
    # Index the class array directly with the arg-max positions. Unlike the
    # previous np.vectorize-over-dict lookup, this also works when `probs`
    # has zero rows (np.vectorize raises on size-0 input without otypes).
    best = np.asarray(probs).argmax(axis=1)
    return np.asarray(label_binarizer.classes_)[best]

In [118]:
# Per-class precision / recall / F1 on the held-out split, comparing the true
# genre names with the arg-max predictions.
print(
    metr.classification_report(
        y_true=label_binarizer.inverse_transform(y_test),
        y_pred=hard_predict(label_binarizer, y_pred),
    )
)

              precision    recall  f1-score   support

      action       0.22      0.17      0.19       209
   adventure       0.14      0.08      0.11        96
      comedy       0.46      0.38      0.41       692
       crime       0.08      0.13      0.10        79
       drama       0.44      0.48      0.46       952
      horror       0.44      0.38      0.41       216
     romance       0.15      0.20      0.17       144
    thriller       0.11      0.16      0.13       161
     western       0.44      0.34      0.38       122

    accuracy                           0.35      2671
   macro avg       0.27      0.26      0.26      2671
weighted avg       0.37      0.35      0.36      2671



In [50]:
# NOTE(review): this cell's execution count (50) is lower than the cells above
# it, i.e. it was run out of order. It is a single-epoch trial (batch size 32)
# left commented out; the recorded log shows 371.752 seconds for that run.
# Timer.start()
# model.fit(X, y, epochs=1, batch_size=32, validation_split=0.1)
# Timer.end()

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 7209 samples, validate on 802 samples
Epoch 1/1
371.752 seconds elapsed
