# Imports

In [22]:
import sys
from google.colab import drive
drive.mount('/content/gdrive')
sys.path.append('/content/gdrive/My Drive/Colab Notebooks/Multi_Label_Text_Classification')
base_dir = 'gdrive/My Drive/Colab Notebooks/Multi_Label_Text_Classification/'

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
!pip3 install --quiet "tensorflow>=1.7"
!pip3 install --quiet tensorflow-hub

In [115]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import glob
import functools 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import missingno as msno
import re
import os.path
import math
from sklearn.model_selection import StratifiedShuffleSplit
#from sklearn.cross_validation import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from scipy import sparse
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
#from skmultilearn.problem_transform import LabelPowerset
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
import scipy

import nltk
from wordcloud import WordCloud
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import unicodedata

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')
nltk.download('wordnet')

sns.set_style("whitegrid")
sns.set_context("talk", font_scale=0.8)

from helper_functions import *
rdm_seed = 29
np.random.seed(rdm_seed)
import keras
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import ReduceLROnPlateau
from keras.models import load_model
import keras.optimizers

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# MODELING

**Loading the input**

In [0]:
#mydata_train = pd.read_csv('./../Data/preprocessed/movies_genres_train_preprocessed.csv')
#mydata_test = pd.read_csv('./../Data/preprocessed/movies_genres_test_preprocessed.csv')
#mydata = pd.read_csv('../Data/movies_genres.csv', delimiter='\t')

mydata_train = pd.read_csv(base_dir+'Data/preprocessed/movies_genres_train_preprocessed.csv')
mydata_test = pd.read_csv(base_dir+'Data/preprocessed/movies_genres_test_preprocessed.csv')
mydata = pd.read_csv(base_dir+'Data/movies_genres.csv', delimiter='\t')

In [0]:
train_X, train_y = mydata_train['plot'], mydata_train.drop(['title', 'plot', 'plot_lang'], axis=1)
test_X, test_y = mydata_test['plot'], mydata_test.drop(['title', 'plot', 'plot_lang'], axis=1)

category_columns = train_y.columns

In [0]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"
embed = hub.Module(module_url)

## Obtain Plot Embedding

In [0]:
# embed_movie_plots(train_X, train_or_test='train')
# embed_movie_plots(test_X, train_or_test='test')

train_files = glob.glob(base_dir+"Data/preprocessed/embed_vector/*train*.npy")
train_vector_set = []
for file in train_files:
  train_vector_set.append(np.load(file))
train_vector = np.concatenate(train_vector_set)

test_files = glob.glob(base_dir+"Data/preprocessed/embed_vector/*test*.npy")
test_vector_set = []
for file in test_files:
    test_vector_set.append(np.load(file))
test_vector = np.concatenate(test_vector_set)

## LabelPowerset
We use a Neural Network model to make prediction among one of the 1505 unique genre combinations in our training data set. 
* Input Layer consists of 512 features
* Output Layer consists of 1505 nodes representing the each of the unique genre combinations  
  * We use softmax activation function since the classifier has to output one among the 1505 combinations
* Hidden Layers - number of nodes in the hidden layer has to be in between the number of input and output nodes for optimal performance. We select 1024 neurons
* Dropout of 20%. To avoid overfit, we randomly drop out 20% of the neurons in the hidden layer

In [0]:
# Creating a LUT for the 1505 labels
train_y_labels= train_y.groupby(list(category_columns)).ngroup()
y_labels_lut = train_y.copy(deep=True) 
y_labels_lut['Labels'] = train_y_labels
y_labels_lut = y_labels_lut.drop_duplicates()
y_labels_lut = y_labels_lut.reset_index(drop=True).set_index('Labels').sort_index()


# One-hot encoding the output labels
num_classes = y_labels_lut.shape[0]
train_y_onehot = np_utils.to_categorical(train_y_labels, num_classes = num_classes)

In [0]:
def gen_model(optimizer):
  model = Sequential()
  model.add(Dense(1024, activation='relu', input_shape=(512,)))
  model.add(Dropout(0.5))
  model.add(Dense(1505, activation='softmax'))
  model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
  return model
lr_reduction = ReduceLROnPlateau(monitor='val_acc', 
                                            patience=2, 
                                            verbose=1, 
                                            factor=0.5, 
                                            min_lr=0.0001)

**Stochastic Gradient Descent Optimizer** 

In [123]:
epochs, batch_size = 20, 128
model = gen_model(keras.optimizers.SGD(lr=1))
model.fit(train_vector, train_y_onehot,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.3,
          callbacks=[lr_reduction])

Train on 66060 samples, validate on 28312 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.5.
Epoch 6/20
Epoch 7/20

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.25.
Epoch 8/20
Epoch 9/20
Epoch 10/20

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.125.
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0625.
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.03125.


<keras.callbacks.History at 0x7fa33589ff28>

In [124]:
y_pred = model.predict(test_vector)
y_pred_label = pd.DataFrame(np.argmax(y_pred, axis=1))
predictions = pd.merge(y_pred_label, y_labels_lut, how='left', left_on=0, right_on='Labels')[category_columns]
accuracy(test_y, predictions)

Unnamed: 0,Precision,Recall,F1-Score,Support
Action,0.8,0.31,0.45,4321.0
Adult,0.0,0.0,0.0,11.0
Adventure,0.77,0.28,0.41,3496.0
Animation,0.79,0.58,0.67,3333.0
Biography,0.47,0.02,0.04,354.0
Comedy,0.71,0.48,0.57,7320.0
Crime,0.8,0.55,0.65,4453.0
Documentary,0.56,0.6,0.58,1863.0
Drama,0.83,0.72,0.77,11067.0
Family,0.78,0.31,0.45,4173.0


**Adam Optimizer**

In [131]:
epochs, batch_size = 20, 128
model = gen_model(keras.optimizers.Adam(lr=0.001))
model.fit(train_vector, train_y_onehot,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.3,
          callbacks=[lr_reduction])

Train on 66060 samples, validate on 28312 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 8/20
Epoch 9/20

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 10/20
Epoch 11/20
Epoch 12/20

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 13/20
Epoch 14/20

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.0001.
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa334b32588>

In [132]:
y_pred = model.predict(test_vector)
y_pred_label = pd.DataFrame(np.argmax(y_pred, axis=1))
predictions = pd.merge(y_pred_label, y_labels_lut, how='left', left_on=0, right_on='Labels')[category_columns]
accuracy(test_y, predictions)

Unnamed: 0,Precision,Recall,F1-Score,Support
Action,0.8,0.43,0.56,4321.0
Adult,0.5,0.09,0.15,11.0
Adventure,0.8,0.41,0.54,3496.0
Animation,0.82,0.64,0.72,3333.0
Biography,0.59,0.16,0.25,354.0
Comedy,0.73,0.56,0.63,7320.0
Crime,0.82,0.6,0.69,4453.0
Documentary,0.6,0.64,0.62,1863.0
Drama,0.85,0.74,0.79,11067.0
Family,0.79,0.4,0.53,4173.0


**RMSProp Optimizer**

In [125]:
epochs, batch_size = 20, 128
model = gen_model(keras.optimizers.RMSprop(lr=0.001))
model.fit(train_vector, train_y_onehot,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.3,
          callbacks=[lr_reduction])

Train on 66060 samples, validate on 28312 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 8/20
Epoch 9/20

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 10/20
Epoch 11/20

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0001.
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa3358bb2e8>

In [126]:
y_pred = model.predict(test_vector)
y_pred_label = pd.DataFrame(np.argmax(y_pred, axis=1))
predictions = pd.merge(y_pred_label, y_labels_lut, how='left', left_on=0, right_on='Labels')[category_columns]
accuracy(test_y, predictions)

Unnamed: 0,Precision,Recall,F1-Score,Support
Action,0.81,0.22,0.35,4321.0
Adult,0.0,0.0,0.0,11.0
Adventure,0.81,0.2,0.32,3496.0
Animation,0.78,0.53,0.63,3333.0
Biography,0.64,0.03,0.05,354.0
Comedy,0.7,0.5,0.58,7320.0
Crime,0.79,0.54,0.64,4453.0
Documentary,0.58,0.6,0.59,1863.0
Drama,0.84,0.71,0.77,11067.0
Family,0.84,0.22,0.35,4173.0


**Adagrad**

In [127]:
epochs, batch_size = 20, 128
model = gen_model(keras.optimizers.Adagrad(lr=0.01))
model.fit(train_vector, train_y_onehot,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.3,
          callbacks=[lr_reduction])

Train on 66060 samples, validate on 28312 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.004999999888241291.
Epoch 11/20
Epoch 12/20

Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 13/20
Epoch 14/20

Epoch 00014: ReduceLROnPlateau reducing learning rate to 0.0012499999720603228.
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20

Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.0006249999860301614.
Epoch 20/20


<keras.callbacks.History at 0x7fa334de9be0>

In [128]:
y_pred = model.predict(test_vector)
y_pred_label = pd.DataFrame(np.argmax(y_pred, axis=1))
predictions = pd.merge(y_pred_label, y_labels_lut, how='left', left_on=0, right_on='Labels')[category_columns]
accuracy(test_y, predictions)

Unnamed: 0,Precision,Recall,F1-Score,Support
Action,0.81,0.34,0.47,4321.0
Adult,0.0,0.0,0.0,11.0
Adventure,0.78,0.3,0.43,3496.0
Animation,0.81,0.59,0.68,3333.0
Biography,0.65,0.09,0.15,354.0
Comedy,0.71,0.5,0.59,7320.0
Crime,0.8,0.56,0.66,4453.0
Documentary,0.55,0.61,0.58,1863.0
Drama,0.84,0.72,0.77,11067.0
Family,0.78,0.29,0.42,4173.0


**Adadelta**

In [129]:
epochs, batch_size = 20, 128
model = gen_model(keras.optimizers.Adadelta(lr=1.0))
model.fit(train_vector, train_y_onehot,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.3,
          callbacks=[lr_reduction])

Train on 66060 samples, validate on 28312 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20

Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.5.
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20

Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.25.
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20

Epoch 00019: ReduceLROnPlateau reducing learning rate to 0.125.
Epoch 20/20


<keras.callbacks.History at 0x7fa334b1f550>

In [130]:
y_pred = model.predict(test_vector)
y_pred_label = pd.DataFrame(np.argmax(y_pred, axis=1))
predictions = pd.merge(y_pred_label, y_labels_lut, how='left', left_on=0, right_on='Labels')[category_columns]
accuracy(test_y, predictions)

Unnamed: 0,Precision,Recall,F1-Score,Support
Action,0.81,0.28,0.42,4321.0
Adult,0.0,0.0,0.0,11.0
Adventure,0.77,0.25,0.38,3496.0
Animation,0.8,0.58,0.67,3333.0
Biography,0.53,0.03,0.05,354.0
Comedy,0.71,0.49,0.58,7320.0
Crime,0.8,0.54,0.65,4453.0
Documentary,0.57,0.58,0.58,1863.0
Drama,0.84,0.71,0.77,11067.0
Family,0.77,0.26,0.39,4173.0


**Observations/Conclusions**
* Predictions using Sentence Embedding with Neural Networks doesnt really produce predictions as accurate as the simple ML models which used TF-IDF vectorizer
* Adam Optimizer seems to perform best among the ones tried with a F1 score of 0.62

## Binary Relevance
Here we build an predictor for each genre separately. In other words, the output layer will have 28 nodes - each corresponding to a genre. We will use a threshold for each genre to make predictions whether the plot falls into that genre or not

In [189]:
prob_thresh = (train_y.sum()/train_y.shape[0]).clip(upper=0.5)
prob_thresh

Action         0.085343
Adult          0.000530
Adventure      0.071494
Animation      0.085152
Biography      0.010925
Comedy         0.281333
Crime          0.113042
Documentary    0.107627
Drama          0.369008
Family         0.119008
Fantasy        0.047260
Game-Show      0.016901
History        0.021606
Horror         0.018194
Music          0.023132
Musical        0.004376
Mystery        0.083838
News           0.034587
Reality-TV     0.112194
Romance        0.154633
Sci-Fi         0.059371
Short          0.004620
Sport          0.016096
Talk-Show      0.047090
Thriller       0.059318
War            0.010798
Western        0.024541
dtype: float64

In [0]:
def gen_model_genre(optimizer):
  model = Sequential()
  model.add(Dense(800, activation='relu', input_shape=(512,)))
  model.add(Dropout(0.25))
  model.add(Dense(27, activation='sigmoid'))
  model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
  return model
lr_reduction = ReduceLROnPlateau(monitor='val_acc', 
                                            patience=2, 
                                            verbose=1, 
                                            factor=0.5, 
                                            min_lr=0.0001)

In [140]:
epochs, batch_size = 20, 128
model = gen_model_genre(keras.optimizers.Adam(lr=0.001))
model.fit(train_vector, train_y,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=0.3,
          callbacks=[lr_reduction])

Train on 66060 samples, validate on 28312 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

Epoch 00005: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 6/20
Epoch 7/20

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 8/20
Epoch 9/20

Epoch 00009: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 10/20
Epoch 11/20

Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.0001.
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fa334232d68>

In [173]:
y_pred = model.predict(test_vector)
predictions = pd.DataFrame(index=test_y.index, columns=test_y.columns)
for i in range(y_pred.shape[0]):
  predictions.iloc[i,:] = (y_pred[i,:]>prob_thresh).map({True:1, False:0})
accuracy(test_y, predictions)

Unnamed: 0,Precision,Recall,F1-Score,Support
Action,0.59,0.58,0.58,4321.0
Adult,0.01,0.64,0.03,11.0
Adventure,0.55,0.67,0.6,3496.0
Animation,0.69,0.8,0.74,3333.0
Biography,0.12,0.61,0.2,354.0
Comedy,0.76,0.23,0.35,7320.0
Crime,0.73,0.64,0.68,4453.0
Documentary,0.57,0.64,0.61,1863.0
Drama,0.82,0.09,0.16,11067.0
Family,0.64,0.51,0.57,4173.0
