In [None]:
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import tensorflow as tf

from keras.utils import to_categorical

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Embedding
from keras.models import Model

np.random.seed(456789)

# Movie Genre Classification
## Using deep learning to predict the genre of a movie based on its plot

In [None]:
movie_data = pd.read_csv("../input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv")

In [None]:
movie_data.head()

In [None]:
movie_data.shape

In [None]:
# Tally how many movies carry each genre label and keep the 20 most frequent labels.
count_by_genre = (
    movie_data.groupby('Genre')['Genre']
    .count()
    .reset_index(name='count')
    .sort_values(['count'], ascending=False)
    .head(20)
)

In [None]:
count_by_genre

In [None]:
genres = ['drama', 'comedy', 'horror', 'action', 'thriller',
          'romance', 'western', 'crime', 'adventure', 'musical',
          'crime drama', 'romantic comedy', 'science fiction', 'mystery', 'animation']

In [None]:
len(genres)

In [None]:
# Keep only the movies whose genre label is one of the selected genres.
case = movie_data["Genre"].isin(genres)
# Build the filtered frame in one expression instead of calling
# reset_index(inplace=True) on a slice of movie_data: nothing is mutated in
# place, so the cell gives the same result however often it is re-run.
# As before, the old row labels are kept in an "index" column.
movie_data_selected = movie_data[case].reset_index()
movie_data_selected.shape

In [None]:
movie_data_selected = movie_data_selected.sort_values("Genre")

In [None]:
movie_data_selected.head()

In [None]:
# Count whitespace-separated words per plot. Counting ' ' characters (the
# previous approach) is one short for single-spaced text and miscounts plots
# that contain line breaks or repeated spaces.
wordcounter = movie_data_selected['Plot'].str.split().str.len()
print("Average number of words per plot: ", int(wordcounter.mean()))
print("Standard deviation of the words: ", int(wordcounter.std()))

In [None]:
# Distribution of plot lengths; bins='fd' lets numpy choose the bin width
# (Freedman-Diaconis estimator). Title and axis labels are set so the figure
# can be read on its own.
plt.hist(wordcounter, bins='fd')
plt.title("Distribution of plot lengths")
plt.xlabel("Words per plot")
plt.ylabel("Number of movies")
plt.show()

In [None]:
nltk.download('stopwords')

In [None]:
# Read the English stop words through the fully qualified corpus reader rather
# than through the imported name `stopwords`, which this assignment rebinds:
# the original `stopwords.words(...)` raised AttributeError when the cell was
# run a second time. A set makes the `in` test in clean_text a hash lookup
# instead of a linear scan.
stopwords = set(nltk.corpus.stopwords.words('english'))

Text cleaning:
1. Convert everything to lowercase
2. Remove possessive suffixes ('s)
3. Remove line breaks (\r\n)
4. Remove text inside parentheses ()
5. Remove punctuation and special characters
6. Remove stopwords
7. Remove short words (fewer than 3 characters)

In [None]:
# Regular expressions used by clean_text, compiled once instead of on every call.
# Raw strings avoid the invalid "\(" escape the previous pattern relied on.
_POSSESSIVE_RE = re.compile(r"'s")
_CRLF_RE = re.compile(r"\r\n")
_PARENTHESIZED_RE = re.compile(r"\(.*?\)")
_NON_WORD_RE = re.compile(r"[^\w\s]")


def clean_text(text, stop_words=None, min_length=3):
    '''
    Clean a string input and prepare it for next steps.

    Steps, in order: lowercase; drop every "'s"; turn "\\r\\n" line breaks into
    spaces; drop parenthesised passages; drop every character that is neither
    a word character nor whitespace; drop stop words and short tokens.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : container of str, optional
        Tokens to discard. Defaults to the notebook-level `stopwords`.
    min_length : int, optional
        Minimum number of characters a token needs to be kept (default 3).

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    '''
    if stop_words is None:
        stop_words = stopwords
    text = text.lower()
    text = _POSSESSIVE_RE.sub('', text)
    # Replace line breaks with a space, not with nothing: deleting them glued
    # the last word of one line to the first word of the next ("alone.\r\nShe"
    # ended up as the single token "aloneshe").
    text = _CRLF_RE.sub(' ', text)
    text = _PARENTHESIZED_RE.sub('', text)
    text = _NON_WORD_RE.sub('', text)
    kept_tokens = [
        token for token in text.split()
        if token not in stop_words and len(token) >= min_length
    ]
    return " ".join(kept_tokens)

In [None]:
# Run every plot summary through clean_text, keeping the results in row order
# so they can be attached to the dataframe as a new column.
cleaned_plot = [clean_text(plot) for plot in movie_data_selected["Plot"]]

In [None]:
movie_data_selected["cleaned_plot"] = cleaned_plot

In [None]:
# Replace every word of the cleaned plots with its Porter stem and store the
# re-joined text in a new "stemmed_plot" column.
stemmer = PorterStemmer()
movie_data_selected["stemmed_plot"] = movie_data_selected["cleaned_plot"].apply(
    lambda plot: ' '.join(stemmer.stem(word) for word in plot.split())
)

In [None]:
movie_data_selected.head()

In [None]:
movie_data_selected.groupby(movie_data_selected["Genre"]).size()

In [None]:
# Preview the sizes of the planned 80/10/10 split (counts are truncated to int).
print(f"80% of the data for training: {int(movie_data_selected.shape[0]*0.8)} samples")
# The second message used to say "10% for training and 10% for validation",
# which contradicted the line above; the two 10% parts are validation and test.
print(f"10% for validation and 10% for testing: {int(movie_data_selected.shape[0]*0.1)} samples each")

In [None]:
grouped_by_genre = movie_data_selected.groupby(movie_data_selected["Genre"], group_keys=False)

# Per-genre 80/10/10 split: within each genre the first 80% of the rows (in
# their current order, no shuffling) go to train, the next 10% to validation
# and the rest to test.
# Not exactly what I need, but the general idea is here
#
# The pieces are collected in lists and concatenated once: DataFrame.append
# was removed in pandas 2.0 and copied the whole frame on every call.
train_parts, val_parts, test_parts = [], [], []
for g in genres:
    genre_rows = grouped_by_genre.get_group(g)
    train_range = int(genre_rows.shape[0]*0.8)
    val_range = int(genre_rows.shape[0]*0.9)
    train_parts.append(genre_rows.iloc[0:train_range, :])
    val_parts.append(genre_rows.iloc[train_range:val_range, :])
    test_parts.append(genre_rows.iloc[val_range:, :])
train_df = pd.concat(train_parts)
val_df = pd.concat(val_parts)
test_df = pd.concat(test_parts)
# Combine in one dataframe, laid out as train rows, then val rows, then test rows
comb_df = pd.concat([train_df, val_df, test_df])

In [None]:
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)
print(comb_df.shape)

In [None]:
# Initialise the tokenizer with the original (uncleaned) plot text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(comb_df["Plot"]))
# One list of integer token ids per plot, in comb_df row order
sequences = tokenizer.texts_to_sequences(list(comb_df["Plot"]))
# Length of the longest plot; every sequence is padded up to this length below
max_len = np.max([len(sequence) for sequence in sequences])
print("Maximum length sequence is", max_len)
# word -> integer id mapping learned by the tokenizer
word_index = tokenizer.word_index
print(f"{len(word_index)} unique tokens have been found.")
# padding='post': the padding is added after the real tokens of each row
token_data = pad_sequences(sequences, maxlen=max_len, padding='post')
print("Shape of the token data tensor:", token_data.shape)

In [None]:
# Same tokenization steps as for the original text, but fitted on the
# cleaned and stemmed plots ("stemmed_plot" column)
tokenizer_clean = Tokenizer()
tokenizer_clean.fit_on_texts(list(comb_df["stemmed_plot"]))
# One list of integer token ids per plot, in comb_df row order
sequences_clean = tokenizer_clean.texts_to_sequences(list(comb_df["stemmed_plot"]))
# Length of the longest cleaned plot; used as the padded width below
max_len_clean = np.max([len(sequence) for sequence in sequences_clean])
print("Maximum length sequence is", max_len_clean)
# word -> integer id mapping learned from the cleaned text
word_index_clean = tokenizer_clean.word_index
print(f"{len(word_index_clean)} unique tokens have been found.")
# padding='post': the padding is added after the real tokens of each row
token_data_clean = pad_sequences(sequences_clean, maxlen=max_len_clean, padding='post')
print("Shape of the token data tensor:", token_data_clean.shape)

In [None]:
token_data[1952]

It looks like the tokenizer fitted on the cleaned data found slightly more unique tokens than the one fitted on the original data. For now, we will use the tokenizer fitted on the original data.

In [None]:
# Invert the tokenizer vocabulary (id -> word) so id sequences can be decoded
# back into text and compared with the source plot.
sanity_check_index = {token_id: word for word, token_id in tokenizer.word_index.items()}
# Raw id sequence of plot 100 and its decoded text
print(sequences[100])
print(' '.join(sanity_check_index[token_id] for token_id in sequences[100]))
# First and last entry of the padded row for the same plot
print(token_data[100][0])
print(token_data[100][-1])
# Decoding the padded row while skipping the 0 entries should give the same text
print(' '.join(sanity_check_index[token_id] for token_id in token_data[100] if token_id != 0))

In [None]:
# token_data follows the row order of comb_df, which is all of train_df, then
# all of val_df, then all of test_df. The split boundaries are therefore the
# sizes of those frames. The previous code sliced with train_range/val_range,
# which only hold the values from the LAST genre handled by the split loop, so
# the data slices did not line up with the labels below.
train_end = len(train_df)
val_end = train_end + len(val_df)
train_data = token_data[:train_end]
val_data = token_data[train_end:val_end]
test_data = token_data[val_end:]

train_labels = train_df["Genre"]
val_labels = val_df["Genre"]
test_labels = test_df["Genre"]

# Every split must pair each row of token data with exactly one label.
assert len(train_data) == len(train_labels)
assert len(val_data) == len(val_labels)
assert len(test_data) == len(test_labels)

In [None]:
train_labels.value_counts()

In [None]:
val_labels.value_counts()

In [None]:
test_labels.value_counts()

In [None]:
# Encode each genre as its position in `genres`, so a given code means the
# same genre in every split. pd.factorize numbers labels by order of first
# appearance within whatever it is given, so running it separately per split
# silently shifts the codes if a split lacks a genre or is ordered differently.
# Reading from the *_df frames also keeps this cell safe to re-run.
# The (codes, uniques) tuple shape of pd.factorize is kept because the
# following cell reads element 0 of each result.
genre_index = pd.Index(genres)
train_labels = (genre_index.get_indexer(train_df["Genre"]), genre_index)
val_labels = (genre_index.get_indexer(val_df["Genre"]), genre_index)
test_labels = (genre_index.get_indexer(test_df["Genre"]), genre_index)

In [None]:
# Turn the integer genre codes (element 0 of each factorize result) into
# one-hot rows; every split uses the same number of classes, len(genres).
train_labels = to_categorical(train_labels[0], num_classes=len(genres))
val_labels = to_categorical(val_labels[0], num_classes=len(genres))
test_labels = to_categorical(test_labels[0], num_classes=len(genres))

In [None]:
train_labels.shape

### GloVe

In [None]:
# Load the pre-trained GloVe vectors into a dict: word -> float32 vector.
# Each line of the file is a word followed by its coefficients, whitespace-separated.
embeddings = {}
index = 0  # number of embedding lines read
# The encoding is stated explicitly so that parsing non-ASCII tokens does not
# depend on the platform's default locale.
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt', encoding='utf-8') as file:
    for embedding_line in file:
        line_split = embedding_line.split()
        coefs = np.asarray(line_split[1:], dtype='float32')
        embeddings[line_split[0]] = coefs
        index += 1

In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word whose
# tokenizer id is i. The width is taken from the vector stored for 'a'.
# Rows of words missing from GloVe stay all-zero; the extra row (+1) is
# presumably row 0, reserved for the padding id -- TODO confirm.
embeddings_matrix = np.zeros((len(word_index)+1, len(embeddings['a'])))
for word, i in word_index.items():
    if word in embeddings:
        embeddings_matrix[i] = embeddings[word]

In [None]:
# Spot-check one word: the vector stored in the GloVe dict should match the
# row of the embedding matrix at that word's tokenizer id.
# NOTE(review): 125 is hard-coded and presumably equals word_index['if'] for
# this dataset; confirm against the first two printed lines before trusting
# the comparison.
print('Word #125', sanity_check_index[125])
print('Index of if', word_index['if'])
print('Embedding in embeddings list: ', embeddings['if'][:5])
print('Embedding in embeddings matrix: ', embeddings_matrix[125][:5])

In [None]:
# Embedding layer initialised with the pre-trained GloVe vectors and frozen,
# so the vectors are not updated during training.
embedding_layer = Embedding(len(word_index)+1,
                            len(embeddings['a']),
                            weights=[embeddings_matrix],
                            input_length=max_len,
                            trainable=False)
# Baseline layer of the same shape that learns its embeddings from scratch.
# It previously also received weights=[embeddings_matrix], which initialised
# it with GloVe and contradicted its name.
embedding_layer_without_GloVe = Embedding(len(word_index)+1,
                                          len(embeddings['a']),
                                          input_length=max_len)

In [None]:
# Check the layer: a model consisting only of the frozen embedding layer should
# return, for each token id, the same vector as a manual lookup in `embeddings`.
sequence_input = Input(shape=(max_len,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
embedding_only_Model = Model(sequence_input, embedded_sequences)

# token_data is post-padded, so the real tokens of plot 500 sit at the START of
# its row. Slicing the model output with [-5:] compared the last five real
# tokens against five padding positions; instead, take the five positions that
# end where the unpadded sequence ends.
sample_len = len(sequences[500])
sample_start = max(sample_len - 5, 0)
print('Manual Embeddings Result: ', [list(embeddings[sanity_check_index[x]][:3]) if sanity_check_index[x] in embeddings else [0, 0, 0] for x in sequences[500]][-5:])
print()
print('Model Embeddings Result: ', embedding_only_Model.predict(np.array(token_data[500]).reshape(1, max_len))[0, sample_start:sample_len, :3])

In [None]:
token_data[500]

TODO:
1. Sanity check of the tokenizer - DONE
2. Train-test-split - DONE
3. GloVe embeddings
4. LSTM-CNN model
