In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    input_paths = (os.path.join(dirname, filename) for filename in filenames)
    for input_path in input_paths:
        print(input_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **IMDB Films Review Sentiment Text Classification using Keras**
The text data used in this notebook are film reviews from IMDB. In this Python notebook I would like to build a mechanism for **text classification** into two groups (positive, negative) based on sentiment. We are going to accomplish this task by building a neural network using the Keras framework.

## **Import packages**
We start by importing the packages. 

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import matplotlib.pyplot as plt
# Use the 'ggplot' style for the matplotlib figures drawn in this notebook.
plt.style.use('ggplot')

import re
from nltk.stem import WordNetLemmatizer

## **Load and inspect data** 
Let's load the data and inspect its structure.


In [None]:
# Read the IMDB reviews CSV into a DataFrame; later cells use its
# `review` and `sentiment` columns.
reviews_csv = "../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
df = pd.read_csv(reviews_csv, sep=',')

In [None]:
# Preview the first rows of the raw data to check that it loaded as expected.
df.head()

Let us extract the distinct sentiment labels and encode them as integer values (0 and 1) so that the algorithm can process them.

In [None]:
# ----- Get labels -----
# Encode the sentiment strings as integer category codes and store them as int32.
# NOTE(review): presumably 'negative' -> 0 and 'positive' -> 1 — confirm against df.sentiment.
sentiment_codes = df['sentiment'].astype('category').cat.codes
y = sentiment_codes.to_numpy().astype(np.int32)
# ----- Get number of classes -----
num_classes = len(np.unique(y))

In [None]:
# Print the encoded label array as a quick sanity check.
print(y)

## Preprocess text

Before we apply any algorithm, we need to have a cleaned data set. In our case with text data, it is essential to perform preprocessing steps in order to increase the performance of the algorithms. The steps we perform are: 
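* remove br tags
* replace every non-word character (punctuation, symbols) with a space
* remove all single characters
* substituting multiple spaces with single space
* removing a prefixed 'b' (a stand-alone "b" at the very start of the text)
* converting to lowercase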

The final preprocessing step is the lemmatization. In lemmatization, we reduce the word into dictionary root form. For instance "cats" is converted into "cat". Lemmatization is done in order to avoid creating features that are semantically similar but syntactically different. For instance, we don't want two different features named "cats" and "cat", which are semantically similar, therefore we perform lemmatization.

In [None]:
# Shared WordNet lemmatizer used by custom_standardization (despite its name, it lemmatizes rather than stems).
stemmer = WordNetLemmatizer()
def custom_standardization(text):
    """Clean and lemmatize one raw review so it is ready for tokenization.

    Steps, in order:
      1. replace ``<br />`` tags with a space;
      2. replace every non-word character (punctuation, symbols) with a space;
      3. drop single ASCII letters standing alone between whitespace
         (e.g. the ``s`` left behind once the apostrophe of ``it's`` is removed);
      4. collapse runs of whitespace into a single space;
      5. strip a stand-alone ``b`` at the very start of the text;
      6. convert to lowercase;
      7. lemmatize every word with the module-level ``stemmer`` object.

    Parameters
    ----------
    text : object
        The raw review. It is converted with ``str()`` first, so non-string
        values are accepted.

    Returns
    -------
    str
        The cleaned words joined by single spaces (empty string when no
        words remain).
    """
    text = str(text)

    # remove br tags
    text = re.sub('<br />', ' ', text)

    # replace every non-word character with a space
    text = re.sub(r'\W', ' ', text)

    # remove all single characters; the trailing whitespace is only looked at,
    # not consumed, so runs of adjacent single letters ("x a b y") are all
    # removed instead of every other one
    text = re.sub(r'\s+[a-zA-Z](?=\s)', ' ', text)

    # substituting multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)

    # removing prefixed 'b' (done before lowercasing, so only a lowercase 'b' is stripped)
    # NOTE(review): presumably a leftover from stringified bytes literals — confirm it is still needed.
    text = re.sub(r'^b\s+', '', text)

    # converting to lowercase
    text = text.lower()

    # lemmatization, word by word
    return ' '.join(stemmer.lemmatize(word) for word in text.split())

In [None]:
# Clean every review and keep the result in a new column next to the raw text.
df['Cleaned_Text'] = df['review'].apply(custom_standardization)

In [None]:
# Preview the first cleaned reviews to verify the preprocessing.
df['Cleaned_Text'].head()

Further we are going to use a word embedding representation for our text data reviews. A word embedding is a class of approaches for representing words and documents using a dense vector representation, where a vector represents the projection of the word into a continuous vector space. The position of a word within the vector space is learned from text and is based on the words that surround the word when it is used. The position of a word in the learned vector space is referred to as its embedding. More information about word embeddings can be found here: 
* https://www.tensorflow.org/tutorials/text/word_embeddings

Before we use word embeddings, we need to convert our words into integers first. To do this we define the size of our vocabulary (max_features) and find the most frequently occurring words in our data, which form the dictionary. After that, we use a tokenizer to represent each review as an integer vector corresponding to the dictionary.
The Tokenizer vectorizes a text corpus into lists of integers. Each integer maps to a value in a dictionary that encodes the entire corpus, with the keys in the dictionary being the vocabulary terms themselves. 

In [None]:
# ----- Prepare text for embedding -----
# Vocabulary size: how many of the most frequent words are kept for tokenization.
max_features = 10_000

In [None]:
# ----- Get top 10000 most occurring words in list -----
results = Counter()
for tokens in df['Cleaned_Text'].str.split():
    results.update(tokens)
vocabulary = [word for word, _count in results.most_common(max_features)]

# ----- Create tokenizer based on your top 10000 words -----
# NOTE(review): per the Keras Tokenizer docs only the num_words-1 most common
# words are kept, so the last vocabulary entry is presumably never emitted — confirm.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(vocabulary)



In [None]:
# Display the cleaned review column for a visual check.
df['Cleaned_Text']

In [None]:

# ----- Convert words to ints and pad -----
sequences = tokenizer.texts_to_sequences(df['Cleaned_Text'].values)
X = pad_sequences(sequences)


# ----- Split into Train, Test, Validation sets -----
# Hold out 20% for testing, then 20% of the remainder for validation
# (64% train / 16% validation / 20% test overall).
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)

## **Define and train a model**

To create an embedding layer in Keras we need to provide three parameters:
* input_dim: This is the size of the vocabulary in the text data
* output_dim: This is the size of the vector space in which words will be embedded. It defines the size of the output vectors from this layer for each word. For example, it could be 32 or 100 or even larger
* input_length: This is the length of input sequences, as you would define for any input layer of a Keras model

In [None]:
# Size of the vector the embedding layer produces for each token.
output_dim = 16
# Length of the padded sequences (second axis of X); passed to the embedding layer as input_length.
max_input_lenght = X.shape[1]

We define a sequential Keras model, starting with an embedding layer with output dimension 16, followed by a Dropout layer (rate 0.2) for regularization. After that we use a GlobalAveragePooling1D layer and one Dense layer. 
A GlobalAveragePooling1D layer returns a fixed-length output vector for each example by averaging over the sequence dimension. This allows the model to handle input of variable length, in the simplest way possible.
This fixed-length output vector is piped through a fully-connected (Dense) layer with 16 hidden units. Finally, a dense softmax layer with two output units (one per class) is used for classification.

In [None]:
# ----- Define model -----
# Embedding -> dropout -> average over the sequence axis -> hidden dense -> softmax over classes
layers = tf.keras.layers
model = tf.keras.Sequential([
    layers.Embedding(input_dim=max_features, output_dim=output_dim, input_length=max_input_lenght),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dense(16, activation='relu'),
    layers.Dense(num_classes, activation='softmax'),
])

# ----- Compile model -----
# Sparse categorical cross-entropy because the labels are integer class ids, not one-hot vectors.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=["accuracy"],
)




In [None]:
# Show the model's layers and parameter counts.
model.summary()

In [None]:
# ----- Train model -----
# The returned history object is kept for plotting the learning curves later.
history_1 = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=8,
)



## **Evaluate the model**

In [None]:
# ----- Evaluate model -----
# Predict class probabilities on the test set and take the most probable class per review.
probabilities = model.predict(X_test)
pred = probabilities.argmax(axis=1)
accuracy = accuracy_score(y_test, pred)

print(" ")
print("Results")
print('Accuracy: {:.4f}'.format(accuracy))
print(" ")
print(classification_report(y_test, pred))

In [None]:
def plot_history(history):
    """Plot training vs. validation accuracy and loss, side by side.

    Parameters
    ----------
    history : object
        Object exposing a ``history`` mapping with the keys ``'accuracy'``,
        ``'val_accuracy'``, ``'loss'`` and ``'val_loss'``, each holding one
        value per epoch (e.g. the object returned by ``model.fit``).

    Returns
    -------
    None
        The figure is only drawn. Nothing is returned, so a notebook cell
        ending with this call does not render the figure a second time.

    Raises
    ------
    KeyError
        If one of the four expected metrics is missing from ``history.history``.
    """
    # Read all four series first so a missing metric fails before any figure is created.
    metrics = history.history
    acc = metrics['accuracy']
    val_acc = metrics['val_accuracy']
    loss = metrics['loss']
    val_loss = metrics['val_loss']
    epochs = range(1, len(acc) + 1)  # epochs are numbered from 1 on the x-axis

    # Explicit axes instead of the implicit pyplot "current axes" state.
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))
    panels = (
        (ax_acc, acc, val_acc, 'Training acc', 'Validation acc',
         'Training and validation accuracy', 'Accuracy'),
        (ax_loss, loss, val_loss, 'Training loss', 'Validation loss',
         'Training and validation loss', 'Loss'),
    )
    for ax, train_values, val_values, train_label, val_label, title, y_label in panels:
        ax.plot(epochs, train_values, 'b', label=train_label)
        ax.plot(epochs, val_values, 'r', label=val_label)
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()
    fig.tight_layout()

In [None]:
# Plot the accuracy and loss curves recorded while training the model.
plot_history(history_1)

We obtained a quite good result of 89% accuracy on the test set with a simple NN design. The plots above also show that the model is not overfitting much.

As a next step the parameters of the neural network can be optimized by performing a grid search. 