# 1. Reading the Dataset

In [None]:
import pandas as pd

# Keep only the first rows of each split so the demonstration runs quickly.
train_df = pd.read_csv('../input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv').head(4000)
valid_df = pd.read_csv('../input/imdb-dataset-sentiment-analysis-in-csv-format/Valid.csv').head(500)
test_df = pd.read_csv('../input/imdb-dataset-sentiment-analysis-in-csv-format/Test.csv').head(500)

# Report how many rows were kept for every split.
for label, frame in (('Train: ', train_df), ('Valid: ', valid_df), ('Test: ', test_df)):
    print(label + str(len(frame)))

# Preview the first training rows.
train_df.head(10)

## Some text pre-processing

In [None]:
# Turning all text to lowercase (same transformation applied to every split)
for frame in (train_df, valid_df, test_df):
    frame['text'] = frame['text'].str.lower()
train_df.head()

In [None]:
# Removing punctuation
import string

# Characters stripped from every text (ASCII punctuation from the standard library).
exclude = set(string.punctuation)

def remove_punctuation(x):
    """Return `x` with every character found in `exclude` removed.

    Args:
        x: The text to clean. Non-string values (e.g. None or a float NaN
            coming from a missing cell) are returned unchanged.

    Returns:
        The cleaned string, or `x` itself when it is not a string.
    """
    # Explicit type guard instead of a bare `except: pass`, so unrelated
    # errors (and KeyboardInterrupt) are no longer silently swallowed.
    if not isinstance(x, str):
        return x
    return ''.join(ch for ch in x if ch not in exclude)

# Strip punctuation from the text column of every split.
for frame in (train_df, valid_df, test_df):
    frame['text'] = frame['text'].apply(remove_punctuation)
train_df.head()

In [None]:
# Removing stopwords
from nltk.corpus import stopwords

stop = stopwords.words('english')
# Membership tests against a set are O(1); testing against the list re-scans
# it for every single token of every review.
stop_set = frozenset(stop)


def _drop_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in `stop_set`."""
    return ' '.join(word for word in text.split() if word not in stop_set)


# Same filter for every split (one helper instead of three copied lambdas).
train_df['text'] = train_df['text'].apply(_drop_stopwords)
valid_df['text'] = valid_df['text'].apply(_drop_stopwords)
test_df['text'] = test_df['text'].apply(_drop_stopwords)
train_df.head()

# 2. Sentences as Bag of Words

## Classical Model with TF-IDF and SVM

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF feature vectors for every sentence.
# Disabled alternatives kept for reference: min_df=5, max_df=0.8,
# stop_words='english', vocabulary=list(embeddings_index.keys())
tfidf_options = dict(max_features=20000, sublinear_tf=True, use_idf=True)
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the training split only, then reuse the fitted vectorizer for the others.
train_vectors = vectorizer.fit_transform(train_df['text'])
valid_vectors, test_vectors = (
    vectorizer.transform(frame['text']) for frame in (valid_df, test_df)
)

In [None]:
from sklearn import svm
# Support vector machine with a linear kernel
classifier_linear = svm.SVC(kernel='linear')
# Train on the TF-IDF vectors of the training split
classifier_linear.fit(X=train_vectors, y=train_df['label'])

In [None]:
from sklearn.metrics import classification_report

# Predict the test split with the SVM and print the classification report.
predictions = classifier_linear.predict(test_vectors)
report = classification_report(y_true=test_df['label'], y_pred=predictions)
print(report)

## Replacing the classifier with a neural network (NN) model

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

# Defining the NN model: one hidden ReLU layer with dropout on top of the
# TF-IDF features, followed by a single sigmoid output unit.
n_features = train_vectors.shape[1]
model = Sequential([
    Dense(20, input_shape=(n_features,), activation='relu'),
    Dropout(0.3),
    # Optional second hidden block, left disabled:
    # Dense(5, activation='relu'),
    # Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
# Compile the network
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train
# Save the best model (lowest validation loss) and stop early once the
# validation loss has not improved for `patience` epochs.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath='model.h5', monitor='val_loss', save_best_only=True, verbose=1)
early_stop_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, verbose=1)
callbacks_list = [checkpoint_cb, early_stop_cb]

# The sparse TF-IDF matrices are densified with .toarray() before being fed to Keras.
history = model.fit(
    x=train_vectors.toarray(),
    y=train_df['label'],
    validation_data=(valid_vectors.toarray(), valid_df['label']),
    epochs=20,
    callbacks=callbacks_list,
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt
# Learning curves: loss on the left panel, accuracy on the right panel.
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
loss_ax, acc_ax = ax

# The `axes=` keyword is dropped: calling plot() on an Axes already attaches the line to it.
loss_ax.plot(history.history['loss'], color='b', label="Training loss")
loss_ax.plot(history.history['val_loss'], color='r', label="validation loss")
loss_ax.set(title="Loss per epoch", xlabel="Epoch", ylabel="Loss")
loss_ax.legend(loc='best', shadow=True)

acc_ax.plot(history.history['accuracy'], color='b', label="Training accuracy")
acc_ax.plot(history.history['val_accuracy'], color='r', label="Validation accuracy")
acc_ax.set(title="Accuracy per epoch", xlabel="Epoch", ylabel="Accuracy")
acc_ax.legend(loc='best', shadow=True)

plt.show()

In [None]:
from tensorflow.keras.models import load_model
# Load the best saved model (the checkpoint written by ModelCheckpoint)
model = load_model('model.h5')

# `Sequential.predict_classes` is not available in recent TensorFlow/Keras
# releases; thresholding the sigmoid output at 0.5 gives the same labels for a
# single-unit binary classifier.
# NOTE(review): this report uses the validation split, while the other
# sections of the notebook report on the test split — confirm this is intended.
y_prob = model.predict(valid_vectors.toarray())
y_pred = (y_prob > 0.5).astype('int32').ravel()
print(classification_report(valid_df['label'], y_pred, target_names=['0','1']))

# 3. Sentences as a stream of words

In [None]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string

# Model constants, reused by the vectorization layer and the models.
max_features = 20_000    # passed to TextVectorization as max_tokens (vocabulary size cap)
embedding_dim = 100      # size of each token embedding vector
sequence_length = 500    # passed to TextVectorization as output_sequence_length

# Function to standardize texts
def custom_standardization(input_data):
    """Lower-case a string tensor.

    Only lower-casing is performed; stripping "<br />" tags and punctuation
    via `tf.strings.regex_replace` was drafted but is left disabled.

    NOTE(review): the helper is not passed to the TextVectorization layer
    (its `standardize=` argument is commented out) — confirm it is still needed.
    """
    return tf.strings.lower(input_data)

# Layer that normalizes, splits, and maps strings to integer sequences of
# length `sequence_length`, using at most `max_features` vocabulary entries.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_sequence_length=sequence_length,
    output_mode="int",
    # standardize=custom_standardization,  # disabled: the layer default is used
)

# Creating the vocabulary from the training texts
train_texts = train_df['text'].values
vectorize_layer.adapt(train_texts)

In [None]:
# Vectorizing all the texts
def vectorize_text(text):
    """Turn raw strings into integer token sequences using `vectorize_layer`.

    A trailing axis is appended to `text` before it is handed to the layer.
    """
    batch = tf.expand_dims(text, -1)
    return vectorize_layer(batch)


# Vectorize the text column of each split with the same fitted layer.
train_ds, valid_ds, test_ds = (
    vectorize_text(split['text']) for split in (train_df, valid_df, test_df)
)


In [None]:
# Show a raw review, its vectorized form, and the vectorization of a tiny sample.
for item in (train_df['text'][0], train_ds[0], vectorize_text(['beautiful pretty'])):
    print(item)

## A Convolutional Model

In [None]:
from keras import layers

# 1-D convolutional classifier over the integer token sequences.
model = Sequential([
    # An integer input for vocab indices.
    layers.Input(shape=(None,), dtype="int64"),
    # Map those vocab indices into a space of dimensionality `embedding_dim`.
    layers.Embedding(max_features, embedding_dim),
    # Conv1D + global max pooling
    layers.Conv1D(50, 7, padding="valid", activation="relu", strides=3),  # 200
    # layers.Conv1D(100, 7, padding="valid", activation="relu", strides=3),
    layers.GlobalMaxPooling1D(),
    # Common hidden layer for final classification
    layers.Dense(10, activation="relu"),  # 100
    layers.Dropout(0.5),
    # Single unit output layer with sigmoid activation
    layers.Dense(1, activation="sigmoid", name="predictions"),
])

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Train
# Save the best model (lowest validation loss) and stop early once the
# validation loss has not improved for `patience` epochs.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath='model.h5', monitor='val_loss', save_best_only=True, verbose=1)
early_stop_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, verbose=1)
callbacks_list = [checkpoint_cb, early_stop_cb]

history = model.fit(
    x=train_ds,
    y=train_df['label'],
    validation_data=(valid_ds, valid_df['label']),
    epochs=20,
    callbacks=callbacks_list,
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt
# Learning curves: loss on the left panel, accuracy on the right panel.
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
loss_ax, acc_ax = ax

# The `axes=` keyword is dropped: calling plot() on an Axes already attaches the line to it.
loss_ax.plot(history.history['loss'], color='b', label="Training loss")
loss_ax.plot(history.history['val_loss'], color='r', label="validation loss")
loss_ax.set(title="Loss per epoch", xlabel="Epoch", ylabel="Loss")
loss_ax.legend(loc='best', shadow=True)

acc_ax.plot(history.history['accuracy'], color='b', label="Training accuracy")
acc_ax.plot(history.history['val_accuracy'], color='r', label="Validation accuracy")
acc_ax.set(title="Accuracy per epoch", xlabel="Epoch", ylabel="Accuracy")
acc_ax.legend(loc='best', shadow=True)

plt.show()

In [None]:
# Reload the best checkpoint written during training.
model = load_model('model.h5')

# `Sequential.predict_classes` is not available in recent TensorFlow/Keras
# releases; thresholding the sigmoid output at 0.5 gives the same labels.
y_prob = model.predict(test_ds)
y_pred = (y_prob > 0.5).astype('int32').ravel()
print(classification_report(test_df['label'], y_pred, target_names=['0','1']))

## An LSTM Recurrent Model

In [None]:
# Bidirectional LSTM classifier over the integer token sequences.
model = Sequential([
    # Input for variable-length sequences of integers
    keras.Input(shape=(None,), dtype="int64"),
    # Embed each integer in an `embedding_dim`-sized vector
    layers.Embedding(max_features, embedding_dim),
    # One bidirectional LSTM is active; a stacked first layer is left disabled:
    # layers.Bidirectional(layers.LSTM(32, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(32)),
    # Add a classifier
    layers.Dense(1, activation="sigmoid"),
])

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Train
# Save the best model (lowest validation loss) and stop early once the
# validation loss has not improved for `patience` epochs.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath='model.h5', monitor='val_loss', save_best_only=True, verbose=1)
early_stop_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, verbose=1)
callbacks_list = [checkpoint_cb, early_stop_cb]

history = model.fit(
    x=train_ds,
    y=train_df['label'],
    validation_data=(valid_ds, valid_df['label']),
    epochs=20,
    callbacks=callbacks_list,
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt
# Learning curves: loss on the left panel, accuracy on the right panel.
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
loss_ax, acc_ax = ax

# The `axes=` keyword is dropped: calling plot() on an Axes already attaches the line to it.
loss_ax.plot(history.history['loss'], color='b', label="Training loss")
loss_ax.plot(history.history['val_loss'], color='r', label="validation loss")
loss_ax.set(title="Loss per epoch", xlabel="Epoch", ylabel="Loss")
loss_ax.legend(loc='best', shadow=True)

acc_ax.plot(history.history['accuracy'], color='b', label="Training accuracy")
acc_ax.plot(history.history['val_accuracy'], color='r', label="Validation accuracy")
acc_ax.set(title="Accuracy per epoch", xlabel="Epoch", ylabel="Accuracy")
acc_ax.legend(loc='best', shadow=True)

plt.show()

In [None]:
# Reload the best checkpoint written during training.
model = load_model('model.h5')

# `Sequential.predict_classes` is not available in recent TensorFlow/Keras
# releases; thresholding the sigmoid output at 0.5 gives the same labels.
y_prob = model.predict(test_ds)
y_pred = (y_prob > 0.5).astype('int32').ravel()
print(classification_report(test_df['label'], y_pred, target_names=['0','1']))

# 4. Using Word Embeddings

In [None]:
# Vocabulary held by the vectorization layer, plus the lookup from each
# token to its position in that vocabulary.
voc = vectorize_layer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [None]:
# Preview only the first entries; echoing the whole dict prints one line per vocabulary token.
list(word_index.items())[:20]

In [None]:
# Look up the vocabulary index of a few sample words (raises KeyError for a word missing from the vocabulary).
test = ["pretty", "cat", "sat", "near", "yellow", "cat"]
list(map(word_index.__getitem__, test))

## GloVe Embeddings

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

In [None]:
# List the working directory (presumably to confirm the GloVe files were extracted).
!ls

In [None]:
# Parse the GloVe text file into a {word: vector} lookup.
# Each line holds a word followed by its space-separated coefficients.
embeddings_index = {}
# The encoding is given explicitly so parsing does not depend on the platform
# default (assumes the file is UTF-8 text, as GloVe is distributed).
with open("glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        # Split off the word only; the remainder is the coefficient string.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

In [None]:
# Display the pre-trained vector stored for the word "cat".
embeddings_index["cat"]

In [None]:
# Number of rows of the embedding matrix: vocabulary size plus two extra rows.
num_tokens = len(voc) + 2
#embedding_dim = 100
hits = 0
misses = 0

# Prepare the embedding matrix to be used in an Embedding layer: row i holds
# the pre-trained vector of the word with index i in the vectorizer's vocabulary.
# Words not found in the embedding index keep an all-zeros row; per the
# original note this includes the representation for "padding" and "OOV".
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
# Display the pre-trained vector stored for the word "movie".
embeddings_index["movie"]

In [None]:
# Display the vocabulary index assigned to the word "movie".
word_index["movie"]

In [None]:
# Show the matrix row for "movie". The index is looked up rather than
# hard-coded, because it depends on the vocabulary built from the data subset.
embedding_matrix[word_index["movie"]]

In [None]:
# Visualizing with T-SNE
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline


max_w = 1000  # stop once the vocabulary index reaches this value

# Collect the pre-trained vectors (and matching labels) of the first
# vocabulary entries that have a pre-trained embedding.
labels = []
tokens = []
for word, i in word_index.items():
    if i == max_w:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        tokens.append(embedding_vector)
        labels.append(word)

# Create a t-SNE model and project the vectors to 2-D.
# NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
# `max_iter` — confirm against the installed version.
tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
new_values = tsne_model.fit_transform(np.array(tokens))
x, y = new_values[:, 0], new_values[:, 1]

plt.figure(figsize=(16, 16))
# One vectorized scatter call instead of one call (and one artist) per point.
plt.scatter(x, y)
for label, x_i, y_i in zip(labels, x, y):
    plt.annotate(label,
        xy=(x_i, y_i),
        xytext=(5, 2),
        textcoords='offset points',
        ha='right',
        va='bottom')
plt.title("t-SNE projection of GloVe word vectors")
plt.show()


In [None]:
# Load the pre-trained word embeddings matrix into an Embedding layer.
# trainable=False keeps the pre-trained vectors frozen.
glove_initializer = keras.initializers.Constant(embedding_matrix)
embedding_layer = layers.Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=glove_initializer,
    trainable=False,
)

In [None]:
# Stacked bidirectional LSTM classifier on top of frozen pre-trained embeddings.
model = Sequential([
    # Input for variable-length sequences of integers
    keras.Input(shape=(None,), dtype="int64"),
    # Embedding layer initialised from the pre-trained matrix, trainable=False
    layers.Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        trainable=False,
    ),
    # Two bidirectional LSTMs; the first returns the full sequence for the second
    layers.Bidirectional(layers.LSTM(32, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(32)),
    # Add a classifier
    layers.Dense(1, activation="sigmoid"),
])

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Train
# Save the best model (lowest validation loss) and stop early once the
# validation loss has not improved for `patience` epochs.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath='model.h5', monitor='val_loss', save_best_only=True, verbose=1)
early_stop_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, verbose=1)
callbacks_list = [checkpoint_cb, early_stop_cb]

history = model.fit(
    x=train_ds,
    y=train_df['label'],
    validation_data=(valid_ds, valid_df['label']),
    epochs=20,
    callbacks=callbacks_list,
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt
# Learning curves: loss on the left panel, accuracy on the right panel.
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
loss_ax, acc_ax = ax

# The `axes=` keyword is dropped: calling plot() on an Axes already attaches the line to it.
loss_ax.plot(history.history['loss'], color='b', label="Training loss")
loss_ax.plot(history.history['val_loss'], color='r', label="validation loss")
loss_ax.set(title="Loss per epoch", xlabel="Epoch", ylabel="Loss")
loss_ax.legend(loc='best', shadow=True)

acc_ax.plot(history.history['accuracy'], color='b', label="Training accuracy")
acc_ax.plot(history.history['val_accuracy'], color='r', label="Validation accuracy")
acc_ax.set(title="Accuracy per epoch", xlabel="Epoch", ylabel="Accuracy")
acc_ax.legend(loc='best', shadow=True)

plt.show()

In [None]:
# Reload the best checkpoint written during training.
model = load_model('model.h5')

# Turn the sigmoid scores into hard 0/1 labels with a 0.5 threshold.
scores = model.predict(test_ds)
y_pred = [int(score >= 0.5) for score in scores.ravel()]
print(classification_report(test_df['label'], y_pred, target_names=['0','1']))

## Using a Transformer

In [None]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward network.

    Each sub-block is followed by dropout, a residual connection and layer
    normalization.

    Args:
        embed_dim: Size of the last axis of the inputs; also used as `key_dim`
            of the attention layer and as the feed-forward output size.
        num_heads: Number of attention heads.
        ff_dim: Units of the hidden (ReLU) layer of the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward net.
        name: Optional layer name.
        **kwargs: Extra keyword arguments forwarded to `layers.Layer`.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, name=None, **kwargs):
        # Initialise the base Layer exactly once, forwarding the name together
        # with the remaining kwargs; a second __init__ call would discard `name`.
        super(TransformerBlock, self).__init__(name=name, **kwargs)
        # Constructor arguments are kept so get_config() can serialize them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`; `training` is forwarded to the dropout layers."""
        # Self-attention: the inputs serve as both query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the base layer config extended with this block's constructor arguments."""
        config = super().get_config().copy()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'rate': self.rate,
        })
        return config

In [None]:
# Two separate embedding layers, one for tokens, one for token index (positions).
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows of the position embedding table.
        vocab_size: Number of rows of the token embedding table.
        embed_dim: Output size of both embeddings.
        name: Optional layer name.
        **kwargs: Extra keyword arguments forwarded to `layers.Layer`.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, name=None, **kwargs):
        # Initialise the base Layer exactly once, forwarding the name together
        # with the remaining kwargs; a second __init__ call would discard `name`.
        super(TokenAndPositionEmbedding, self).__init__(name=name, **kwargs)
        # Constructor arguments are kept so get_config() can serialize them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in `x` and add the embedding of each position."""
        # Positions 0..len-1, where len is the size of the last axis of `x` at run time.
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        config = super().get_config().copy()
        config.update({
            'maxlen': self.maxlen,
            'vocab_size': self.vocab_size,
            'embed_dim': self.embed_dim
        })
        return config

In [None]:
# Reuses the constants defined for the vectorizer:
#   max_features = 20000, embedding_dim = 100, sequence_length = 500
# (a separate embed_dim = 32 was considered and is left disabled)
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in the feed-forward network inside the transformer

inputs = layers.Input(shape=(sequence_length,))
embedding_layer = TokenAndPositionEmbedding(sequence_length, max_features, embedding_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embedding_dim, num_heads, ff_dim)
x = transformer_block(x)

# Classification head: average pooling, then a small dense layer between two dropouts.
head_layers = (
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.1),
    layers.Dense(20, activation="relu"),
    layers.Dropout(0.1),
)
for head_layer in head_layers:
    x = head_layer(x)
outputs = layers.Dense(1, activation="sigmoid")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
# Binary cross-entropy matches the single sigmoid output unit
# (sparse_categorical_crossentropy was the disabled alternative).
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# Train
# Save the best model (lowest validation loss) and stop early once the
# validation loss has not improved for `patience` epochs.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath='model.h5', monitor='val_loss', save_best_only=True, verbose=1)
early_stop_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, verbose=1)
callbacks_list = [checkpoint_cb, early_stop_cb]

history = model.fit(
    x=train_ds,
    y=train_df['label'],
    validation_data=(valid_ds, valid_df['label']),
    epochs=10,
    callbacks=callbacks_list,
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt
# Learning curves: loss on the left panel, accuracy on the right panel.
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
loss_ax, acc_ax = ax

# The `axes=` keyword is dropped: calling plot() on an Axes already attaches the line to it.
loss_ax.plot(history.history['loss'], color='b', label="Training loss")
loss_ax.plot(history.history['val_loss'], color='r', label="validation loss")
loss_ax.set(title="Loss per epoch", xlabel="Epoch", ylabel="Loss")
loss_ax.legend(loc='best', shadow=True)

acc_ax.plot(history.history['accuracy'], color='b', label="Training accuracy")
acc_ax.plot(history.history['val_accuracy'], color='r', label="Validation accuracy")
acc_ax.set(title="Accuracy per epoch", xlabel="Epoch", ylabel="Accuracy")
acc_ax.legend(loc='best', shadow=True)

plt.show()

In [None]:
# Reload the best checkpoint; the custom layer classes must be supplied so
# Keras can rebuild them from the saved config.
custom_layers = {
    'TokenAndPositionEmbedding': TokenAndPositionEmbedding,
    'TransformerBlock': TransformerBlock,
}
model = load_model('model.h5', custom_objects=custom_layers)

# Turn the sigmoid scores into hard 0/1 labels with a 0.5 threshold.
scores = model.predict(test_ds)
y_pred = [int(score >= 0.5) for score in scores.ravel()]
print(classification_report(test_df['label'], y_pred, target_names=['0','1']))