<a href="https://colab.research.google.com/github/shazzad-hasan/tensorflow-projects/blob/main/tweet_emotion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook, we will train a model to recognize the emotion expressed in a tweet, using the `emotion` dataset loaded through the Hugging Face `nlp` library.

In [None]:
!pip install nlp

In [None]:
# import required libraries

import tensorflow as tf
import nlp

import numpy as np
import random
import matplotlib.pyplot as plt

%matplotlib inline

### Load and visualize dataset

In [None]:
# fetch the "emotion" tweet dataset through the huggingface nlp module
dataset = nlp.load_dataset(path="emotion")

In [None]:
# bare expression: the notebook renders the loaded dataset object as this cell's output
dataset

In [None]:
# pull the three splits out of the loaded dataset in one go
train_data, valid_data, test_data = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [None]:
# obtain tweets and labels from the data
def get_tweet(data):
  """Split an iterable of examples into parallel lists of texts and labels.

  Args:
    data: iterable of mappings, each with a 'text' and a 'label' entry.

  Returns:
    (tweets, labels): two lists of equal length, in the order the examples
    were yielded by `data`.
  """
  tweets, labels = [], []
  # single pass: each example is read once, and one-shot iterables
  # (generators, iterators) are handled correctly as well
  for example in data:
    tweets.append(example['text'])
    labels.append(example['label'])
  return tweets, labels

# split the training and validation data into texts and labels
tweets, labels = get_tweet(train_data)
valid_tweets, valid_labels = get_tweet(valid_data)

# preview the first 10 training tweets next to their labels
for tweet, label in zip(tweets[:10], labels[:10]):
  print("({}, {})".format(tweet, label))

### Pre-process the dataset

#### Prepare tweets

##### Tokenize the tweets

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# let's tokenize the 10,000 most commonly used words
# (num_words was 1000, which contradicted the intended 10,000-word vocabulary
# and the Embedding layer's input size of 10000 used by the model);
# any word outside that vocabulary is mapped to the "UNK" token
tokenizer = Tokenizer(num_words=10000, oov_token="UNK")
tokenizer.fit_on_texts(tweets)

In [None]:
# let's see the tokenization of the first training tweet
sample = tweets[0]
sample_ids = tokenizer.texts_to_sequences([sample])
print("{} ---> {}".format(sample, sample_ids))

##### Padding and Truncating sequences

In [None]:
# distribution of tweet lengths, counted in space-separated words;
# one histogram bin per distinct length
lengths = []
for twt in tweets:
  lengths.append(len(twt.split(" ")))

n_distinct_lengths = len(set(lengths))
plt.hist(lengths, bins=n_distinct_lengths)
plt.show()

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# every sequence is padded / truncated to this many tokens
maxlen = 50

def get_sequences(tokenizer, tweets):
  """Convert tweets into fixed-length integer sequences.

  The tweets are tokenized with `tokenizer`, then padded and truncated at
  the end ("post") to the module-level `maxlen`.
  """
  token_ids = tokenizer.texts_to_sequences(tweets)
  return pad_sequences(token_ids, maxlen=maxlen, padding="post", truncating="post")

In [None]:
# build the padded / truncated sequences for the training and validation tweets
padded_train_seq, padded_valid_seq = (
    get_sequences(tokenizer, split_tweets) for split_tweets in (tweets, valid_tweets)
)

# show the first tweet together with its padded sequence
print("{}, \n{}".format(tweets[0], padded_train_seq[0]))

#### Prepare labels

In [None]:
# collect the distinct label values that occur in the training labels
classes = set()
classes.update(labels)
print(classes)

In [None]:
# histogram of the training labels, to check how balanced the classes are
plt.hist(x=labels, bins=12)
plt.show()

In [None]:
# `classes` is a set, and set iteration order is not guaranteed (for string
# labels it can change between interpreter runs because of hash randomization).
# Sorting first makes the class -> index mapping reproducible across runs.
class_to_idx = {c: i for i, c in enumerate(sorted(classes))}
# inverse lookup: index -> class
idx_to_class = {i: c for c, i in class_to_idx.items()}

print(class_to_idx)
print(idx_to_class)

In [None]:
def names_to_ids(labels):
  """Map each label to its index via `class_to_idx` and return a numpy array.

  Labels missing from `class_to_idx` map to None (dict.get semantics).
  """
  return np.array(list(map(class_to_idx.get, labels)))

train_labels = names_to_ids(labels)
# note: valid_labels is rebound to its converted form here, so this cell
# should not be re-run without re-creating valid_labels first
valid_labels = names_to_ids(valid_labels)
print(train_labels[0])

### Define a neural network

In [None]:
# embedding -> two stacked bidirectional LSTMs -> softmax over the 6 output units
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(10000, 16, input_length=maxlen))
# the first recurrent layer returns the full sequence so the second one can consume it
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(20)))
model.add(tf.keras.layers.Dense(6, activation="softmax"))

model.summary()

In [None]:
# stop training once validation accuracy has not improved for 2 epochs
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=2)

# integer class ids as targets -> sparse categorical cross-entropy
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

history = model.fit(
    x=padded_train_seq,
    y=train_labels,
    epochs=20,
    validation_data=(padded_valid_seq, valid_labels),
    callbacks=[early_stopping],
)