**The same task as Lab 1 except using `RNNs`**

**Watch out!<br>
Preprocessing here is a bit different**

**Dataset**
labeled dataset collected from twitter

**Objective**
classify each tweet by its Arabic dialect. <br>
The `label` column holds one of five dialect codes: `EG`, `LY`, `LB`, `MA`, `SD`. <br>
The codes are mapped to integer class ids (0-4) before training. <br>

**Total Estimated Time = 60-90 Mins**

> **Load the `clean data` preprocessed in `Lab 1`, then handle it to be used with `RNNs`**

# Import Libraries

In [1]:
import os
import urllib
import urllib.request

import pandas as pd
import tensorflow as tf

In [2]:
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense, Bidirectional, TextVectorization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras import Input
import numpy as np

In [3]:
# Single seed shared by the data split, the shuffles and the model initialisers.
SEED = 42
# Seed NumPy and TensorFlow as well, so that weight initialisation and batch
# shuffling are repeatable when the notebook is re-run from a fresh kernel.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load Dataset

In [4]:
# Detect Google Colab by the presence of its default /content directory
CWD = os.getcwd()
using_colab = os.path.exists('/content')
if not using_colab:
  # Outside Colab, also remember the parent of the working directory
  PWD = os.path.dirname(CWD)

here


In [None]:
# Optional alternative: fetch the dataset from Google Drive with gdown (left disabled).
# NOTE(review): if this is re-enabled, quote the URL -- a bare `&` is treated by the
# shell as a command separator, so `export=download` would not reach gdown.
# # Download the dataset
# !gdown https://drive.google.com/u/0/uc?id=1Ir9Rwy1iBOfDfIoaQWDGgFT0W-YSNmot&export=download

In [5]:
# Resolve where the CSV files live: the working directory on Colab, otherwise a
# "data" folder next to the notebook's parent directory.
if using_colab:
    data_folder_path = CWD
else:
    data_folder_path = os.path.join(PWD, "data")

# Make sure the folder exists so the download cells can write into it.
os.makedirs(data_folder_path, exist_ok=True)

# os.path.join inserts the path separator that plain string concatenation
# dropped (e.g. "/content" + "file.csv" became "/contentfile.csv").
train_csv_file_path = os.path.join(data_folder_path, "arabic_dialects_train.csv")
test_csv_file_path = os.path.join(data_folder_path, "arabic_dialects_test.csv")

In [6]:
# Raw-file location of the dataset inside the GitHub repository
DATA_BASE_URL = 'https://github.com/AmgadHasan/arabic-dialect-detection/raw/main/data'
train_url = f'{DATA_BASE_URL}/arabic_dialects_train.csv'
test_url = f'{DATA_BASE_URL}/arabic_dialects_test.csv'

In [7]:
# Fetch the training CSV only when it is not already cached locally.
if os.path.isfile(train_csv_file_path):
    print("File exists.")
else:
    print("Downloading file...")
    # Download under a temporary name and rename afterwards, so an interrupted
    # transfer is never mistaken for a complete file on the next run.
    partial_path = train_csv_file_path + ".part"
    urllib.request.urlretrieve(train_url, partial_path)
    os.replace(partial_path, train_csv_file_path)
    print("File downloaded.")

Downloading file...
File downloaded.


In [8]:
# Fetch the test CSV only when it is not already cached locally.
if os.path.isfile(test_csv_file_path):
    print("File exists.")
else:
    print("Downloading file...")
    # Download under a temporary name and rename afterwards, so an interrupted
    # transfer is never mistaken for a complete file on the next run.
    partial_path = test_csv_file_path + ".part"
    urllib.request.urlretrieve(test_url, partial_path)
    os.replace(partial_path, test_csv_file_path)
    print("File downloaded.")

Downloading file...
File downloaded.


In [9]:
# Column-width display option is set before loading; presumably so long tweets
# are not truncated in the preview -- TODO confirm the intended value.
pd.set_option('display.max_colwidth', 0)

# Rows are separated by '\n' only
df = pd.read_csv(
    train_csv_file_path,
    lineterminator='\n',
)
df.head()

Unnamed: 0,tweet,label
0,انتي جافية والا الغلا ماتبيسؤالموسيقي,LY
1,باركوا لجوجو اجاها عريس,LB
2,لابأس عليك يالزينه ان شاء الله هذا حد السو,LY
3,الزن ده هو اكتر حاجة مدمرة للسلام النفسي واحنا بصراحة سلامنا النفسي اهم من كل شيء الحياة قصيرة ومش مستحملة حوارات كتير,EG
4,الكوت كلها ناس بتدور ع علاقات تانيه بس تكون احلي م اللي لسه خالصه اول انبارح,EG


# Train Validation Splitting

In [10]:
# Hold out 10% for validation, keeping the label proportions identical in both parts
df_train, df_valid = train_test_split(
    df,
    test_size=0.1,
    random_state=SEED,
    stratify=df['label'],
)

In [12]:
# Class distribution of the training and validation parts, side by side
df_train['label'].value_counts(), df_valid['label'].value_counts()

(EG    46685
 LY    29564
 LB    22369
 SD    11692
 MA    9346 
 Name: label, dtype: int64,
 EG    5187
 LY    3285
 LB    2486
 SD    1299
 MA    1039
 Name: label, dtype: int64)

# Preprocessing

In [13]:
# Text-to-integer layer: vocabulary capped at 50,000 entries, n-grams up to size 2
encoder = TextVectorization(
    max_tokens=50000,
    ngrams=2,
)
# Build the vocabulary from the training tweets only
encoder.adapt(df_train[['tweet']])

In [14]:
# Keep the vocabulary learned by the encoder for inspection
VOCAB = encoder.get_vocabulary()

In [15]:
# Number of entries in the encoder vocabulary; used as the Embedding input_dim below
VOCAB_SIZE = len(encoder.get_vocabulary())
VOCAB_SIZE

50000

In [16]:
# Sanity check: run the fitted encoder on the first tweet and look at the token ids
encoder(df[['tweet']].iloc[0])

<tf.Tensor: shape=(1, 9), dtype=int64, numpy=array([[ 114,    1,  375, 9443,    1,    1,    1,    1,    1]])>

In [17]:
# Total number of tweets in the loaded training file
df.tweet.shape

(132952,)

# Modelling

In [18]:
# Distinct dialect labels, in order of first appearance in the training split
df_train.label.unique().tolist()

['EG', 'LY', 'LB', 'MA', 'SD']

In [19]:
# Label -> integer class id, numbered in order of first appearance in the training split
lbl2idx = {
    label: index
    for index, label in enumerate(df_train.label.unique().tolist())
}
lbl2idx

{'EG': 0, 'LY': 1, 'LB': 2, 'MA': 3, 'SD': 4}

In [20]:
# Inverse lookup: integer class id -> label
idx2lbl = {index: label for label, index in lbl2idx.items()}
idx2lbl

{0: 'EG', 1: 'LY', 2: 'LB', 3: 'MA', 4: 'SD'}

In [28]:
# Baseline: raw strings -> token ids -> 64-d embeddings -> SimpleRNN -> 5 logits
model0 = Sequential([
    Input(shape=(1,), dtype='string'),
    encoder,
    # mask_zero=True handles the variable sequence lengths
    Embedding(input_dim=VOCAB_SIZE, output_dim=64, mask_zero=True),
    SimpleRNN(64),
    Dense(64, activation='relu'),
    # One unnormalised score (logit) per class
    Dense(5),
])

In [29]:
# Print the layer-by-layer output shapes and parameter counts
model0.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding_1 (Embedding)     (None, None, 64)          3200000   
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 5)                 325       
                                                                 
Total params: 3,212,741
Trainable params: 3,212,741
Non-trainable params: 0
____________________________________________

In [30]:
# Integer class labels against raw logits from the last Dense layer
model0.compile(
    optimizer='adam',
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [31]:
# Train the baseline. Labels are converted with Series.map, which returns the
# integer class ids directly; Series.replace on an object column relies on
# dtype downcasting that newer pandas versions deprecate.
history0 = model0.fit(
    x=df_train.tweet,
    y=df_train.label.map(lbl2idx),
    epochs=4,
    verbose=1,
    validation_data=(df_valid.tweet, df_valid.label.map(lbl2idx)),
    batch_size=256,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [25]:
# Per-class logits for every validation tweet
preds_valid = model0.predict(df_valid.tweet)



In [26]:
# Macro-averaged F1 on the validation split; argmax over the logits gives the
# predicted class id. Series.map yields integer labels without relying on the
# deprecated downcasting of Series.replace.
f1_score(
    df_valid.label.map(lbl2idx),
    np.argmax(preds_valid, axis=-1),
    average='macro',
)

0.7575863859999187

# Enhancement

## Bidirectional

In [None]:
# Same as the baseline, but the SimpleRNN is wrapped in Bidirectional
model1 = Sequential([
    Input(shape=(1,), dtype='string'),
    encoder,
    # mask_zero=True handles the variable sequence lengths
    Embedding(input_dim=VOCAB_SIZE, output_dim=64, mask_zero=True),
    Bidirectional(SimpleRNN(64)),
    Dense(64, activation='relu'),
    Dense(5),
])

In [None]:
# Integer class labels against raw logits from the last Dense layer
model1.compile(
    optimizer='adam',
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train the bidirectional SimpleRNN. Series.map converts labels to integer class
# ids without relying on the deprecated downcasting of Series.replace.
history1 = model1.fit(
    x=df_train.tweet,
    y=df_train.label.map(lbl2idx),
    epochs=4,
    verbose=1,
    validation_data=(df_valid.tweet, df_valid.label.map(lbl2idx)),
    batch_size=1024,
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
# Validation logits, then macro-averaged F1 on the argmax class ids.
# Series.map is used for the label conversion instead of Series.replace,
# whose object-dtype downcasting is deprecated in newer pandas.
preds_valid = model1.predict(df_valid.tweet)
f1_score(
    df_valid.label.map(lbl2idx),
    np.argmax(preds_valid, axis=-1),
    average='macro',
)



0.7642087491492553

## LSTM

In [None]:
# Bidirectional LSTM in place of the bidirectional SimpleRNN
model2 = Sequential([
    Input(shape=(1,), dtype='string'),
    encoder,
    # mask_zero=True handles the variable sequence lengths
    Embedding(input_dim=VOCAB_SIZE, output_dim=64, mask_zero=True),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dense(5),
])

In [None]:
# Integer class labels against raw logits from the last Dense layer
model2.compile(
    optimizer='adam',
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train the bidirectional LSTM. Series.map converts labels to integer class ids
# without relying on the deprecated downcasting of Series.replace.
history2 = model2.fit(
    x=df_train.tweet,
    y=df_train.label.map(lbl2idx),
    epochs=3,
    verbose=1,
    validation_data=(df_valid.tweet, df_valid.label.map(lbl2idx)),
    batch_size=512,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Per-class logits for every validation tweet
preds_valid = model2.predict(df_valid.tweet)



In [None]:
# Macro-averaged F1 on the validation split. Series.map yields integer labels
# without relying on the deprecated downcasting of Series.replace.
f1_score(
    df_valid.label.map(lbl2idx),
    np.argmax(preds_valid, axis=-1),
    average='macro',
)

0.7801530245421613

### Doubling embedding dimension

In [None]:
# Let's double the embedding dimension!
model3 = Sequential([
    Input(shape=(1,),dtype='string'),
    encoder,
    Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=64*2,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dense(5)
])

In [None]:
# Integer class labels against raw logits from the last Dense layer
model3.compile(
    optimizer='adam',
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train the wider-embedding model. Series.map converts labels to integer class
# ids without relying on the deprecated downcasting of Series.replace.
history3 = model3.fit(
    x=df_train.tweet,
    y=df_train.label.map(lbl2idx),
    epochs=5,
    verbose=1,
    validation_data=(df_valid.tweet, df_valid.label.map(lbl2idx)),
    batch_size=1024,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Validation logits, then macro-averaged F1 on the argmax class ids.
# Series.map is used for the label conversion instead of Series.replace,
# whose object-dtype downcasting is deprecated in newer pandas.
preds_valid = model3.predict(df_valid.tweet)
f1_score(
    df_valid.label.map(lbl2idx),
    np.argmax(preds_valid, axis=-1),
    average='macro',
)



0.7733267006280761

### Going deeper

![](https://i.kym-cdn.com/photos/images/newsfeed/000/531/557/a88.jpg)


In [None]:
# Go deeper: stack a second bidirectional LSTM on top of the first one
model4 = Sequential([
    Input(shape=(1,),dtype='string'),
    encoder,
    Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=64*2,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    # return_sequences=True so the next recurrent layer receives the full sequence
    Bidirectional(LSTM(64,  return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dense(64, activation='relu'),
    Dense(5)
])

In [None]:
# Integer class labels against raw logits from the last Dense layer
model4.compile(
    optimizer='adam',
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train the stacked model. Series.map converts labels to integer class ids
# without relying on the deprecated downcasting of Series.replace.
history4 = model4.fit(
    x=df_train.tweet,
    y=df_train.label.map(lbl2idx),
    epochs=5,
    verbose=1,
    validation_data=(df_valid.tweet, df_valid.label.map(lbl2idx)),
    batch_size=256,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Validation logits, then macro-averaged F1 on the argmax class ids.
# Series.map is used for the label conversion instead of Series.replace,
# whose object-dtype downcasting is deprecated in newer pandas.
preds_valid = model4.predict(df_valid.tweet)
f1_score(
    df_valid.label.map(lbl2idx),
    np.argmax(preds_valid, axis=-1),
    average='macro',
)



0.771228564206637

### Trying dropout

In [None]:
# Same stacked architecture as model4, with dropout and recurrent dropout on both LSTM layers
model5 = Sequential([
    Input(shape=(1,),dtype='string'),
    encoder,
    Embedding(
        input_dim=VOCAB_SIZE,
        output_dim=64*2,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    # return_sequences=True so the next recurrent layer receives the full sequence
    Bidirectional(LSTM(64,  return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    Bidirectional(LSTM(32, dropout=0.2, recurrent_dropout=0.2)),
    Dense(64, activation='relu'),
    Dense(5)
])



In [None]:
# Integer class labels against raw logits from the last Dense layer
model5.compile(
    optimizer='adam',
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train the dropout model for a single epoch. Series.map converts labels to
# integer class ids without relying on the deprecated downcasting of Series.replace.
history5 = model5.fit(
    x=df_train.tweet,
    y=df_train.label.map(lbl2idx),
    epochs=1,
    verbose=1,
    validation_data=(df_valid.tweet, df_valid.label.map(lbl2idx)),
    batch_size=128,
)



In [None]:
# Validation logits, then macro-averaged F1 on the argmax class ids.
# Series.map is used for the label conversion instead of Series.replace,
# whose object-dtype downcasting is deprecated in newer pandas.
preds_valid = model5.predict(df_valid.tweet)
f1_score(
    df_valid.label.map(lbl2idx),
    np.argmax(preds_valid, axis=-1),
    average='macro',
)



0.7783550778062927

### Results & Conclusion

1. The embedding layer has far more trainable parameters than the RNN/LSTM layers
2. The models suffer from overfitting (Train_f1=0.95 valid_f1=0.7)
3. Changing the embedding dimension didn't improve the f1 score much
4. Stacking another LSTM layer also didn't improve the f1 score much

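We'll just use the single-layer bidirectional LSTM with the original 64-dimensional embedding (`model2`), since it is less complex than the later variants and scores about the same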

# Final Model

In [None]:
# Final architecture: identical layout to model2 (64-d embedding + bidirectional LSTM)
final_model = Sequential([
    Input(shape=(1,), dtype='string'),
    encoder,
    # mask_zero=True handles the variable sequence lengths
    Embedding(input_dim=VOCAB_SIZE, output_dim=64, mask_zero=True),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dense(5),
])

In [None]:
# Integer class labels against raw logits from the last Dense layer
final_model.compile(
    optimizer='adam',
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

## Retraining on entire train + valid

In [None]:
# Recombine the training and validation parts and shuffle the rows.
# random_state makes the shuffle repeatable across runs.
df = pd.concat([df_train, df_valid], axis=0).sample(frac=1, random_state=SEED)

In [None]:
# Retrain on train + validation combined (no validation data is left to monitor).
# Series.map converts labels to integer class ids without relying on the
# deprecated downcasting of Series.replace.
final_history = final_model.fit(
    x=df.tweet,
    y=df.label.map(lbl2idx),
    epochs=3,
    verbose=1,
    batch_size=512,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3

## Assessing on the test set

In [None]:
# Load the held-out test file with the same line terminator as the training file
df_test = pd.read_csv(test_csv_file_path, lineterminator='\n')

In [None]:
# Per-class logits for every test tweet.
# NOTE(review): the result is stored in `preds_valid` although it holds test-set predictions.
preds_valid = final_model.predict(df_test.tweet)



In [None]:
# Macro-averaged F1 on the test set. Series.map yields integer labels without
# relying on the deprecated downcasting of Series.replace.
f1_score(
    df_test.label.map(lbl2idx),
    np.argmax(preds_valid, axis=-1),
    average='macro',
)

0.7801530245421613

## Saving Model

In [None]:
# Persist the model that was retrained on train + validation. The earlier code
# saved `model2` (trained on the training split only), and its Colab / non-Colab
# branches were identical, so a single call is enough.
final_model.save('bilstm_model')



In [None]:
# NOTE(review): this rebinds `final_model` to a freshly initialised network with the
# same layout as the one retrained above, so the retrained weights are no longer
# reachable through this name.
final_model = Sequential([
    Input(shape=(1,), dtype='string'),
    encoder,
    # mask_zero=True handles the variable sequence lengths
    Embedding(input_dim=VOCAB_SIZE, output_dim=64, mask_zero=True),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dense(5),
])



In [None]:
# Integer class labels against raw logits from the last Dense layer
final_model.compile(
    optimizer='adam',
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train the model compiled in the previous cell. The original referenced an
# undefined name `model`, which raises NameError on a fresh kernel.
# Series.map converts labels to integer class ids without relying on the
# deprecated downcasting of Series.replace.
history = final_model.fit(
    x=df_train.tweet,
    y=df_train.label.map(lbl2idx),
    epochs=4,
    verbose=1,
    validation_data=(df_valid.tweet, df_valid.label.map(lbl2idx)),
    batch_size=1024,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Validation logits from `final_model`; the original referenced an undefined
# name `model`, which raises NameError on a fresh kernel.
preds_valid = final_model.predict(df_valid.tweet)



In [None]:
# Macro-averaged F1 on the validation split. Series.map yields integer labels
# without relying on the deprecated downcasting of Series.replace.
f1_score(
    df_valid.label.map(lbl2idx),
    np.argmax(preds_valid, axis=-1),
    average='macro',
)

0.7749578388615694