# Mount Google Drive

Due to convenience and fast downloading, I upload the raw data and fast-text .bin to google drive. Mounting these resources are as follows:


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install Required Libraries

In [3]:
!pip3 --no-cache-dir install dadmatools numpy pandas matplotlib plotly scikit-learn hazm wordcloud_fa nltk wandb tensorflow tensorflow-addons

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dadmatools
  Downloading dadmatools-1.5.2-py3-none-any.whl (862 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m862.6/862.6 KB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
Collecting hazm
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.7/316.7 KB[0m [31m329.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wordcloud_fa
  Downloading wordcloud_fa-0.1.10-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.4/71.4 KB[0m [31m289.5 MB/s[0m eta [36m0:00:00[0m
Collecting wandb
  Downloading wandb-0.13.9-py2.py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m332.4 MB/s[0m eta [36m0:00:00[0m
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux

# Import Required Functionalities 





In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import json
import re
import string
from tqdm import tqdm
from hazm import Normalizer, sent_tokenize, word_tokenize, Stemmer, Lemmatizer, POSTagger, Chunker, tree2brackets, DependencyParser, stopwords_list

from collections import Counter

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

import tensorflow as tf


# Read Raw Data

In [5]:

labelled_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/SAMousavizade/data_labelled.csv")
unlabelled_data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/SAMousavizade/data_unlabelled.csv")

labelled_data

Unnamed: 0,id,text,label
0,131244574,عالی عالی عالی عالی عالی عالی عالی عالی عالی ع...,0
1,133141894,دوستان این نظرات و پیشنهادات رو باید به پشتیبا...,0
2,94129372,خیلی ایراد داره مسخره تر از این نمیشه رقیب‌هات...,1
3,131334220,نه کی گفته خرابه من دارم باهاش کار میکنم از من...,0
4,131387172,سلام عالیه حتما نصب کنید از کالاف دیوتی هم بهتره,0
...,...,...,...
3591,94229465,همه رشته ها نداره مثلا معارف سوالات تخصصي ندار...,1
3592,131571104,خیلی بده من اصلا. دوست ندارم خواهش می کنم دانل...,0
3593,132784715,بهترین برنا مه ای که دیدم خیلی باهاله میتونی ت...,0
3594,131981378,خیلی بازی مسخره هس نصب نکنید ۱ستاره هم براش زیاده,0


# Preprocess Raw Data

In this section, I preprocess raw-text data. Text preprocessing steps are as follows:

- Unifying all variants of characters (like "ی" and "ي")
- Remove extra spaces between tokens 
- Remove punctuations(like !, ., ?, etc)
- Remove HTML tags  
- Remove all emails, phone numbers, URLs, emojis
- Remove stop-words 
-  Refine any characters being repeated more than 2 times in the tokens (like "عااااااااالیه" to "عاالیه"
- Lemmatization (grouping the inflected forms of a word so they can be analyzed as a single item.)

apply this steps on training data.


In [6]:
from dadmatools.models.normalizer import Normalizer

normalizer = Normalizer(
    full_cleaning=False,
    unify_chars=True,
    refine_punc_spacing=True,
    remove_extra_space=True,
    remove_puncs=True,
    remove_html=True,
    remove_stop_word=True,
    replace_email_with="",
    replace_number_with="",
    replace_url_with="",
    replace_mobile_number_with="",
    replace_emoji_with="",
    replace_home_number_with=""
)

labelled_data["normalized_text"] = labelled_data["text"].apply(lambda text: normalizer.normalize(text))
labelled_data["normalized_text"] = labelled_data["normalized_text"].replace(r'[^آ-یA-Za-z0-9 ]+', '', regex=True)
labelled_data["normalized_text"] = labelled_data["normalized_text"].replace(r'(.)\1{2,}', '', regex=True)

unlabelled_data["normalized_text"] = unlabelled_data["text"].apply(lambda text: normalizer.normalize(text))
unlabelled_data["normalized_text"] = unlabelled_data["normalized_text"].replace(r'[^آ-یA-Za-z0-9 ]+', '', regex=True)
unlabelled_data["normalized_text"] = unlabelled_data["normalized_text"].replace(r'(.)\1{2,}', '', regex=True)

apply previously discussed preprocessing steps on test data. 

In [8]:
test_case_data = pd.DataFrame({"text": [
    "!!!!سلام برنامه خوبیه جدا",
    "لود نمیشه اصلا!! :((((",
    "پولم رو پس نمیدید چرا؟؟؟",
    "بازی جالبیه.",
    "خیلییییی لگ داره روی گوشیم.",
    "معتاد این بازی شدم.",
    "خیلی باگ داره اعصابو خورد کرده.",
    "بازی توی مرحله اول گیر کرده و به مرحله بعدی نمیره اصلا! :(((",
    "آقا عالیه!!!!",
    "موقع نصب به مشکل میخوره. اه.",
    "آشغااااااااااااااله",
    "افتضاحهههههه.",
    "مزخرفه.",
    "همش باگ میخورههههههههههههههههههههه.",
    "برای بچه ها مشکل داره این بازی. لطفا اینو ذکر کنید."
]})

test_case_data["normalized_text"] = test_case_data["text"].apply(lambda text: normalizer.normalize(text))
test_case_data["normalized_text"] = test_case_data["normalized_text"].replace(r'[^آ-یA-Za-z0-9 ]+', '', regex=True)
test_case_data["normalized_text"] = test_case_data["normalized_text"].replace(r'(.)\1{2,}', '', regex=True)

test_case_data

Unnamed: 0,text,normalized_text
0,!!!!سلام برنامه خوبیه جدا,سلام برنامه خوبیه
1,لود نمیشه اصلا!! :((((,لود نمیشه
2,پولم رو پس نمیدید چرا؟؟؟,پولم نمیدید
3,بازی جالبیه.,بازی جالبیه
4,خیلییییی لگ داره روی گوشیم.,خیل لگ داره گوشیم
5,معتاد این بازی شدم.,معتاد بازی
6,خیلی باگ داره اعصابو خورد کرده.,باگ داره اعصابو خورد
7,بازی توی مرحله اول گیر کرده و به مرحله بعدی نم...,بازی مرحله مرحله بعدی نمیره
8,آقا عالیه!!!!,عالیه
9,موقع نصب به مشکل میخوره. اه.,موقع نصب مشکل میخوره اه


In [9]:
from hazm import Normalizer, sent_tokenize, word_tokenize, Stemmer, Lemmatizer, POSTagger, Chunker, tree2brackets, DependencyParser, stopwords_list

lemmatizer = Lemmatizer()

labelled_data["lemmatized_tokens"] = labelled_data["normalized_text"].apply(lambda text: " ".join(list(map(lemmatizer.lemmatize, word_tokenize(text)))))
unlabelled_data["lemmatized_tokens"] = unlabelled_data["normalized_text"].apply(lambda text: " ".join(list(map(lemmatizer.lemmatize, word_tokenize(text)))))
test_case_data["lemmatized_tokens"] = test_case_data["normalized_text"].apply(lambda text: " ".join(list(map(lemmatizer.lemmatize, word_tokenize(text)))))

# Create Tensorflow Dataset
Create train, validation and test dataset tensorflow object from pre-processed texts and labels.

The tf.data.Dataset API supports writing descriptive and efficient input pipelines. Dataset usage follows a common pattern: 

1. Create a source dataset from your input data.
2. Apply dataset transformations to preprocess the data.
3. Iterate over the dataset and process the elements.
Iteration happens in a streaming fashion, so the full dataset does not need to fit into memory.

# Define Hyperparameters

In [10]:
LEARNING_RATE = 3e-5
INFORMATIVE_CLASS_WEIGHT = 5.0
EPOCHS = 50
BATCH_SIZE = 25
EMBEDDING_SIZE = 100

BUFFER_SIZE = 512
MAX_VOCAB_SIZE = 25000

## Split Data to Train and Validation

Split preprocessed data to train and validation by the proportion of 0.1.

In [11]:
from sklearn.model_selection import train_test_split
  
train, validation = train_test_split(
    labelled_data,
    random_state=104, 
    test_size=0.1, 
    shuffle=True
)

# Convert Data Labels to Dummy Variable

To compare with predicted probabilities for each category that the model outputs in the softmax layer, I need to transform the label column to dummy variables.

In [12]:
text_column_name = "lemmatized_tokens"
label_column_name = "label"

categories = ["Non-Informative", "Informative"]
train_labels = pd.get_dummies(train[label_column_name])
validation_labels = pd.get_dummies(validation[label_column_name])

## Transform Data To Tensorflow Dataset

In [13]:
train_dataset = tf.data.Dataset.from_tensor_slices( (train[text_column_name].tolist(), train_labels) ).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
validation_dataset = tf.data.Dataset.from_tensor_slices( (validation[text_column_name].values, validation_labels) ).batch(BATCH_SIZE)

# Train Pipeline

## Use Pre-trained FastText Embedding Vectors
download farsi word embedding vectors in format of `.bin` file from **[FastText Repository](https://fasttext.cc/docs/en/crawl-vectors.html)** and place in working directory. (for persian its 'cc.fa.300.bin'), alternatively mentioned .bin embedding vectors file can be downloaded with the following command (first change directory to directory that fasttext is installed):

> ./download_model.py fa # farsi




# FastText Installation 

In [14]:
!rm -rf fastText
!git clone https://github.com/facebookresearch/fastText.git
%cd ./fastText/
!sudo pip install .

%cd /content

Cloning into 'fastText'...
remote: Enumerating objects: 3930, done.[K
remote: Counting objects: 100% (944/944), done.[K
remote: Compressing objects: 100% (140/140), done.[K
remote: Total 3930 (delta 854), reused 804 (delta 804), pack-reused 2986[K
Receiving objects: 100% (3930/3930), 8.24 MiB | 22.33 MiB/s, done.
Resolving deltas: 100% (2505/2505), done.
/content/fastText
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Processing /content/fastText
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2
  Using cached pybind11-2.10.3-py3-none-any.whl (222 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp38-cp38-linux_x86_64.whl size=4386484 sha256=c359de7dceebe1caf6b4d14ab4d050eb68060ea1c68c79119a2c5c99a27319f3
  Stored in directory: /tmp/pip-ephem-wheel-cache-newt8qyb/wheels/a4/2f/6a/91d479

Then verify the installation went well:

In [15]:
import fasttext

Note: Jump to *Load Fine-Tuned FastText Language Model (Word Embeddings)* section if fine-tuned FastText language model is prepared previously.

---

# Download Original FastText Embeddings
Download original fasttext embeddings in `.bin` file. its about 4.2Gb!

In [16]:
# from fasttext.util import reduce_model

# fasttext.util.download_model('fa', if_exists='ignore')  # farsi

# Adapt The Dimension of Word Embeddings

The pre-trained word vectors that distribute by FastText have a dimension of 300. I need a smaller size, So I use the dimension reducer, which is implemented in this package. This dimension reduction is done using the **PCA** algorithm. 

In this project, I reduce the dimension of word embeddings to 100 as follows:

In [17]:
# import fasttext
# from fasttext.util import reduce_model

# ft = fasttext.load_model(f'/content/cc.fa.{300}.bin', )
# print("model loaded ...")

# reduce_model(ft, EMBEDDING_SIZE)
# print("embedding dimension reduced ...")

# ft.save_model(f"/content/cc.fa.{EMBEDDING_SIZE}.bin", )
# print("reduced model saved ...")

### (Optional) Copy/Load Reduced Embeddings To/From Drive

In [18]:
########################################################################
#### Embedding Size: 100
########################################################################
# !cp /content/cc.fa.100.bin /content/drive/MyDrive/FastText

########################################################################
#### Embedding Size: 300
########################################################################
# !cp /content/cc.fa.300.bin /content/drive/MyDrive/FastText

and load it from drive.

In [19]:
# ft = fasttext.load_model(f'/content/drive/MyDrive/FastText/cc.fa.{EMBEDDING_SIZE}.bin', )
# print("model loaded ...")

# Unsupervised Fine Tune Language Model (FastText Embeddings)

In this section, I fine-tune FastText language model (word embeddings) unsupervised-ly using unlabelled data.



## Prepare (previously preprocessed) Unlabelled Data Texts

Prepare unlabelled texts in `unlabelled_texts.txt`. (Each line contains one unlabelled text) that will be used in FastText embedding fine-tuning on this special context. 



Write unlabelled texts line by line in mentioned `.txt` file 

In [20]:
# with open('/content/unlabelled_texts.txt', 'w') as file:
#     unlabelled_data["lemmatized_tokens"].apply(lambda text: file.write(f"{text}\n"))
    

Generate .vec file from .bin file that is downloaded from FastText repo.  

In [21]:
# def generate_vec_file_from_bin_file(fasttext_model, output_path_filename):
    
#     # get all words from model
#     words = fasttext_model.get_words()

#     with open(output_path_filename,'w') as file_out:
        
#         # the first line must contain number of total words and vector dimension
#         file_out.write(str(len(words)) + " " + str(fasttext_model.get_dimension()) + "\n")

#         # line by line, you append vectors to VEC file
#         for w in words:
#             v = fasttext_model.get_word_vector(w)
#             vstr = ""
#             for vi in v:
#                 vstr += " " + str(vi)

#                 # to reduce .vec file volume
#                 # vstr += " " + "{:.4f}".format(vi)
#             try:
#                 file_out.write(w + vstr+'\n')
#             except:
#                 pass


# generate_vec_file_from_bin_file(ft, output_path_filename=f'/content/cc.fa.{EMBEDDING_SIZE}.vec')

# Fine Tune FastText Language Model (Word Embeddings)

Fine tune general FastText language model on current special contexts using unlabelled texts.

In [22]:
# ft = fasttext.train_unsupervised(
#     input='/content/unlabelled_texts.txt',
#     pretrainedVectors=f"/content/drive/MyDrive/FastText/cc.fa.{EMBEDDING_SIZE}.vec",
#     dim=EMBEDDING_SIZE,
#     verbose=True
# )

save fine-tuned language model.

In [23]:
# ft.save_model(f"cc.fa.{EMBEDDING_SIZE}_fine_tuned.bin")

copy to drive for nexts usages.

In [24]:
# !cp /content/cc.fa.{EMBEDDING_SIZE}_fine_tuned.bin /content/drive/MyDrive/FastText

--- 

# Load Fine-Tuned FastText Language Model (Word Embeddings) 

In [25]:
ft = fasttext.load_model(f'/content/drive/MyDrive/FastText/cc.fa.{EMBEDDING_SIZE}_fine_tuned.bin', )

print("model loaded ...")

model loaded ...


# Create Embedding Matrix
Create embedding matrix using pre-trained fasttext embedding that loaded previously.

In [26]:
vectorizer = tf.keras.layers.TextVectorization(max_tokens=MAX_VOCAB_SIZE)
vectorizer.adapt(labelled_data[text_column_name].values)
vocabulary = vectorizer.get_vocabulary()

E = np.zeros((len(vocabulary), EMBEDDING_SIZE))
for i, word in enumerate(vocabulary):
    E[i] = ft.get_word_vector(word)

# Create Embedding Layer
Create embedding layer with the help of embedding matrix (that created in previous step.) as initial state of this layer and set `trainable=True` to enhance embeddings during sequence classification supervised learning.

In [27]:
from keras.initializers import Constant

embedding_layer = Embedding(
    len(vocabulary), EMBEDDING_SIZE,
    embeddings_initializer=Constant(E),
    trainable=True
)

# Define Model Architecture
I use multi-layer bi-directional lstm layer in the model architecture because the text has no order and its not a generative task.
 
**Bidirectional Long-Short Term Memory(LSTM)** is the process of making any neural network to have the sequence information in both directions backwards (future to past) or forward(past to future).

In [28]:
n_class = len(categories)
model = tf.keras.Sequential([
    vectorizer,
    embedding_layer,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(EMBEDDING_SIZE, return_sequences=True), merge_mode="ave"),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(EMBEDDING_SIZE), merge_mode="ave"),
    tf.keras.layers.Dense(EMBEDDING_SIZE, activation="relu"),
    tf.keras.layers.Dense(n_class),
    tf.keras.layers.Softmax()
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, None)             0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, None, 100)         689200    
                                                                 
 bidirectional (Bidirectiona  (None, None, 100)        160800    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 100)              160800    
 nal)                                                            
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                        

# Compile Model

Because of **imbalanced weights of categories** in the label column, I use **ROC-AUC** (Area under the curve of ROC diagram) and **Precision & Recall** as metrics for model performance evaluation. 

Also due to the different importance of *Informative* category relative to *Non-Informative* in this context, I use the **Weighted Cross-Entropy Loss** trick described in the following section.


## Weighted Cross-Entropy Loss

This is like traditional cross-entropy loss except that the weight term $w$ , allows one to trade off recall and precision by up- or down-weighting the cost of a positive error relative to a negative error.

A value $w$ > 1 decreases the false negative count, hence increasing the recall. Conversely setting $w$ < 1 decreases the false positive count and increases the precision. This can be seen from the fact that $w$ is introduced as a multiplicative coefficient for the positive labels term in the loss expression:

$$
  −(w.y \log(p)+(1−y)\log(1−p))
$$

Due to the different importance of *Informative* category relative to *Non-Informative*, I set $w$ hyperparameter to 5. (`INFORMATIVE_CLASS_WEIGHT=5`)

In [29]:
from keras.metrics import CategoricalAccuracy, AUC, Recall, RecallAtPrecision, Precision, PrecisionAtRecall, CategoricalCrossentropy
from keras.losses import BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy
import tensorflow_addons as tfa
from keras.optimizers import Nadam
from tensorflow.nn import weighted_cross_entropy_with_logits
from keras import backend as K

def WeightedCrossEntropy(labels, logits):
    labels = tf.cast(labels, logits.dtype)
    wce = weighted_cross_entropy_with_logits(labels[:, 1], logits[:, 1], INFORMATIVE_CLASS_WEIGHT, name="weighted_cross_entropy")
    return tf.reduce_mean(wce, axis=-1)  # Note the `axis=-1`


Compile model with the following:
- Loss Criteria: Weighted Cross Entropy 
- Optimizer: Nadam 
- metrics: accuracy, roc-auc, recall and precision

In [30]:
model.compile(loss=WeightedCrossEntropy,
              optimizer=Nadam(learning_rate=LEARNING_RATE),
              metrics=[
                  CategoricalAccuracy(name="accuracy"),
                  AUC(name="ROC-AUC"),
                  Recall(name="recall", class_id=1),
                  Precision(name="precision", class_id=1),
              ]
)

# Login to WandB 

In [31]:
import os

os.environ["WANDB_API_KEY"] = "1d6bdaf3f9f088abf0915e5e5cb6689e4c7e7476"
import wandb
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msamousavizade[0m ([33mcausal-inference[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

# Initialize WandB Client

In [32]:
LEARNING_RATE = LEARNING_RATE
INFORMATIVE_CLASS_WEIGHT = INFORMATIVE_CLASS_WEIGHT
BATCH_SIZE = BATCH_SIZE
EPOCHS = EPOCHS
EMBEDDING_SIZE = EMBEDDING_SIZE

BUFFER_SIZE = BUFFER_SIZE
MAX_VOCAB_SIZE = MAX_VOCAB_SIZE 

wandb.init(
  project="CoffeeBazaarSeqClassification ",
  entity="samousavizade",
  name=f"bi-LSTM LR:{LEARNING_RATE} B:{BATCH_SIZE} W:{INFORMATIVE_CLASS_WEIGHT} E:{EMBEDDING_SIZE}",
  config={
      "learning_rate": LEARNING_RATE,
      "informative_class_weight": INFORMATIVE_CLASS_WEIGHT,
      "batch_size": BATCH_SIZE,
      "epochs": EPOCHS,

      "embedding_size": EMBEDDING_SIZE,
      "buffer_size": BUFFER_SIZE,
      "max_vocab_size": MAX_VOCAB_SIZE
  })

config = wandb.config

from wandb.keras import WandbMetricsLogger, WandbModelCheckpoint

metric_logger = WandbMetricsLogger(log_freq="epoch")
model_checkpoint = WandbModelCheckpoint(filepath="my_model_{epoch:02d}", monitor="val_loss", mode="min", verbose=0, save_best_only=True, save_freq=10)

[34m[1mwandb[0m: Currently logged in as: [33msamousavizade[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Train and Validation Phase

In [33]:
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=EPOCHS,
    workers=4,
    use_multiprocessing=True,
    callbacks=[metric_logger,]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# Terminate WandB Session

In [34]:
wandb.finish()

0,1
epoch/ROC-AUC,▁▂▅▇▇▇▇▇█████████▇██████████████████████
epoch/accuracy,▁▁▅▇▇▇▇▇▇▇█▇▇▇▇▇█▇██████████████████████
epoch/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
epoch/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/loss,█▇▆▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch/precision,▁▁▄▆▆▆▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇██▇██▇█▇██████████
epoch/recall,▅█▃▁▁▂▂▂▃▂▂▃▃▃▃▃▃▃▃▃▃▄▃▃▃▃▃▄▄▄▄▄▄▄▄▄▄▃▄▄
epoch/val_ROC-AUC,▁▂▆▇█▇▇▇▇▇▇▇▆█▇▇▇██▇▇▇▇▇█▇▇▇▇█▇▇▇▇▇█▇▇▇▇
epoch/val_accuracy,▁▃▇██▇█▇███▇▇██▇███▇▇▇████████▇██▇██▇███
epoch/val_loss,█▇▄▂▂▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▂▂▁▁▁▂

0,1
epoch/ROC-AUC,0.79221
epoch/accuracy,0.77627
epoch/epoch,49.0
epoch/learning_rate,3e-05
epoch/loss,1.15731
epoch/precision,0.60013
epoch/recall,0.88207
epoch/val_ROC-AUC,0.76147
epoch/val_accuracy,0.74167
epoch/val_loss,1.20279


# Test Phase

In [35]:
for i in range(test_case_data.shape[0]):
  test_case_text = test_case_data["text"][i]
  preprocessed_test_case = test_case_data["lemmatized_tokens"][i]
  
  print("Test Case Text:")
  print(test_case_text)
  print("Prediction:")
  print(model.predict([preprocessed_test_case, ]))
  print("*" * 80)

Test Case Text:
!!!!سلام برنامه خوبیه جدا
Prediction:
[[0.7829611  0.21703894]]
********************************************************************************
Test Case Text:
لود نمیشه اصلا!! :((((
Prediction:
[[0.2591255 0.7408745]]
********************************************************************************
Test Case Text:
پولم رو پس نمیدید چرا؟؟؟
Prediction:
[[0.4458921  0.55410796]]
********************************************************************************
Test Case Text:
بازی جالبیه.
Prediction:
[[0.71512353 0.28487647]]
********************************************************************************
Test Case Text:
خیلییییی لگ داره روی گوشیم.
Prediction:
[[0.8958438  0.10415614]]
********************************************************************************
Test Case Text:
معتاد این بازی شدم.
Prediction:
[[0.7261366  0.27386335]]
********************************************************************************
Test Case Text:
خیلی باگ داره اعصابو خورد کرده.
Predictio