# Import Required Libraries

In [495]:
import pandas as pd
import json
import re
import string
from tqdm import tqdm
from hazm import Normalizer, sent_tokenize, word_tokenize, Stemmer, Lemmatizer, POSTagger, Chunker, tree2brackets, DependencyParser, stopwords_list

from collections import Counter

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout

import numpy as np

import tensorflow as tf

# Read Json Data

In [496]:
path = "./dataset_annotated_finance.json"

file = open(path)
data = json.load(file)

In [497]:
train = data["train"]
validation = data["eval"]
test = data["test"]

text_column_name = "Text"
label_column_name = "Label"

len(train), len(validation), len(test)

(1450, 75, 75)

In [498]:
def majority_vote(input_list):
    # counter = {
    #     "positive" : 0,
    #     "negative" : 0,
    #     "neutral" : 0,
    # }
    # for vote in input_list:
    #     if "مثبت" in vote:
    #         counter["positive"] += 1
    #     elif "منفی" in vote:
    #         counter["negative"] += 1
    #     else:
    #         counter["neutral"] += 1
    #
    # return max(counter, key=counter.get)
    return Counter(input_list).most_common(1)[0][0]


N = 2
train = filter(lambda sample: len(set(sample["annotations"])) <= N, train)
validation = filter(lambda sample: len(set(sample["annotations"])) <= N, validation)
test = filter(lambda sample: len(set(sample["annotations"])) <= N, test)

train = pd.DataFrame([[sample["text"], majority_vote(sample["annotations"])] for sample in train], columns=[text_column_name, label_column_name])
validation = pd.DataFrame([[sample["text"], majority_vote(sample["annotations"])] for sample in validation], columns=[text_column_name, label_column_name])
test = pd.DataFrame([[sample["text"], majority_vote(sample["annotations"])] for sample in test], columns=[text_column_name, label_column_name])

categories = train[label_column_name].unique()

## Text Pre-Processing

## 1. Remove URLs and HTMLs

In [499]:
url = re.compile(r"https?://\S+|www\.\S+")


def remove_URL(text):
    text = str(text)
    return url.sub(r"", text)


html = re.compile(r"<.*?>")


def remove_HTML(text):
    text = str(text)
    return html.sub(r"", text)

## 2. Remove Emojis

In [500]:
emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    text = str(text)
    return emoji_pattern.sub(r"", text)

## 3. Remove Punctuations

In [501]:
def remove_punctuations(text):
    text = str(text)
    table = str.maketrans("", "", string.punctuation)
    return text.translate(table)

## 4. Normalization

In [502]:
normalizer = Normalizer()


def normalize_text(text):
    return normalizer.normalize(text)

## 5. Remove Stopwords

In [503]:
def remove_stopwords(text):
    sws = stopwords_list().copy()
    sws = set(sws)
    sws = sws.union({})
    text = [word.lower() for word in text.split() if word.lower() not in sws]
    return " ".join(text)

## 6. Remove Numbers

In [504]:
def remove_number(text):
    text = str(text)
    result = re.sub(r'\d+', '', text)
    return result

# Pre-processing Pipline

In [505]:
def clean_up_pipeline(data):
    cleaning_functions = [
        str,
        str.strip,
        str.lower,
        remove_number,
        remove_URL,
        remove_HTML,
        remove_emoji,
        remove_punctuations,
        remove_stopwords
    ]

    for function in cleaning_functions:
        data.loc[:, text_column_name] = data[text_column_name].map(function)

    return data


train = clean_up_pipeline(train)
validation = clean_up_pipeline(validation)
test = clean_up_pipeline(test)

## Convert Data Labels to Dummy Variable

to compare with predicted probabilities for each category that model outputs in softmax layer.

In [506]:
train[label_column_name].value_counts()

خنثی               1060
غیر مستقیم مثبت     146
غیر مستقیم منفی      91
مستقیم مثبت          48
مستقیم منفی          34
Name: Label, dtype: int64

In [507]:
validation[label_column_name].value_counts()

خنثی               47
غیر مستقیم مثبت     9
غیر مستقیم منفی     8
مستقیم مثبت         7
Name: Label, dtype: int64

In [508]:
test[label_column_name].value_counts()

خنثی               50
غیر مستقیم مثبت    10
غیر مستقیم منفی     6
مستقیم مثبت         2
مستقیم منفی         2
Name: Label, dtype: int64

In [509]:
train_labels = pd.get_dummies(train[label_column_name].astype(pd.CategoricalDtype(categories=categories)))
validation_labels = pd.get_dummies(validation[label_column_name].astype(pd.CategoricalDtype(categories=categories)))
test_labels = pd.get_dummies(test[label_column_name].astype(pd.CategoricalDtype(categories=categories)))

## Build Train, Validation and Test Dataset
build train, validation and test dataset tensorflow object from modified texts and labels.

In [510]:
BUFFER_SIZE = 256
BATCH_SIZE = 64
MAX_VOCAB_SIZE = 25000

In [511]:
train_dataset = tf.data.Dataset.from_tensor_slices((train[text_column_name].values, train_labels.values)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
validation_dataset = tf.data.Dataset.from_tensor_slices((validation[text_column_name].values, validation_labels.values)).batch(BATCH_SIZE)

# Train Pipeline

## Use Pre-trained FastText Embedding Vectors

download farsi words embedding vectors .bin from [FastText Repository](https://fasttext.cc/docs/en/crawl-vectors.html) and place in working directory. (for persian its 'cc.fa.300.bin'), alternatively mentioned .bin embedding vectors file can be downloaded with the following command (first change directory to directory that fasttext is installed):
``
./download_model.py fa     # Farsi
``

## Load FastText Model

load fasttext model .bin file from disk.

In [512]:
import fasttext
from fasttext.util import reduce_model

# ft = fasttext.load_model('cc.fa.300.bin', )

## Build Embedding Matrix

build embedding matrix using pre-trained fasttext embedding.

In [513]:
vectorizer = tf.keras.layers.TextVectorization(max_tokens=MAX_VOCAB_SIZE)
vectorizer.adapt(train[text_column_name].values)
vocabulary = vectorizer.get_vocabulary()

EMBEDDING_SIZE = ft.get_dimension()

E = np.zeros((len(vocabulary), EMBEDDING_SIZE))
for i, word in enumerate(vocabulary):
    E[i] = ft.get_word_vector(word)

## Build Embedding Layer

build embedding layer with the help of embedding matrix as initialize state. (computed in the previous step)

In [514]:
from keras.initializers import Constant

embedding_layer = Embedding(
    len(vocabulary), EMBEDDING_SIZE,
    embeddings_initializer=Constant(E),
    trainable=True
)

## Define Model Architecture

We use bidirectional lstm layer in this project, because of the text has no order.

Bidirectional long-short term memory(Bidirectional LSTM) is the process of making any neural network to have the sequence information in both directions backwards (future to past) or forward(past to future). 

In [515]:
model = tf.keras.Sequential([
    vectorizer,
    embedding_layer,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(EMBEDDING_SIZE)),
    tf.keras.layers.Dense(EMBEDDING_SIZE, activation="relu"),
    tf.keras.layers.Dense(5),
    tf.keras.layers.Softmax()
])

model.summary()

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_20 (Text  (None, None)             0         
 Vectorization)                                                  
                                                                 
 embedding_22 (Embedding)    (None, None, 300)         4787400   
                                                                 
 bidirectional_18 (Bidirecti  (None, 600)              1442400   
 onal)                                                           
                                                                 
 dense_42 (Dense)            (None, 300)               180300    
                                                                 
 dense_43 (Dense)            (None, 5)                 1505      
                                                                 
 softmax_18 (Softmax)        (None, 5)               

## Compile Model

because of imbalanced weights of categories in label column, we use AUC (Area under the Curve and F1) & F1-Score as metrics for model performance evaluation.

In [516]:
from keras import backend as K

def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def F1(y_true, y_pred):
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2 * ((precision*recall) / (precision + recall + K.epsilon()))


In [517]:
from keras.metrics import CategoricalAccuracy, AUC
from keras.losses import CategoricalCrossentropy, SparseCategoricalCrossentropy
from keras.optimizers import Adam, RMSprop, Nadam

model.compile(loss=CategoricalCrossentropy(from_logits=True, name="CCELoss"),
              optimizer=Nadam(learning_rate=1e-4),
              metrics=[CategoricalAccuracy(name="Accuracy"), AUC(name="AUC"), F1])


## Class Weights

weights of categories in training dataset are totally imbalanced therefore it is necessary to set class weight in training phase.

The 'balanced' heuristic is inspired by Logistic Regression in Rare Events Data, King, Zen, 2001.

In [518]:
from sklearn.utils import class_weight

labels_codes = np.argmax(train_labels.values, axis=1)
weights = class_weight.compute_class_weight(class_weight='balanced',
                                            classes=np.unique(labels_codes),
                                            y=labels_codes)

class_weights = {}
for class_index, weight in enumerate(weights):
    class_weights[class_index] = weight

## Train, Validation Phase

In [519]:
import mlflow

mlflow.tensorflow.autolog(log_models=False)

history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    class_weight=class_weights,
    epochs=10,
    workers=4,
    use_multiprocessing=True
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




## Test Phase

In [520]:
model.evaluate(test[text_column_name].values, test_labels.values)



[1.2480436563491821,
 0.6571428775787354,
 0.8764795660972595,
 0.6249104142189026]

In [521]:
predicted_probabilities_test_labels = model.predict(validation[text_column_name].values)
predicted_test_labels = np.argmax(predicted_probabilities_test_labels, axis=1)

