# Language Models evaluated on the data of Talafeef

## Trained, tested, evaluated models using classification metrics
  - CRF Model: (crf_model.sav)
  - LSTM (RNN) Model: (RNN-model.h5, RNN_tag2index.pkl, RNN_word2index)

## Word embeddings using Skip-Gram, CBOW, and araBERTv02:
  - Skip-Gram Model: (SkipGram_model.pt)
  - CBOW Model: (cbow_model.h5 and CBOW_Embeddings.npz)
  - araBERTv02 (source: https://huggingface.co/aubmindlab/bert-base-arabertv2): (token_vecs_cat_array.pkl, token_vecs_cat_array.npz, tokenized_text.pkl)

## Fine-tuned pre-trained araBERTv02 Model on the Project-specific Data
  - fine_tuned_arabertv02.zip

In [None]:
!pip install lazyme

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lazyme
  Downloading lazyme-0.0.27.tar.gz (6.3 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lazyme
  Building wheel for lazyme (setup.py) ... [?25l[?25hdone
  Created wheel for lazyme: filename=lazyme-0.0.27-py3-none-any.whl size=8606 sha256=8eb3a44cec218cb1c5d076920621b46d6003ed030abe2778949f3aabeb458ad8
  Stored in directory: /root/.cache/pip/wheels/69/d3/f1/ba84128432d5b0c2408d2ae86ea09d743e28e4d50aca31d39e
Successfully built lazyme
Installing collected packages: lazyme
Successfully installed lazyme-0.0.27


# 1. 1. Random Fields (CRF)

In [None]:
from lazyme import per_section
import nltk 
from collections import Counter
import numpy as np
import pandas as pd

In [None]:
dataset = pd.read_csv("/content/Talafeef.csv",usecols=['token','tpos'])
dataset.head()
dataset.isnull().sum()

token    0
tpos     0
dtype: int64

In [None]:
dataset.isnull().sum()

token    0
tpos     0
dtype: int64

In [None]:
dataset['tpos'].apply(lambda x:type(x)).value_counts()

<class 'str'>    61317
Name: tpos, dtype: int64

In [None]:
arabic_period_count = dataset.query('token=="."').shape[0]
print(f"Number of Periods: {arabic_period_count}")

Number of Periods: 1412


In [None]:
#end of the sentences
end_sent_index = dataset.query('token=="."').index

In [None]:
# Splitting the data into sentences using the period (.) as the unique identifier
sentences = []
sentence = []
for index, row in dataset.iterrows():
    sentence.append((row['token'], row['tpos']))  # Add both token and tpos as a tuple
    if row['token'] == '.':
        sentences.append(sentence)
        sentence = []

In [None]:
arabic_comma_sentences = [s for s in sentences if any(t[0] == '،' for t in s)]
print(arabic_comma_sentences[:10])

[[('مثال', 'N'), ('الصورة', 'N'), ('34', 'CD'), (':', 'PUNC'), ('صورتان', 'N'), ('شعاعيتان', 'N'), ('متطابقتان', 'JJ'), ('ل', 'IN'), ('القسم', 'N'), ('العلوي', 'JJ'), ('من', 'IN'), ('البطن', 'N'), ('تظهران', 'VBP'), ('غاز', 'N'), ('في', 'IN'), ('الشجرة', 'N'), ('الصفراوية', 'JJ'), ('،', 'PUNC'), ('يوجد', 'VBP'), ('تفرع', 'N'), ('خطوط', 'N'), ('سوداء', 'JJ'), ('غاز', 'N'), ('بارز', 'JJ'), ('نحو', 'N'), ('مركز', 'N'), ('الكبد', 'N'), ('أكبر', 'JJR'), ('و', 'CC'), ('أشد', 'JJR'), ('بروزا', 'N'), ('نحو', 'N'), ('السرة', 'N'), ('،', 'PUNC'), ('و', 'CC'), ('يوجد', 'VBP'), ('أيضا', 'N'), ('شبكة', 'N'), ('صفراوية', 'JJ'), ('واقعة', 'JJ'), ('على', 'IN'), ('الخط', 'N'), ('الناصف', 'JJ'), ('الأسهم', 'N'), ('البيضاء', 'N'), ('و', 'CC'), ('التي', 'WP'), ('تتوضع', 'VBP'), ('في', 'IN'), ('داخل', 'JJ'), ('القناة', 'N'), ('الصفراوية', 'JJ'), ('المشتركة', 'JJ'), ('و', 'CC'), ('هذا', 'DT'), ('يفسر', 'VBP'), ('أين', 'WRB'), ('يستطيع', 'VBP'), ('الغاز', 'N'), ('أن', 'RP'), ('ينتقل', 'VBP'), ('ب', 'IN'), ('

In [None]:
print(len(sentences))

1412


In [None]:
dataset_size = np.arange(len(sentences))
train_index = np.random.choice(dataset_size, int(len(sentences) * 0.8), replace=False)
test_index = np.setdiff1d(dataset_size, train_index)

In [None]:
train_index.shape

(1129,)

In [None]:
test_index.shape

(283,)

In [None]:
training_sentences = [sentences[i] for i in train_index]
test_sentences = [sentences[i] for i in test_index]

In [None]:
print (training_sentences[0])

[('اللهم', 'N'), ('من', 'RP'), ('أراد', 'VBD'), ('بلادنا', 'N'), ('و', 'CC'), ('ديننا', 'N'), ('و', 'CC'), ('عقيدتنا', 'N'), ('و', 'CC'), ('أمننا', 'N'), ('ب', 'IN'), ('سوء', 'N'), ('و', 'CC'), ('كيد', 'N'), ('ف', 'CC'), ('اردد', 'VBP'), ('كيده', 'N'), ('في', 'IN'), ('نحره', 'N'), ('و', 'CC'), ('أشغله', 'VB'), ('ب', 'IN'), ('نفسه', 'N'), ('يا', 'RP'), ('رب', 'N'), ('العالمين', 'N'), ('.', 'PUNC')]


In [None]:
def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        
        'prefix-1': sentence[index][0],
        #'prefix-2': sentence[index][:2],
        #'prefix-3': sentence[index][:3],
        #'prefix-4': sentence[index][:4],
        'suffix-1': sentence[index][-1],
        #'suffix-2': sentence[index][-2:],
        #'suffix-3': sentence[index][-3:],

        'prev_word1': '' if index == 0 else sentence[index - 1],
        
        'next_word1': '' if index == len(sentence) - 1 else sentence[index + 1],        
        
        'is_numeric': sentence[index].isdigit(),
    }

In [None]:
from nltk.tag.util import untag

def transform_to_dataset(tagged_sentences):
    X, y = [], []

    for tagged in tagged_sentences:
        try:
            X.append([features(untag(tagged), index) for index in range(len(tagged))])
            y.append([tag for _, tag in tagged])
        except ValueError:
            print(f"Problematic sentence: {tagged}")

    return X, y

X_train, y_train = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(test_sentences)

print(len(X_train))
print(len(X_test))

1129
283


In [None]:
!pip install sklearn_crfsuite

import time
start_time = time.time()

from sklearn_crfsuite import CRF
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
 
crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
    c1=0.01,
    c2=0.01,
    max_iterations=100000,
    all_possible_transitions=True)
crf.fit(X_train, y_train)

print("--- %s seconds ---" % (time.time() - start_time))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.9 sklearn_crfsuite-0.3.6
--- 38.61729884147644 seconds ---


In [None]:
labels = list(crf.classes_)
labels

['N',
 'RP',
 'VBD',
 'CC',
 'IN',
 'VBP',
 'VB',
 'PUNC',
 'CD',
 'ABBREV',
 'FW',
 'JJ',
 'DT',
 'RB',
 'JJR',
 'WP',
 'PRP',
 'WRB',
 'VBN']

In [None]:
y_pred = crf.predict(X_test)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
a = mlb.fit_transform([crf.classes_])
y_test_matrix = mlb.transform(y_test)
y_pred_matrix = mlb.transform(y_pred)

In [None]:
from sklearn import metrics
print(metrics.classification_report(y_test_matrix, y_pred_matrix, target_names=crf.classes_, digits=3))

              precision    recall  f1-score   support

           N      1.000     0.867     0.929        30
          RP      0.996     1.000     0.998       239
         VBD      0.981     1.000     0.990        52
          CC      1.000     0.989     0.994        88
          IN      1.000     0.800     0.889         5
         VBP      1.000     1.000     1.000       256
          VB      0.892     0.900     0.896       211
        PUNC      0.952     0.833     0.889        48
          CD      1.000     1.000     1.000       280
      ABBREV      1.000     0.977     0.989        44
          FW      1.000     1.000     1.000       283
          JJ      0.969     0.969     0.969        97
          DT      0.974     0.995     0.984       186
          RB      0.900     0.692     0.783        26
         JJR      0.912     0.864     0.887       132
          WP      0.500     0.176     0.261        17
         PRP      0.925     0.961     0.943       206
         WRB      0.900    

In [None]:
import time
start_time = time.time()

from sklearn_crfsuite import metrics
y_pred = crf.predict(X_test)
print("Accuracy = ",metrics.flat_accuracy_score(y_test, y_pred))
print ("Recall =", metrics.flat_recall_score(y_test, y_pred, average='weighted',  labels=labels))
print ("Precision =", metrics.flat_precision_score(y_test, y_pred, average='weighted',  labels=labels))
print ("F1 =", metrics.flat_f1_score(y_test, y_pred, average='weighted',  labels=labels))

print("--- %s seconds ---" % (time.time() - start_time))

Accuracy =  0.9262727272727272
Recall = 0.9262727272727272
Precision = 0.9241297653976086
F1 = 0.923770190122457
--- 0.19359421730041504 seconds ---


In [None]:
y_pred = crf.predict(X_test)
print ("F1 =", metrics.flat_f1_score(y_test, y_pred, average='weighted',  labels=labels))

F1 = 0.923770190122457


In [None]:
sentence = 'الله الرحمن الرحيم .'.split()
def pos_tag(sentence):
    sentence_features = [features(sentence, index) for index in range(len(sentence))]
    return list(zip(sentence, crf.predict([sentence_features])[0]))
 
print(pos_tag(sentence))

[('الله', 'N'), ('الرحمن', 'N'), ('الرحيم', 'JJ'), ('.', 'PUNC')]


# 1. 2. Save CRF

In [None]:
import pickle
filename = 'crf_model.sav'
pickle.dump(crf, open(filename, 'wb'))

In [None]:
import joblib
# Fit the CRF model
crf.fit(X_train, y_train)
# Save the CRF model
joblib.dump(crf, '/content/crf_model.sav')

['/content/crf_model.sav']

In [None]:
import pickle

# Save the CRF model
with open('/content/crf_model.pkl', 'wb') as f:
    pickle.dump(crf, f)

# Load the CRF model
with open('/content/crf_model.pkl', 'rb') as f:
    crf_model = pickle.load(f)

# Hyperparameter tuning using grid Search for CRF

# 2. 1. نموذج الشبكات العصبية المتكررة (شبكات الذَّاكرة القصيرة-الطويلة المدى)  
## RNN (LSTM)

In [None]:
!pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding, Activation
from tensorflow.keras.optimizers import Adam

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Read the dataset
dataset = pd.read_csv("/content/Talafeef.csv", usecols=['token', 'tpos'])

# Process the dataset to extract sentences and tags
sentences, tags = [], []
sentence, tag = [], []
for _, row in dataset.iterrows():
    if row["token"] == '.':
        sentences.append(sentence)
        tags.append(tag)
        sentence, tag = [], []
    else:
        sentence.append(row["token"])
        tag.append(row["tpos"])

# Add the last sentence if it doesn't end with a period
if len(sentence) > 0:
    sentences.append(sentence)
    tags.append(tag)

# Print the number of sentences
print("Number of sentences:", len(sentences))

# Get the first sentence and the first word and tag
if len(sentences) > 0 and len(sentences[0]) > 0:
    first_sentence = sentences[0]
    first_word, first_tag = first_sentence[0], tags[0][0]
    print("First word:", first_word)
    print("First tag:", first_tag)
else:
    print("No sentences found in the dataset.")


Number of sentences: 1412
First word: إذا
First tag: RP


In [None]:
sentences, sentence_tags = [], []
sentence, tags = [], []
for index, row in dataset.iterrows():
    sentence.append(row['token'])
    tags.append(row['tpos'])
    if row['token'] == '.':
        sentences.append(np.array(sentence))
        sentence_tags.append(np.array(tags))
        sentence, tags = [], []

In [None]:
sentences[2]

array(['و', 'حاليا', 'ف', 'إن', 'معظم', 'المرضى', 'غير', 'المستقرين',
       'الذين', 'يشك', 'لديهم', 'ب', 'استرواح', 'الصفاق', 'يخضعون',
       'مباشرة', 'ل', 'التصوير', 'المقطعي', 'المحوسب', '.'], dtype='<U9')

In [None]:
sentence_tags[2]

array(['CC', 'N', 'CC', 'RP', 'JJ', 'N', 'N', 'JJ', 'WP', 'VBP', 'RB',
       'IN', 'N', 'N', 'VBP', 'JJ', 'IN', 'N', 'JJ', 'JJ', 'PUNC'],
      dtype='<U4')

In [None]:
len(sentences[1]), len(sentence_tags[1000])

(26, 56)

In [None]:
(train_sentences, 
 test_sentences, 
 train_tags, 
 test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2)

In [None]:
words, tags = set([]), set([])
 
for s in train_sentences:
    for w in s:
        words.add(w)
 
for ts in train_tags:
    for t in ts:
        tags.add(t)
 
word2index = {w: i + 2 for i, w in enumerate(list(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs
 
tag2index = {t: i + 1 for i, t in enumerate(list(tags))}
tag2index['-PAD-'] = 0  # The special value used to padding

In [None]:
len(word2index)

10379

In [None]:
tag2index

{'VB': 1,
 'IN': 2,
 'JJ': 3,
 'VBN': 4,
 'JJR': 5,
 'PRP': 6,
 'VBD': 7,
 'CD': 8,
 'RB': 9,
 'DT': 10,
 'WRB': 11,
 'N': 12,
 'RP': 13,
 'ABBREV': 14,
 'VBP': 15,
 'CC': 16,
 'FW': 17,
 'PUNC': 18,
 'WP': 19,
 '-PAD-': 0}

In [None]:
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = [], [], [], []
 
for s in train_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    train_sentences_X.append(s_int)
 
for s in test_sentences:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
 
    test_sentences_X.append(s_int)
 
for s in train_tags:
    train_tags_y.append([tag2index[t] for t in s])
 
for s in test_tags:
    test_tags_y.append([tag2index[t] for t in s])

In [None]:
print(train_sentences_X[0])
print(test_sentences_X[0])
print(train_tags_y[0])
print(test_tags_y[0])

[466, 5610, 3270, 5097, 5605, 4841, 1358, 9773, 9508, 4792, 4808, 9773, 9688, 2266, 466, 10083, 1905, 3718, 5648, 4519, 9241, 8076, 9640, 610, 3608, 2467, 4132, 6257, 466, 1128, 6257, 1358, 3385, 9042, 3609, 534, 1791, 7050, 10083, 9177, 610, 2452, 4807, 4682, 2499, 3385, 5648, 5616]
[4409, 10041, 466, 1, 10028, 3948, 185, 8982, 10091, 6573, 10091, 6406, 2709, 5668, 729, 10041, 4253, 3948, 1, 5533, 6062, 3014, 133, 6406, 8386, 1298, 1, 6406, 4505, 3168, 4253, 5123, 9390, 10083, 4038, 1, 2709, 3948, 1992, 8323, 4673, 3785, 303, 1, 10041, 3608, 129, 10083, 8310, 1085, 7210, 3948, 7420, 1, 5616]
[16, 13, 7, 12, 12, 12, 18, 2, 12, 10, 12, 2, 12, 12, 16, 2, 12, 15, 12, 2, 3, 12, 3, 2, 13, 15, 12, 3, 16, 12, 3, 18, 2, 12, 2, 12, 12, 12, 2, 12, 2, 12, 12, 12, 12, 2, 12, 18]
[12, 12, 16, 15, 12, 16, 15, 9, 12, 12, 12, 12, 12, 13, 15, 12, 7, 16, 7, 3, 11, 6, 12, 12, 12, 16, 12, 12, 3, 7, 7, 16, 7, 2, 9, 12, 12, 16, 7, 2, 12, 2, 13, 1, 12, 13, 2, 11, 12, 12, 11, 16, 13, 15, 18]


In [None]:
MAX_LENGTH = max([len(s) for s in train_sentences_X])
MAX_LENGTH

3105

In [None]:
train_sentences_X = pad_sequences(train_sentences_X, maxlen=MAX_LENGTH, padding='post')
test_sentences_X = pad_sequences(test_sentences_X, maxlen=MAX_LENGTH, padding='post')
train_tags_y = pad_sequences(train_tags_y, maxlen=MAX_LENGTH, padding='post')
test_tags_y = pad_sequences(test_tags_y, maxlen=MAX_LENGTH, padding='post')

In [None]:
len(train_sentences_X[0]),len(train_sentences_X[5])

(3105, 3105)

In [None]:
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy'])

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3105, 128)         1328512   
                                                                 
 bidirectional (Bidirectiona  (None, 3105, 512)        788480    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 3105, 20)         10260     
 ibuted)                                                         
                                                                 
 activation (Activation)     (None, 3105, 20)          0         
                                                                 
Total params: 2,127,252
Trainable params: 2,127,252
Non-trainable params: 0
_________________________________________________________________


In [None]:
def to_categorical(sequences, categories):
    cat_sequences = []
    for s in sequences:
        cats = []
        for item in s:
            cats.append(np.zeros(categories))
            cats[-1][item] = 1.0
        cat_sequences.append(cats)
    return np.array(cat_sequences)

In [None]:
cat_train_tags_y = to_categorical(train_tags_y, len(tag2index))

In [None]:
cat_train_tags_y.shape

(1129, 3105, 20)

In [None]:
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=40, validation_split=0.2)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f81ab899b50>

In [None]:
scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")

accuracy: 99.33493733406067


In [None]:
test_samples = [
    "إذا تم إجراء صورة شعاعية ل البطن و صورة شعاعية ل الصدر ب وضعية الوقوف و مازال تشخيص استرواح الصفاق غير مؤكد ف عندها يجب طلب تصوير مقطعي محوسب ل البطن .".split(),
]
print(test_samples)

[['إذا', 'تم', 'إجراء', 'صورة', 'شعاعية', 'ل', 'البطن', 'و', 'صورة', 'شعاعية', 'ل', 'الصدر', 'ب', 'وضعية', 'الوقوف', 'و', 'مازال', 'تشخيص', 'استرواح', 'الصفاق', 'غير', 'مؤكد', 'ف', 'عندها', 'يجب', 'طلب', 'تصوير', 'مقطعي', 'محوسب', 'ل', 'البطن', '.']]


In [None]:
test_samples_X = []
for s in test_samples:
    s_int = []
    for w in s:
        try:
            s_int.append(word2index[w.lower()])
        except KeyError:
            s_int.append(word2index['-OOV-'])
    test_samples_X.append(s_int)
 
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')
print(test_samples_X)

[[9851 3270  992 ...    0    0    0]]


In [None]:
predictions = model.predict(test_samples_X)
print(predictions, predictions.shape)

[[[2.4239480e-02 6.7803590e-03 1.4114192e-01 ... 2.4427839e-03
   5.8488343e-02 8.2946522e-03]
  [2.1790367e-02 6.2090796e-03 1.3488978e-01 ... 2.2918703e-03
   7.0508882e-02 7.7144867e-03]
  [1.8491639e-02 5.2619022e-03 9.8328963e-02 ... 1.9654343e-03
   5.9428673e-02 6.4718835e-03]
  ...
  [9.9999106e-01 3.9185534e-07 4.0105324e-08 ... 5.7122918e-07
   2.9324117e-06 1.9954112e-07]
  [9.9998665e-01 6.2608416e-07 5.3281596e-08 ... 9.0568295e-07
   3.7275167e-06 3.1438282e-07]
  [9.9998164e-01 9.0795635e-07 6.8095623e-08 ... 1.2972095e-06
   4.5859420e-06 4.4967894e-07]]] (1, 3105, 20)


In [None]:
def logits_to_tokens(sequences, index):
    token_sequences = []
    for categorical_sequence in sequences:
        token_sequence = []
        for categorical in categorical_sequence:
            token_sequence.append(index[np.argmax(categorical)])
 
        token_sequences.append(token_sequence)
 
    return token_sequences

In [None]:
print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()})[0])

['N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'PUNC', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-',

In [None]:
from keras import backend as K
 
def ignore_class_accuracy(to_ignore=0):
    def ignore_accuracy(y_true, y_pred):
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)
 
        ignore_mask = K.cast(K.not_equal(y_pred_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [None]:
model = Sequential()
model.add(InputLayer(input_shape=(MAX_LENGTH, )))
model.add(Embedding(len(word2index), 128))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(TimeDistributed(Dense(len(tag2index))))
model.add(Activation('softmax'))
 
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(0.001),
              metrics=['accuracy', ignore_class_accuracy(0)])
 
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 3105, 128)         1328512   
                                                                 
 bidirectional_1 (Bidirectio  (None, 3105, 512)        788480    
 nal)                                                            
                                                                 
 time_distributed_1 (TimeDis  (None, 3105, 20)         10260     
 tributed)                                                       
                                                                 
 activation_1 (Activation)   (None, 3105, 20)          0         
                                                                 
Total params: 2,127,252
Trainable params: 2,127,252
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(train_sentences_X, to_categorical(train_tags_y, len(tag2index)), batch_size=128, epochs=40, validation_split=0.2)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f82680a3d60>

In [None]:
predictions = model.predict(test_sentences_X)
print(logits_to_tokens(predictions, {i: t for t, i in tag2index.items()}))



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
from sklearn.metrics import accuracy_score

# Evaluate the model
scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
print(f"{model.metrics_names[1]}: {scores[1] * 100}")

# Make predictions
predictions = model.predict(test_sentences_X)

# Convert the categorical predictions to token sequences
predicted_tags = logits_to_tokens(predictions, {i: t for t, i in tag2index.items()})

# Flatten the true and predicted tags for the test set
true_tags_1d = [tag for tags_sequence in test_tags_y for tag in tags_sequence]
predicted_tags_1d = [tag for tags_sequence in predicted_tags for tag in tags_sequence]

# Calculate and print the overall accuracy
overall_accuracy = accuracy_score(true_tags_1d, predicted_tags_1d)
print(f"Overall Accuracy: {overall_accuracy:.2f}")

accuracy: 99.33448433876038
Overall Accuracy: 0.00


  score = y_true == y_pred


# 2. 2. Save LSTM (RNN)

In [None]:
# Save Model
model.save("RNN_model.h5")

In [None]:
import pickle
a_file = open("RNN_tag2index.pkl", "wb")
pickle.dump(tag2index, a_file)
a_file.close()

In [None]:
b_file = open("RNN_word2index.pkl", "wb")
pickle.dump(word2index, b_file)
b_file.close()

# 3. 1. Word2Vec (Skip-Gram Model)

In [None]:
#!pip install --upgrade allennlp
!pip install allennlp==2.5.0
!pip install google-cloud-storag
!pip install overrides
!pip install python-bidi
!pip install arabic_reshaper

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting allennlp==2.5.0
  Downloading allennlp-2.5.0-py3-none-any.whl (681 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m681.4/681.4 kB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock<3.1,>=3.0
  Downloading filelock-3.0.12-py3-none-any.whl (7.6 kB)
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb<0.11.0,>=0.10.0
  Downloading wandb-0.10.33-py2.py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting google-cloud-storage<1.39.0,>=1.38.0
  Downloading google_cloud_storage-1.38.0-py2.py3-none-any.whl (103 kB)
[2K     [90m━━━━━━

In [None]:
import math
import random
from collections import Counter
import numpy as np
import torch
import torch.optim as optim
from allennlp.common.file_utils import cached_path
from allennlp.data.data_loaders import SimpleDataLoader
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField
from allennlp.data.instance import Instance
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.token_embedders import Embedding
from allennlp.training import GradientDescentTrainer
from overrides import overrides
from scipy.stats import spearmanr
from torch.nn import CosineSimilarity
from torch.nn import functional
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import matplotlib.pyplot as plt
%matplotlib inline
from bidi.algorithm import get_display
import arabic_reshaper

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
@DatasetReader.register("skip_gram")
class SkipGramReader(DatasetReader):
    def __init__(self, window_size=5, vocab: Vocabulary=None):
        """A DatasetReader for reading a plain text corpus and producing instances
        for the SkipGram model.
        When vocab is not None, this runs sub-sampling of frequent words as described
        in (Mikolov et al. 2013).
        """
        super().__init__()
        self.window_size = window_size
        self.reject_probs = None
        if vocab:
            self.reject_probs = {}
            threshold = 1.e-3
            token_counts = vocab._retained_counter['token_in']  # HACK
            total_counts = sum(token_counts.values())
            for _, token in vocab.get_index_to_token_vocabulary('token_in').items():
                counts = token_counts[token]
                if counts > 0:
                    normalized_counts = counts / total_counts
                    reject_prob = 1. - math.sqrt(threshold / normalized_counts)
                    reject_prob = max(0., reject_prob)
                else:
                    reject_prob = 0.
                self.reject_probs[token] = reject_prob

    def _subsample_tokens(self, tokens):
        """Given a list of tokens, runs sub-sampling.
        Returns a new list of tokens where rejected tokens are replaced by Nones.
        """
        new_tokens = []
        for token in tokens:
            reject_prob = self.reject_probs.get(token, 0.)
            if random.random() <= reject_prob:
                new_tokens.append(None)
            else:
                new_tokens.append(token)

        return new_tokens

    @overrides
    def _read(self, file_path: str):
        with open(cached_path(file_path), "r") as text_file:
            for line in text_file:
                tokens = line.strip().split(' ')
                tokens = tokens[:1000000]  # TODO: remove

                if self.reject_probs:
                    tokens = self._subsample_tokens(tokens)
                    print(tokens[:200])  # for debugging

                for i, token in enumerate(tokens):
                    if token is None:
                        continue

                    token_in = LabelField(token, label_namespace='token_in')

                    for j in range(i - self.window_size, i + self.window_size + 1):
                        if j < 0 or i == j or j > len(tokens) - 1:
                            continue

                        if tokens[j] is None:
                            continue

                        token_out = LabelField(tokens[j], label_namespace='token_out')
                        yield Instance({'token_in': token_in, 'token_out': token_out})

In [None]:
EMBEDDING_DIM = 256
BATCH_SIZE = 256
CUDA_DEVICE = -1

In [None]:
class SkipGramModel(Model):
    def __init__(self, vocab, embedding_in):
        super().__init__(vocab)
        self.embedding_in = embedding_in
        self.linear = torch.nn.Linear(
            in_features=EMBEDDING_DIM,
            out_features=vocab.get_vocab_size('token_out'),
            bias=False)

    def forward(self, token_in, token_out):
        embedded_in = self.embedding_in(token_in)
        logits = self.linear(embedded_in)
        loss = functional.cross_entropy(logits, token_out)

        return {'loss': loss}

In [None]:
def get_related(token: str, embedding: Model, vocab: Vocabulary, num_synonyms: int = 10):
    """Given a token, return a list of top N most similar words to the token."""
    token_id = vocab.get_token_index(token, 'token_in')
    token_vec = embedding.weight[token_id]
    cosine = CosineSimilarity(dim=0)
    sims = Counter()

    for index, token in vocab.get_index_to_token_vocabulary('token_in').items():
        sim = cosine(token_vec, embedding.weight[index]).item()
        sims[token] = sim

    return sims.most_common(num_synonyms)

In [None]:
reader = SkipGramReader()
talafeef = reader.read("/content/Talafeef--Seg.txt")

In [None]:
talafeef = list(talafeef)
print(len(talafeef))



591610


In [None]:
vocab = Vocabulary.from_instances(
    talafeef, min_count={'token_in': 2, 'token_out': 2},max_vocab_size=100000)

building vocab: 100%|##########| 591610/591610 [00:01<00:00, 389733.30it/s]


In [None]:
data_loader = SimpleDataLoader(talafeef, batch_size=BATCH_SIZE)
data_loader.index_with(vocab)

In [None]:
vocab.get_vocab_size

<bound method Vocabulary.get_vocab_size of Vocabulary with namespaces:  token_in, Size: 11421 || token_out, Size: 11421 || Non Padded Namespaces: {'*labels', '*tags'}>

In [None]:
embedding_in = Embedding(num_embeddings=vocab.get_vocab_size('token_in'),
                         embedding_dim=EMBEDDING_DIM)

In [None]:
model = SkipGramModel(vocab=vocab,
                      embedding_in=embedding_in)

In [None]:
optimizer = optim.Adam(model.parameters())

In [None]:
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=data_loader,
    num_epochs=5,
    cuda_device=-1)

In [None]:
trainer.train()

batch_loss: 8.8472, loss: 8.2816 ||: 100%|##########| 2311/2311 [07:49<00:00,  4.92it/s]
batch_loss: 8.4069, loss: 7.2477 ||: 100%|##########| 2311/2311 [07:31<00:00,  5.12it/s]
batch_loss: 8.1822, loss: 7.0402 ||: 100%|##########| 2311/2311 [07:37<00:00,  5.05it/s]
batch_loss: 7.8415, loss: 6.8903 ||: 100%|##########| 2311/2311 [07:59<00:00,  4.82it/s]
batch_loss: 7.6669, loss: 6.6898 ||: 100%|##########| 2311/2311 [07:50<00:00,  4.91it/s]


{'best_epoch': 4,
 'peak_worker_0_memory_MB': 1235.10546875,
 'peak_gpu_0_memory_MB': 0,
 'training_duration': '0:38:49.447625',
 'epoch': 4,
 'training_loss': 6.689805425745437,
 'training_worker_0_memory_MB': 1235.10546875,
 'training_gpu_0_memory_MB': 0.0}

# 3. 2. Save Skip-Gram Model

In [None]:
torch.save(model, r"SkipGram_model.pt")
#with open("model.th", 'wb') as f:
#    torch.save(model.state_dict(), f)
#trainer.save('model.pt')

In [None]:
print(get_related('العلوم', embedding_in, vocab))

[('العلوم', 1.0), ('الممارسة', 0.8805922865867615), ('التاريخي', 0.8782991766929626), ('البعد', 0.873658299446106), ('العلاقة', 0.8732448220252991), ('العقلاني', 0.8651537299156189), ('دراسات', 0.8614330887794495), ('إدراك', 0.8558136224746704), ('سرقة', 0.8554249405860901), ('أسماء', 0.8533512353897095)]


In [None]:
print(get_related('محمد', embedding_in, vocab))

[('محمد', 1.0), ('بارك', 0.820411741733551), ('آله', 0.7994131445884705), ('رسولك', 0.7942121624946594), ('إبراهيم', 0.76529860496521), ('عبدك', 0.7583552598953247), ('نبينا', 0.7350561022758484), ('صل', 0.7037534713745117), ('صلاة', 0.7007614374160767), ('أصحابه', 0.6974852085113525)]


In [None]:
print(get_related('السعودية', embedding_in, vocab))

[('السعودية', 1.0), ('العربية', 0.8396695852279663), ('المملكة', 0.8040644526481628), ('ملك', 0.7038750648498535), ('عبدالعزيز', 0.6806778311729431), ('سعود', 0.6583635210990906), ('الحكم', 0.6306524872779846), ('آل', 0.627746045589447), ('الرسمية', 0.6237347722053528), ('الاطلاع', 0.6213890314102173)]


In [None]:
print(get_related('النظام', embedding_in, vocab))

[('النظام', 1.0), ('أحكام', 0.7862368822097778), ('تعديل', 0.7843925356864929), ('الرسمية', 0.7344747185707092), ('الأساسي', 0.7308825254440308), ('الجريدة', 0.7304423451423645), ('اللائحة', 0.7206048369407654), ('تعدل', 0.7172296643257141), ('اختصاصاته', 0.705812394618988), ('ثالثا', 0.6978699564933777)]


In [None]:
print(get_related('الشباب', embedding_in, vocab))

[('الشباب', 1.0), ('مهارات', 0.8578124046325684), ('المهارة', 0.850523829460144), ('الوقوع', 0.8404920101165771), ('سوق', 0.8345710635185242), ('الطاقة', 0.83315509557724), ('مطلوبة', 0.8318454623222351), ('القطاعين', 0.8315698504447937), ('الموجودة', 0.8243515491485596), ('جميعها', 0.823792040348053)]


In [None]:
token_id = vocab.get_token_index("الله", 'token_in')
token_vec = embedding_in.weight[token_id]

In [None]:
cosine = CosineSimilarity(dim=0)

In [None]:
vocab.get_vocab_size

<bound method Vocabulary.get_vocab_size of Vocabulary with namespaces:  token_in, Size: 11421 || token_out, Size: 11421 || Non Padded Namespaces: {'*labels', '*tags'}>

In [None]:
import numpy as np
np.unique(np.array(talafeef))

array(['token_in', 'token_out'], dtype='<U9')

In [None]:
len(talafeef)

591610

In [None]:
vocab.get_vocab_size

<bound method Vocabulary.get_vocab_size of Vocabulary with namespaces:  token_in, Size: 11421 || token_out, Size: 11421 || Non Padded Namespaces: {'*labels', '*tags'}>

In [None]:
vocab.get_token_to_index_vocabulary('token_in').items()

dict_items([('@@PADDING@@', 0), ('@@UNKNOWN@@', 1), ('و', 2), ('،', 3), ('ب', 4), ('ل', 5), ('.', 6), ('في', 7), ('من', 8), ('على', 9), ('أن', 10), ('الله', 11), ('ف', 12), ('ما', 13), ('لا', 14), ('إلى', 15), ('المادة', 16), ('عن', 17), ('أو', 18), ('هذا', 19), ('التي', 20), ('ذلك', 21), ('مجلس', 22), ('إن', 23), ('كل', 24), ('هو', 25), ('هذه', 26), ('عليه', 27), ('مع', 28), ('الصورة', 29), ('الأمعاء', 30), ('اللهم', 31), ('الذي', 32), ('بعد', 33), ('قد', 34), ('النظام', 35), ('تظهر', 36), ('الوزراء', 37), ('إلا', 38), ('له', 39), ('هي', 40), ('الملك', 41), ('سلم', 42), ('قال', 43), ('كما', 44), ('إذا', 45), ('كان', 46), ('رئيس', 47), ('صلى', 48), ('بين', 49), ('يكون', 50), ('ك', 51), ('المجلس', 52), ('أكثر', 53), ('الدولة', 54), ('أي', 55), ('حيث', 56), ('شكل', 57), ('المنطقة', 58), ('الهيئة', 59), ('غير', 60), ('أنه', 61), ('البطن', 62), ('الشعاعية', 63), ('تكون', 64), ('الذين', 65), ('يا', 66), ('لم', 67), ('العمل', 68), ('خلال', 69), ('عشرة', 70), ('الغاز', 71), ('عليها', 72), ('ح

In [None]:
vocab.get_vocab_size("token_out")

11421

In [None]:
embedding_in.weight.shape

torch.Size([11421, 256])

# 4. 1. Word2Vec (CBOW model)

In [None]:
import numpy as np
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import Counter

In [None]:
data=open("/content/Talafeef--Seg.txt",'r')
talafeef_data = [text for text in data if text.count(' ') >= 2]

In [None]:
talafeef_data[0]

' إذا تم إجراء صورة شعاعية ل البطن و صورة شعاعية ل الصدر ب وضعية الوقوف و مازال تشخيص استرواح الصفاق غير مؤكد ف عندها يجب طلب تصوير مقطعي محوسب ل البطن . يعرض التصوير المقطعي المحوسب المريض ل كمية أكبر من الإشعاع لكنه سيظهر ب وضوح وجود الغاز السهم الأبيض و قد يشخص السبب المستبطن . و حاليا ف إن معظم المرضى غير المستقرين الذين يشك لديهم ب استرواح الصفاق يخضعون مباشرة ل التصوير المقطعي المحوسب . مثال مثال الصورة صورتان شعاعيتان ل البطن متطابقتان تظهران استرواح صفاق كبير . يوجد عرى معوية يحددها خارجيا غاز من كلا الجانبين ب شكل يتوافق مع علامة . تظهر الصورة اليمنى المناطق التي تكون فيها علامة أكثر وضوحا ب لون بني و فيروزي لمعة الأمعاء معلمة ب لون بني أما الغاز الحر الذي يحدد جدار الأمعاء خارجيا معلم ب لون فيروزي صورتان شعاعيتان متطابقتان تظهران استرواح صفاق كبير . يوجد عرى معوية يحددها خارجيا غاز من كلا الجانبين ب شكل يتوافق مع علامة تظهر الصورة اليمنى المناطق حيث يكون استرواح الصفاق أكثر وضوحا ب اللون الفيروزي و لمعة الأمعاء معلمة ب اللون البني . يمكنك أيضا أن ترى الغاز يحدد الكبد خارجيا ك

In [None]:
vectorize = Tokenizer()
vectorize.fit_on_texts(talafeef_data)
talafeef_data = vectorize.texts_to_sequences(talafeef_data)
total_vocab = sum(len(s) for s in talafeef_data)
word_count = len(vectorize.word_index) + 1
window_size = 2

In [None]:
word_count

11415

In [None]:
total_vocab

57741

In [None]:
!pip install --upgrade tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting google-auth<3,>=1.6.3
  Downloading google_auth-2.17.3-py2.py3-none-any.whl (178 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.2/178.2 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: google-auth
  Attempting uninstall: google-auth
    Found existing installation: google-auth 1.35.0
    Uninstalling google-auth-1.35.0:
      Successfully uninstalled google-auth-1.35.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-cloud-storage 1.38.0 requires google-auth<2.0dev,>=1.11.0, but you have google-auth 2.17.3 which is incompatible.
google-cloud-core 1.7.3 requires google-auth<2.0dev,>=1.24.0, but you have google-auth 2.17.3 which is incompatible.[0m[31m
[0mSuccessfully installed google-auth-

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K

In [None]:
def cbow_model(data, window_size, total_vocab):
    total_length = window_size*2
    for text in data:
        text_len = len(text)
        for idx, word in enumerate(text):
            context_word = []
            target   = []            
            begin = idx - window_size
            end = idx + window_size + 1
            context_word.append([text[i] for i in range(begin, end) if 0 <= i < text_len and i != idx])
            target.append(word)
            contextual = pad_sequences(context_word, maxlen=total_length)
            final_target = np_utils.to_categorical(target, total_vocab)
            yield(contextual, final_target)

In [None]:
model = Sequential()
model.add(Embedding(input_dim=total_vocab, output_dim=100, input_length=window_size*2))
model.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(100,)))
model.add(Dense(total_vocab, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
for i in range(10):
    cost = 0
    for x, y in cbow_model(talafeef_data, window_size, total_vocab):
        cost += model.train_on_batch(x, y)
    print(i, cost)

0 511941.76701672375
1 465041.3886981467
2 418825.4199489064
3 372395.67551187775
4 328930.94624006166
5 290391.46792476973
6 257724.52018575402
7 229016.75827746984
8 203964.11474394135
9 182369.23941060208


In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            5774100   
                                                                 
 lambda (Lambda)             (None, 100)               0         
                                                                 
 dense (Dense)               (None, 57741)             5831841   
                                                                 
Total params: 11,605,941
Trainable params: 11,605,941
Non-trainable params: 0
_________________________________________________________________


# 4. 2. Save CBOW Model 

In [None]:
model.save("cbow_model.h5")

In [None]:
weights = model.get_weights()[0]
w2v_my = {}

for word, index in vectorize.word_index.items():
    w2v_my[word] = weights[index]

np.savez('embeddings.npz', **w2v_my)
embeddings = np.load('embeddings.npz')

In [None]:
embeddings["الله"]

array([ 0.27993688,  0.07120431,  0.26767626,  0.25282174,  1.0951012 ,
        1.6889765 , -0.20246087,  1.7091613 , -0.7632016 ,  1.120734  ,
       -0.4092044 ,  0.5716477 , -0.29057154, -0.08226096, -0.42095208,
        0.08825853,  0.18974511, -0.08490095,  0.7360795 ,  0.14302415,
        0.7060526 ,  1.6797556 , -0.5505946 ,  1.7666436 , -0.21437186,
       -0.63364285,  0.47310987, -0.22869018,  0.29482394,  0.49904194,
       -0.21884209, -1.3027201 , -0.32482287, -1.746755  ,  1.1081969 ,
       -0.1649091 ,  0.3653062 ,  0.18415271, -0.93456995,  0.7247719 ,
       -0.5408487 , -0.7594056 ,  0.672341  , -0.1186127 , -0.5444066 ,
       -0.22125424, -0.6010884 ,  0.60653484,  0.84577733,  0.14000566,
        1.457906  , -0.503109  ,  0.39016327,  0.7339899 , -0.09890876,
        0.52188367,  0.08965507, -0.66726094,  1.3173606 , -0.5732715 ,
       -1.1861007 ,  0.5143744 ,  0.46508154,  1.6168398 , -1.7130787 ,
       -0.691999  ,  0.24741264,  0.39214978, -0.03980275, -0.13

In [None]:
def get_related(token: str, num_synonyms: int = 10):
    """Given a token, return a list of top N most similar words to the token."""
    token_vec = w2v_my[token]
    sims = Counter()

    for w,vec in w2v_my.items():
        sim = np.dot(token_vec, vec)/(np.linalg.norm(token_vec) * np.linalg.norm(vec))
        sims[w] = sim

    return sims.most_common(num_synonyms)

In [None]:
get_related("أشعة",10)

[('أشعة', 1.0000001),
 ('موجات', 0.5669669),
 ('رسم', 0.4941221),
 ('صوتية', 0.46655378),
 ('يوصف', 0.44993737),
 ('تفرج', 0.4473002),
 ('صالحا', 0.43631655),
 ('أجلهما', 0.43050963),
 ('مزيدا', 0.42970288),
 ('الطبية', 0.42583993)]

In [None]:
get_related("النبي",10)

[('النبي', 1.0),
 ('نبيهم', 0.52969635),
 ('رسول', 0.5217098),
 ('الإمام', 0.5195494),
 ('أخرجه', 0.48768666),
 ('كتابه', 0.4745463),
 ('حبيبهم', 0.47273573),
 ('الرحمة', 0.47043207),
 ('الناجح', 0.46889913),
 ('منكن', 0.46131325)]

In [None]:
get_related("الهندسة",10)

[('الهندسة', 1.0000001),
 ('أبل', 0.5488208),
 ('نمو', 0.5333013),
 ('صفها', 0.51157445),
 ('الحديثة', 0.5011609),
 ('الثقافية', 0.49718523),
 ('العنصرية', 0.49681476),
 ('الطيران', 0.49663526),
 ('أدوات', 0.48633814),
 ('إبراز', 0.48501801)]

In [None]:
np.array(list(w2v_my.values())).shape

(11414, 100)

# 5.1. araBERTv02 (pre-trained model)

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import logging
import glob
import os
#from bert_function import bert_processing, get_related
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
! curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
! sudo apt-get install git-lfs
! git lfs install
!  clone https://huggingface.co/aubmindlab/bert-base-arabertv2
! tar -C ./MODEL_NAME -zxvf /content/bert-base-arabertv02/tf1_model.tar.gz

Detected operating system as Ubuntu/focal.
Checking for curl...
Detected curl...
Checking for gpg...
Detected gpg...
Detected apt version as 2.0.9
Running apt-get update... done.
Installing apt-transport-https... done.
Installing /etc/apt/sources.list.d/github_git-lfs.list...done.
Importing packagecloud gpg key... Packagecloud gpg key imported to /etc/apt/keyrings/github_git-lfs-archive-keyring.gpg
done.
Running apt-get update... done.

The repository is setup! You can now install packages.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (3.3.0).
0 upgraded, 0 newly installed, 0 to remove and 33 not upgraded.
Git LFS initialized.
/bin/bash: clone: command not found
tar (child): /content/bert-base-arabertv02/tf1_model.tar.gz: Cannot open: No such file or directory
tar (child): Error is not recoverable: exiting now
tar: Child returned status 2
tar: Error is not recoverable: exiting now


In [None]:
! pip install pyarabic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
! git clone https://github.com/aub-mind/arabert.git

fatal: destination path 'arabert' already exists and is not an empty directory.


In [None]:
! pip install farasapy
from farasa.segmenter import FarasaSegmenter 
from arabert.preprocess import ArabertPreprocessor

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting farasapy
  Downloading farasapy-0.0.14-py3-none-any.whl (11 kB)
Installing collected packages: farasapy
Successfully installed farasapy-0.0.14


In [None]:
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

arabert_prep = ArabertPreprocessor(model_name=model_name)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


100%|██████████| 241M/241M [00:19<00:00, 12.6MiB/s]




In [None]:
with open("/content/Talafeef--Seg.txt",'r', encoding='utf-8') as f:
    dataset_tokens = f.read()

tokens = dataset_tokens.split(" ")
number_of_tokens = len(tokens)

In [None]:
preprocessed_text = arabert_prep.preprocess(dataset_tokens)

In [None]:
preprocessed_text.split(".")[0]

'إذا تم إجراء صور +ة شعاعي +ة ل ال+ بطن و صور +ة شعاعي +ة ل ال+ صدر ب وضعي +ة ال+ وقوف و مازال تشخيص استرواح ال+ صفاق غير مؤكد ف عند +ها يجب طلب تصوير مقطع +ي محوسب ل ال+ بطن '

In [None]:
segmenter = FarasaSegmenter(interactive=True)
segmented = segmenter.segment(preprocessed_text)



In [None]:
segmented

'إذا تم إجراء صور + +ة شعاعي + +ة ل ال + بطن و صور + +ة شعاعي + +ة ل ال + صدر ب وضعي + +ة ال + وقوف و مازال تشخيص استرواح ال + صفاق غير مؤكد ف عند + ها يجب طلب تصوير مقطع + ي محوسب ل ال + بطن . يعرض ال + تصوير ال + مقطع + ي ال + محوسب ال + مريض ل كمي + +ة أكبر من ال + إشعاع لكن + ه س + يظهر ب وضوح وجود ال + غاز ال + سهم ال + أبيض و قد يشخص ال + سبب ال + مستبطن . و حالي + ا ف إن معظم ال + مرضى غير ال + مستقر + ين الذين يشك لدي + هم ب استرواح ال + صفاق يخضع + ون مباشر + +ة ل ال + تصوير ال + مقطع + ي ال + محوسب . مثال مثال ال + صور + +ة صورتان شعاعيتان ل ال + بطن متطابقتان تظهر + ان استرواح صفاق كبير . يوجد عرى معوي + +ة يحدد + ها خارجي + ا غاز من كلا ال + جانب + ين ب شكل يتوافق مع علام + +ة . تظهر ال + صور + +ة ال + يمنى ال + مناطق التي تكون في + ها علام + +ة أكثر وضوح + ا ب لون بني و فيروز + ي ل + مع + +ة ال + أمعاء معلم + +ة ب لون بني أما ال + غاز ال + حر الذي يحدد جدار ال + أمعاء خارجي + ا معلم ب لون فيروز + ي صورتان شعاعيتان متطابقتان تظهر + ان استرواح صفاق كبير . يوجد عرى معوي + +ة 

In [None]:
splitted_text = preprocessed_text.split(".")
marked_text=''
for sent in splitted_text:
  marked_text = marked_text +  "[CLS] " + sent + " [SEP]"

# Split the sentence into tokens.
#tokenized_text = tokenizer.tokenize(marked_text)
tokenized_text = tokenizer.tokenize(marked_text)
# Map the token strings to their vocabulary indeces.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Display the words with their indeces.
#for tup in zip(tokenized_text, indexed_tokens):
  #print('{:<12} {:>6,}'.format(tup[0], tup[1]))

segments_ids=[]
id=0
for token in tokenized_text:
  if token != '[CLS]':
    segments_ids.append(id)
  else:
    segments_ids.append(id)
    id = id + 1
print(len(indexed_tokens))

Token indices sequence length is longer than the specified maximum sequence length for this model (92237 > 512). Running this sequence through the model will result in indexing errors


92237


In [None]:
tokenized_text[:10]

['[CLS]', 'إذا', 'تم', 'إجراء', 'صور', '+ة', 'شعاعي', '+ة', 'ل', 'ال+']

In [None]:
len(segments_ids), len(tokenized_text)

(92237, 92237)

In [None]:
indexed_tokens[:10], segments_ids[:10],tokenized_text[:10]

([33, 985, 408, 1061, 947, 12, 25050, 12, 162, 20],
 [0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 ['[CLS]', 'إذا', 'تم', 'إجراء', 'صور', '+ة', 'شعاعي', '+ة', 'ل', 'ال+'])

In [None]:
def bert_processing (model, segments_ids, indexed_tokens, tokens_in_chunk):
		
	tensor_input_ids = torch.tensor(indexed_tokens).unsqueeze(0)
	segments_tensor = torch.tensor(segments_ids).unsqueeze(0)
	output = model(tensor_input_ids,segments_tensor)

	hidden_states = output[0][0]

	token_vecs_sum = []

	for token in hidden_states.detach().numpy():
		token_vecs_sum.append(token)

		# `token` is a [12 x 768] tensor

		# Sum the vectors from the last four layers.
		#sum_vec = torch.sum(token[-4:], dim=0)
		
		# Use `sum_vec` to represent `token`.
		#token_vecs_sum.append(sum_vec)

	#print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))


	token_vecs_cat_array=np.empty((tokens_in_chunk,768),float)
	for t in token_vecs_sum:
	  np.append(token_vecs_cat_array,np.array(t))
	
	#token_vecs_cat_array.shape
	return token_vecs_cat_array


In [None]:
final_token_vecs_array = np.empty((len(indexed_tokens),768),float)

for i in range(0,len(segments_ids),512):
    x=i
    chunk_segments_ids = segments_ids[x:x+512]
    chunk_indexed_tokens = indexed_tokens[x:x+512]
    chunk_token_vecs = bert_processing(model=model, segments_ids=chunk_segments_ids,
                                       indexed_tokens=chunk_indexed_tokens, 
                                       tokens_in_chunk=len(chunk_indexed_tokens))
    #np.append(final_token_vecs_array, np.array(chunk_token_vecs))
    final_token_vecs_array[x:x+512] = chunk_token_vecs

In [None]:
final_token_vecs_array.shape

(92237, 768)

In [None]:
from collections import Counter
def get_related(token: str, num_synonyms: int = 10, 
                tokenized_text = tokenized_text ,final_token_vecs_array=final_token_vecs_array):
    """Given a token, return a list of top N most similar words to the token."""
    token_vec = final_token_vecs_array[tokenized_text.index(token)]
    sims = Counter()

    for i in range (final_token_vecs_array.shape[0]):
        sim = np.dot(token_vec, final_token_vecs_array[i])/(np.linalg.norm(token_vec) * np.linalg.norm(final_token_vecs_array[i]))
        sims[tokenized_text[i]] = sim

    return sims.most_common(num_synonyms)

# 5.2. Save BERT model  

In [None]:
#save token vecs
from numpy import asarray
from numpy import savez_compressed 
savez_compressed('token_vecs_cat_array.npz', final_token_vecs_array)

In [None]:
#test if saved token vec saving done right (it should return array([[ True,  True,  True, ..., )
from numpy import load
token_vecs_cat_array2 = load('token_vecs_cat_array.npz')
token_vecs_cat_array2==final_token_vecs_array

data= load('token_vecs_cat_array.npz')
token_vecs_cat_array2 = data['arr_0']  
token_vecs_cat_array2==final_token_vecs_array

  token_vecs_cat_array2==final_token_vecs_array


array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [None]:
#save tokenized text
import pickle
open_file = open("tokenized_text.pkl", "wb")
pickle.dump(tokenized_text, open_file)
open_file.close()

In [None]:
#check if tokenized text saving done right (should return True)
open_file = open("tokenized_text.pkl", "rb")
tokenized_text2 = pickle.load(open_file)
open_file.close()

tokenized_text2==tokenized_text 

True

In [None]:
#save tokenized vec (method 2)
import pickle
open_file = open("token_vecs_cat_array.pkl", "wb")
pickle.dump(final_token_vecs_array, open_file)
open_file.close() 

In [None]:
#check if tokenized text saving done right (method 2)
open_file = open("token_vecs_cat_array.pkl", "rb")
tokenized_vec2 = pickle.load(open_file)
open_file.close()

tokenized_vec2==final_token_vecs_array 

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [None]:
get_related("مقطع",10)

  sim = np.dot(token_vec, final_token_vecs_array[i])/(np.linalg.norm(token_vec) * np.linalg.norm(final_token_vecs_array[i]))


[('شعاعي', nan),
 ('بطن', nan),
 ('تم', nan),
 ('ال+', nan),
 ('صور', nan),
 ('+ة', nan),
 ('[CLS]', nan),
 ('ل', nan),
 ('إجراء', nan),
 ('إذا', nan)]

# 5.1. araBERTv02 (pre-trained model)

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import logging
import glob
import os
import matplotlib.pyplot as plt

In [None]:
! curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
! sudo apt-get install git-lfs
! git lfs install
!  clone https://huggingface.co/aubmindlab/bert-base-arabertv2
! tar -C ./MODEL_NAME -zxvf /content/bert-base-arabertv02/tf1_model.tar.gz

Detected operating system as Ubuntu/focal.
Checking for curl...
Detected curl...
Checking for gpg...
Detected gpg...
Detected apt version as 2.0.9
Running apt-get update... done.
Installing apt-transport-https... done.
Installing /etc/apt/sources.list.d/github_git-lfs.list...done.
Importing packagecloud gpg key... Packagecloud gpg key imported to /etc/apt/keyrings/github_git-lfs-archive-keyring.gpg
done.
Running apt-get update... done.

The repository is setup! You can now install packages.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
git-lfs is already the newest version (3.3.0).
0 upgraded, 0 newly installed, 0 to remove and 33 not upgraded.
Git LFS initialized.
/bin/bash: clone: command not found
tar (child): /content/bert-base-arabertv02/tf1_model.tar.gz: Cannot open: No such file or directory
tar (child): Error is not recoverable: exiting now
tar: Child returned status 2
tar: Error is not recoverable: exiting now


In [None]:
! pip install pyarabic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
! git clone https://github.com/aub-mind/arabert.git

fatal: destination path 'arabert' already exists and is not an empty directory.


In [None]:
model_name = "aubmindlab/bert-base-arabertv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

arabert_prep = ArabertPreprocessor(model_name=model_name)

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
with open("/content/Talafeef--Seg.txt",'r', encoding='utf-8') as f:
    dataset_tokens = f.read()

tokens = dataset_tokens.split(" ")
number_of_tokens = len(tokens)

In [None]:
preprocessed_text = arabert_prep.preprocess(dataset_tokens)

In [None]:
preprocessed_text.split(".")[0]

'إذا تم إجراء صور +ة شعاعي +ة ل ال+ بطن و صور +ة شعاعي +ة ل ال+ صدر ب وضعي +ة ال+ وقوف و مازال تشخيص استرواح ال+ صفاق غير مؤكد ف عند +ها يجب طلب تصوير مقطع +ي محوسب ل ال+ بطن '

In [None]:
splitted_text = preprocessed_text.split(".")
marked_text=''
for sent in splitted_text:
  marked_text = marked_text +  "[CLS] " + sent + " [SEP]"

tokenized_text = tokenizer.tokenize(marked_text)
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

segments_ids=[]
id=0
for token in tokenized_text:
  if token != '[CLS]':
    segments_ids.append(id)
  else:
    segments_ids.append(id)
    id = id + 1
print(len(indexed_tokens))

Token indices sequence length is longer than the specified maximum sequence length for this model (92237 > 512). Running this sequence through the model will result in indexing errors


92237


In [None]:
tokenized_text[:10]

['[CLS]', 'إذا', 'تم', 'إجراء', 'صور', '+ة', 'شعاعي', '+ة', 'ل', 'ال+']

In [None]:
len(segments_ids), len(tokenized_text)

(92237, 92237)

In [None]:
indexed_tokens[:10], segments_ids[:10],tokenized_text[:10]

([33, 985, 408, 1061, 947, 12, 25050, 12, 162, 20],
 [0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 ['[CLS]', 'إذا', 'تم', 'إجراء', 'صور', '+ة', 'شعاعي', '+ة', 'ل', 'ال+'])

In [None]:
def bert_processing (model, segments_ids, indexed_tokens, tokens_in_chunk):
		
	tensor_input_ids = torch.tensor(indexed_tokens).unsqueeze(0)
	segments_tensor = torch.tensor(segments_ids).unsqueeze(0)
	output = model(tensor_input_ids,segments_tensor)

	hidden_states = output[0][0]

	token_vecs_sum = []

	for token in hidden_states.detach().numpy():
		token_vecs_sum.append(token)

		# `token` is a [12 x 768] tensor

		# Sum the vectors from the last four layers.
		#sum_vec = torch.sum(token[-4:], dim=0)
		
		# Use `sum_vec` to represent `token`.
		#token_vecs_sum.append(sum_vec)

	#print ('Shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))


	token_vecs_cat_array=np.empty((tokens_in_chunk,768),float)
	for t in token_vecs_sum:
	  np.append(token_vecs_cat_array,np.array(t))
	
	#token_vecs_cat_array.shape
	return token_vecs_cat_array

In [None]:
final_token_vecs_array = np.empty((len(indexed_tokens),768),float)

for i in range(0,len(segments_ids),512):
    x=i
    chunk_segments_ids = segments_ids[x:x+512]
    chunk_indexed_tokens = indexed_tokens[x:x+512]
    chunk_token_vecs = bert_processing(model=model, segments_ids=chunk_segments_ids,
                                       indexed_tokens=chunk_indexed_tokens, 
                                       tokens_in_chunk=len(chunk_indexed_tokens))
    #np.append(final_token_vecs_array, np.array(chunk_token_vecs))
    final_token_vecs_array[x:x+512] = chunk_token_vecs

In [None]:
final_token_vecs_array.shape

(92237, 768)

In [None]:
from collections import Counter
def get_related(token: str, num_synonyms: int = 10, 
                tokenized_text = tokenized_text ,final_token_vecs_array=final_token_vecs_array):
    """Given a token, return a list of top N most similar words to the token."""
    token_vec = final_token_vecs_array[tokenized_text.index(token)]
    sims = Counter()

    for i in range (final_token_vecs_array.shape[0]):
        sim = np.dot(token_vec, final_token_vecs_array[i])/(np.linalg.norm(token_vec) * np.linalg.norm(final_token_vecs_array[i]))
        sims[tokenized_text[i]] = sim

    return sims.most_common(num_synonyms)

# 5.2. Save BERT model  

In [None]:
#save token vecs
from numpy import asarray
from numpy import savez_compressed 
savez_compressed('token_vecs_cat_array.npz', final_token_vecs_array)

In [None]:
#test if saved token vec saving done right (it should return array([[ True,  True,  True, ..., )
from numpy import load
token_vecs_cat_array2 = load('token_vecs_cat_array.npz')
token_vecs_cat_array2==final_token_vecs_array

data= load('token_vecs_cat_array.npz')
token_vecs_cat_array2 = data['arr_0']  
token_vecs_cat_array2==final_token_vecs_array

  token_vecs_cat_array2==final_token_vecs_array


array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [None]:
#save tokenized text
import pickle
open_file = open("new_tokenized_text.pkl", "wb")
pickle.dump(tokenized_text, open_file)
open_file.close()

In [None]:
#check if tokenized text saving done right (should return True)
open_file = open("new_tokenized_text.pkl", "rb")
tokenized_text2 = pickle.load(open_file)
open_file.close()

tokenized_text2==tokenized_text 

True

In [None]:
#save tokenized vec (method 2)
import pickle
open_file = open("token_vecs_cat_array.pkl", "wb")
pickle.dump(final_token_vecs_array, open_file)
open_file.close() 

In [None]:
#check if tokenized text saving done right (method 2)
open_file = open("token_vecs_cat_array.pkl", "rb")
tokenized_vec2 = pickle.load(open_file)
open_file.close()

tokenized_vec2==final_token_vecs_array 

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [None]:
get_related("مقطع",10)

  sim = np.dot(token_vec, final_token_vecs_array[i])/(np.linalg.norm(token_vec) * np.linalg.norm(final_token_vecs_array[i]))


[('شعاعي', nan),
 ('بطن', nan),
 ('تم', nan),
 ('ال+', nan),
 ('صور', nan),
 ('+ة', nan),
 ('[CLS]', nan),
 ('ل', nan),
 ('إجراء', nan),
 ('إذا', nan)]

# Fine-tuned araBERTv02

In [None]:
!pip install virtualenv

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting virtualenv
  Downloading virtualenv-20.22.0-py3-none-any.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting distlib<1,>=0.3.6
  Downloading distlib-0.3.6-py2.py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.5/468.5 kB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
Collecting filelock<4,>=3.11
  Downloading filelock-3.12.0-py3-none-any.whl (10 kB)
Installing collected packages: distlib, filelock, virtualenv
  Attempting uninstall: filelock
    Found existing installation: filelock 3.0.12
    Uninstalling filelock-3.0.12:
      Successfully uninstalled filelock-3.0.12
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.

In [None]:
!source my_env/bin/activate

In [None]:
!my_env\Scripts\activate

/bin/bash: my_envScriptsactivate: command not found


In [None]:
!pip install fsspec==2021.8.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fsspec==2021.8.1
  Downloading fsspec-2021.8.1-py3-none-any.whl (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.3/119.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2023.4.0
    Uninstalling fsspec-2023.4.0:
      Successfully uninstalled fsspec-2023.4.0
Successfully installed fsspec-2021.8.1


In [None]:
input_file_path = "/content/Talafeef--Seg.txt"
output_file_path = "/content/preprocessed_dataset.txt"

with open(input_file_path, "r", encoding="utf-8") as input_file:
    text = input_file.read()

sentences = text.split(". ")

with open(output_file_path, "w", encoding="utf-8") as output_file:
    for sentence in sentences:
        output_file.write(sentence.strip() + "\n")

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer
from datasets import load_dataset

def prepare_dataset(file_path, tokenizer):
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    dataset = load_dataset("text", data_files={"train": file_path})
    tokenized_dataset = dataset.map(tokenize_function, batched=True, num_proc=4)
    return tokenized_dataset

tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
file_path = "/content/preprocessed_dataset.txt"
tokenized_dataset = prepare_dataset(file_path, tokenizer)

model = AutoModelForMaskedLM.from_pretrained("aubmindlab/bert-base-arabertv02")

training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    save_steps=10_000,
    save_total_limit=2,
    logging_steps=200,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
print("Number of examples in the tokenized dataset:", len(tokenized_dataset["train"]))

Number of examples in the tokenized dataset: 1402


In [None]:
trainer.save_model("fine_tuned_arabertv02")

Saving model checkpoint to fine_tuned_arabertv02
Configuration saved in fine_tuned_arabertv02/config.json
Model weights saved in fine_tuned_arabertv02/pytorch_model.bin
tokenizer config file saved in fine_tuned_arabertv02/tokenizer_config.json
Special tokens file saved in fine_tuned_arabertv02/special_tokens_map.json


In [None]:
!zip -r fine_tuned_arabertv02.zip fine_tuned_arabertv02
!pip install google-colab
from google.colab import files
files.download('fine_tuned_arabertv02.zip')

  adding: fine_tuned_arabertv02/ (stored 0%)
  adding: fine_tuned_arabertv02/config.json (deflated 47%)
  adding: fine_tuned_arabertv02/training_args.bin (deflated 49%)
  adding: fine_tuned_arabertv02/pytorch_model.bin (deflated 7%)
  adding: fine_tuned_arabertv02/special_tokens_map.json (deflated 40%)
  adding: fine_tuned_arabertv02/tokenizer_config.json (deflated 36%)
  adding: fine_tuned_arabertv02/tokenizer.json (deflated 66%)
  adding: fine_tuned_arabertv02/vocab.txt (deflated 65%)
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

def prepare_dataset(examples, tokenizer):
    tokenized_examples = tokenizer(examples, padding="max_length", truncation=True)

    # Convert the tokenized examples to a list of dictionaries
    tokenized_examples_list = [dict(zip(tokenized_examples.keys(), col)) for col in zip(*tokenized_examples.values())]

    # Split the dataset into train, validation, and test sets
    train_size = 0.8
    val_size = 0.1

    train_list, temp_list = train_test_split(tokenized_examples_list, train_size=train_size, shuffle=True, random_state=42)
    val_list, test_list = train_test_split(temp_list, train_size=val_size/(1-train_size), shuffle=True, random_state=42)

    # Convert the lists of dictionaries back to Dataset objects
    train_dataset = Dataset.from_dict({key: [d[key] for d in train_list] for key in tokenized_examples.keys()})
    val_dataset = Dataset.from_dict({key: [d[key] for d in val_list] for key in tokenized_examples.keys()})
    test_dataset = Dataset.from_dict({key: [d[key] for d in test_list] for key in tokenized_examples.keys()})

    return train_dataset, val_dataset, test_dataset

In [None]:
file_path = "/content/preprocessed_dataset.txt"

# Read the text file and split it into a list of examples
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()
examples = text.strip().split('\n')
examples = [example.strip() for example in examples if example.strip()]

print("Number of examples after filtering empty examples:", len(examples))

# Prepare the datasets
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = prepare_dataset(examples, tokenizer)

Number of examples after filtering empty examples: 1425


In [None]:
print("First few examples:", examples[:5])

First few examples: ['إذا تم إجراء صورة شعاعية ل البطن و صورة شعاعية ل الصدر ب وضعية الوقوف و مازال تشخيص استرواح الصفاق غير مؤكد ف عندها يجب طلب تصوير مقطعي محوسب ل البطن', 'يعرض التصوير المقطعي المحوسب المريض ل كمية أكبر من الإشعاع لكنه سيظهر ب وضوح وجود الغاز السهم الأبيض و قد يشخص السبب المستبطن', 'و حاليا ف إن معظم المرضى غير المستقرين الذين يشك لديهم ب استرواح الصفاق يخضعون مباشرة ل التصوير المقطعي المحوسب', 'مثال مثال الصورة صورتان شعاعيتان ل البطن متطابقتان تظهران استرواح صفاق كبير', 'يوجد عرى معوية يحددها خارجيا غاز من كلا الجانبين ب شكل يتوافق مع علامة']


In [None]:
print("Number of examples after splitting by periods:", len(examples))

Number of examples after splitting by periods: 1401


In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

output_dir = "/content/fine_tuned_arabertv02"
model = AutoModelForMaskedLM.from_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(output_dir)

loading configuration file /content/fine_tuned_arabertv02/config.json
Model config BertConfig {
  "_name_or_path": "aubmindlab/bert-base-arabertv02",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 64000
}

loading weights file /content/fine_tuned_arabertv02/pytorch_model.bin
All model checkpoint weights were used when initializing BertForMaskedLM.

All the weights of BertForMaskedLM were initialized from the model checkpoint at /content/fine_tuned_arabertv02.
If y

In [None]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_eval_batch_size=8,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Evaluation *****
  Num examples = 143
  Batch size = 8


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Evaluation results: {'eval_loss': 2.3693408966064453, 'eval_runtime': 7.7236, 'eval_samples_per_second': 18.515, 'eval_steps_per_second': 2.331}


In [None]:
from transformers import pipeline

fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

In [None]:
import torch
from transformers import pipeline
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)

In [None]:
masked_input = "في العصور الوسطى ، كانت [MASK] تعتبر مركزًا للعلم والثقافة والفنون والتجارة ، حيث توفر ملتقى للمفكرين والعلماء والفنانين من مختلف أنحاء العالم."
result = fill_mask(masked_input)
print("Result:", result)

Result: [{'sequence': 'في العصور الوسطى ، كانت المدينة تعتبر للعلم والثقافة والفنون والتجارة ، حيث توفر ملتقى للمفكرين والعلماء والفنانين من مختلف أنحاء العالم.', 'score': 0.12235911190509796, 'token': 1665, 'token_str': 'المدينة'}, {'sequence': 'في العصور الوسطى ، كانت القسطنطينية تعتبر للعلم والثقافة والفنون والتجارة ، حيث توفر ملتقى للمفكرين والعلماء والفنانين من مختلف أنحاء العالم.', 'score': 0.07868682593107224, 'token': 47091, 'token_str': 'القسطنطينية'}, {'sequence': 'في العصور الوسطى ، كانت روما تعتبر للعلم والثقافة والفنون والتجارة ، حيث توفر ملتقى للمفكرين والعلماء والفنانين من مختلف أنحاء العالم.', 'score': 0.07445629686117172, 'token': 7878, 'token_str': 'روما'}, {'sequence': 'في العصور الوسطى ، كانت اسطنبول تعتبر للعلم والثقافة والفنون والتجارة ، حيث توفر ملتقى للمفكرين والعلماء والفنانين من مختلف أنحاء العالم.', 'score': 0.06781637668609619, 'token': 13965, 'token_str': 'اسطنبول'}, {'sequence': 'في العصور الوسطى ، كانت قرطبة تعتبر للعلم والثقافة والفنون والتجارة ، حيث توف