In [1]:
import pandas as pd
import pyarabic.araby as araby
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sicom\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read Data

In [2]:
# Read ./dataset/val.txt as UTF-8. `val` becomes a list with one string per
# line of the file; readlines() keeps the trailing newline on each line.
with open("./dataset/val.txt", "r", encoding="utf-8") as f:
    val = f.readlines()


def clean_text(text):
    """Normalise one raw line of text before sentence splitting.

    The following is applied, in order:

    1. brackets, braces, parentheses, colons and quote characters are dropped;
    2. every run of digits is dropped (``\\d`` matches any Unicode decimal digit);
    3. the Arabic comma, question mark and semicolon are replaced by their
       ASCII counterparts;
    4. every run of whitespace (newlines included) becomes a single space.

    Whitespace is collapsed last so that removing a number that stood between
    two spaces ("a 12 b") cannot leave a double space behind.  Leading and
    trailing spaces are not stripped.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    text = re.sub(r"[]{}[:()'\"]", "", text)
    text = re.sub(r"\d+", "", text)
    for arabic_mark, ascii_mark in (("،", ","), ("؟", "?"), ("؛", ";")):
        text = text.replace(arabic_mark, ascii_mark)
    # Must run after every removal step; it also takes care of "\n".
    text = re.sub(r"\s+", " ", text)
    return text

# Only the first 100 lines are used: each is cleaned, then all are joined into
# one space-separated string.  NOTE: `val` is rebound here (list of lines ->
# single string) and again on the next line (-> list of sentences), so this
# cell cannot be re-run without first re-running the cell that reads the file.
val = " ".join([clean_text(text) for text in val[:100]])
val = sent_tokenize(val)
sentences = []

# Each sentence returned by sent_tokenize is passed through pyarabic's
# sentence_tokenize and whatever it returns is appended to `sentences`.
for sent in val:
    sentences.extend(araby.sentence_tokenize(sent))
# One row per sentence; the single column gets the default integer label 0.
df = pd.DataFrame(sentences)
print(df.shape)
df.head()

(347, 1)


Unnamed: 0,0
0,قَوْلُهُ وَلَا تُكْرَهُ ضِيَافَتُهُ .
1,الْفَرْقُ الثَّالِثُ وَالثَّلَاثُونَ بَيْنَ قَ...
2,قَوْلُهُ وَهُوَ أَيْ الْبَيْعُ بِالْمَعْنَى ال...
3,إذْ الْمُقَابَلَةُ لَا تَصْدُقُ عَلَى الْعَقْد...
4,وَقَدْ يُجْعَلُ كَلَامُهُ عَلَى حَذْفِ مُضَافٍ...


In [3]:
print(df[0][0])

  قَوْلُهُ وَلَا تُكْرَهُ ضِيَافَتُهُ .


## Pre-Processing

#### Text Cleaning

In [4]:
# import unicodedata
import pickle



def condition(word):
    """Token filter passed to ``araby.tokenize`` through ``conditions=``.

    Returns the result of ``araby.is_arabicrange`` for ``word`` unchanged.
    """
    keep_token = araby.is_arabicrange(word)
    return keep_token


# SECURITY: pickle.load can execute arbitrary code -- only load asset files
# that come from a trusted source.
# The files are opened with `with` so the handles are closed right after loading.
with open("./assets/diacritic2id.pickle", "rb") as diacritic_file:
    diacritic_to_id = pickle.load(diacritic_file)

# NOTE(review): the position of a letter in `arabic_letters` is used as its
# numeric id everywhere below.  The saved outputs of this notebook show
# different ids for the same letters in different runs, so the order produced
# by list(...) is presumably not stable (e.g. if the pickle holds a set) --
# confirm, and consider persisting an explicitly ordered list instead.
with open("./assets/arabic_letters.pickle", "rb") as letters_file:
    arabic_letters = list(pickle.load(letters_file))



def _diacritic_class(mark_ids, none_id):
    """Collapse the ids of the (at most two) marks after a letter into one class id.

    Uses the id layout of ``diacritic_to_id``: 0-5 are the short vowels and
    tanween, 6 is sukun, 7 is shadda and 8-13 are "shadda + vowel", i.e. the
    vowel id plus 8.
    """
    sukun_id, shadda_id, shadda_offset = 6, 7, 8
    if not mark_ids:
        return none_id
    if len(mark_ids) == 1:
        return mark_ids[0]

    first, second = mark_ids
    if shadda_id not in (first, second):
        # Two marks without a shadda have no class of their own.
        return none_id
    # Accept both "shadda, vowel" and "vowel, shadda": canonically ordered
    # Unicode text stores the vowel first.
    vowel = second if first == shadda_id else first
    if vowel < sukun_id:
        return vowel + shadda_offset
    # Shadda paired with sukun or a second shadda: keep the plain shadda class
    # rather than computing an id outside the table.
    return shadda_id


def extract_diacritics(text):
    """Return, for every word, the diacritic class id of each of its letters.

    Args:
        text: iterable of diacritized words (strings).

    Returns:
        A list with one inner list per word.  Each inner list holds one id per
        character of the word that is in ``arabic_letters``; characters that
        are diacritics or not in ``arabic_letters`` contribute nothing.  A
        letter with no mark directly after it gets ``diacritic_to_id[""]``.

    Only the one or two characters directly following a letter are inspected.
    """
    none_id = diacritic_to_id[""]
    letters = set(arabic_letters)  # hoisted: O(1) membership tests

    diacritics_list = []
    for word in text:
        word_list = []
        for idx, char in enumerate(word):
            if char in diacritic_to_id or char not in letters:
                continue
            # Gather the ids of the marks that immediately follow this letter.
            mark_ids = []
            for follower in word[idx + 1: idx + 3]:
                if follower not in diacritic_to_id:
                    break
                mark_ids.append(diacritic_to_id[follower])
            word_list.append(_diacritic_class(mark_ids, none_id))
        diacritics_list.append(word_list)

    return diacritics_list


# "tokenized": tokens of each sentence that pass `condition`, diacritics kept.
df["tokenized"] = df[0].apply(lambda sent: araby.tokenize(sent, conditions=condition))
# "tokenized_cleaned": same call with araby.strip_tashkeel passed as `morphs`.
# NOTE(review): later cells index both columns in parallel, which presumably
# relies on the two token lists lining up one-to-one -- confirm.
df["tokenized_cleaned"] = df[0].apply(lambda sent: araby.tokenize(sent, conditions=condition, morphs=araby.strip_tashkeel))
# "diacritics": one list of diacritic ids per token, from the diacritized tokens.
df["diacritics"] = df["tokenized"].apply(extract_diacritics)
df.head()

Unnamed: 0,0,tokenized,tokenized_cleaned,diacritics
0,قَوْلُهُ وَلَا تُكْرَهُ ضِيَافَتُهُ .,"[قَوْلُهُ, وَلَا, تُكْرَهُ, ضِيَافَتُهُ]","[قوله, ولا, تكره, ضيافته]","[[0, 6, 2, 2], [0, 0, 14], [2, 6, 0, 2], [4, 0..."
1,الْفَرْقُ الثَّالِثُ وَالثَّلَاثُونَ بَيْنَ قَ...,"[الْفَرْقُ, الثَّالِثُ, وَالثَّلَاثُونَ, بَيْن...","[الفرق, الثالث, والثلاثون, بين, قاعدة, تقدم, ا...","[[14, 6, 0, 6, 2], [14, 14, 8, 14, 4, 2], [0, ..."
2,قَوْلُهُ وَهُوَ أَيْ الْبَيْعُ بِالْمَعْنَى ال...,"[قَوْلُهُ, وَهُوَ, أَيْ, الْبَيْعُ, بِالْمَعْن...","[قوله, وهو, أي, البيع, بالمعنى, الثاني, الذي, ...","[[0, 6, 2, 2], [0, 2, 0], [0, 6], [14, 6, 0, 6..."
3,إذْ الْمُقَابَلَةُ لَا تَصْدُقُ عَلَى الْعَقْد...,"[إذْ, الْمُقَابَلَةُ, لَا, تَصْدُقُ, عَلَى, ال...","[إذ, المقابلة, لا, تصدق, على, العقد, فكان, الم...","[[14, 6], [14, 6, 2, 0, 14, 0, 0, 2], [0, 14],..."
4,وَقَدْ يُجْعَلُ كَلَامُهُ عَلَى حَذْفِ مُضَافٍ...,"[وَقَدْ, يُجْعَلُ, كَلَامُهُ, عَلَى, حَذْفِ, م...","[وقد, يجعل, كلامه, على, حذف, مضاف, أي, ذو, مقا...","[[0, 0, 6], [2, 6, 0, 2], [0, 0, 14, 2, 2], [0..."


In [5]:
extract_diacritics(["وَالثَّلَاثُونَ"])

[[0, 14, 14, 8, 0, 14, 2, 14, 0]]

#### Feature extraction

In [6]:
def diacritics_probability_per_char(tokenized_cleaned, diacritics, n_letters=36, n_diacritics=15):
    """Estimate, for every letter id, the relative frequency of each diacritic id.

    Args:
        tokenized_cleaned: indexable of sentences; each sentence is a sequence
            of undiacritized words.
        diacritics: parallel structure where ``diacritics[i][j][k]`` is the
            diacritic id of letter ``k`` of word ``j`` of sentence ``i``.
        n_letters: number of letter ids (rows of the table).
        n_diacritics: number of diacritic ids (entries per row).

    Returns:
        ``dict`` mapping each letter id in ``range(n_letters)`` to a list of
        ``n_diacritics`` values.  For a letter that occurs in the data the
        values are its diacritic counts divided by its number of occurrences;
        for a letter that never occurs the row is left as zeros.

    Raises:
        ValueError: if a character is not in ``arabic_letters``.
        IndexError: if ``diacritics`` is shorter than ``tokenized_cleaned``
            at any level.
    """
    probability = {letter_id: [0] * n_diacritics for letter_id in range(n_letters)}
    counts = [0] * n_letters

    for i, sentence in enumerate(tokenized_cleaned):
        for j, word in enumerate(sentence):
            for k, letter in enumerate(word):
                letter_id = arabic_letters.index(letter)
                probability[letter_id][diacritics[i][j][k]] += 1
                counts[letter_id] += 1

    for letter_id, total in enumerate(counts):
        if total == 0:
            # Letter never seen: dividing would raise ZeroDivisionError.
            continue
        probability[letter_id] = [count / total for count in probability[letter_id]]

    return probability

def create_bef_after(text):
    """Describe every letter of a word by the ids of its two neighbours.

    Args:
        text: an undiacritized word.

    Returns:
        A list with one ``[before_id, after_id]`` pair per letter, where each
        id is the neighbour's position in ``arabic_letters``.  The value 36
        stands in for the missing neighbour at either end of the word, so a
        one-letter word yields ``[[36, 36]]`` and an empty word yields ``[]``.
    """
    boundary_id = 36
    final_position = len(text) - 1

    neighbour_pairs = []
    for position in range(len(text)):
        if position == 0:
            before_id = boundary_id
        else:
            before_id = arabic_letters.index(text[position - 1])

        if position == final_position:
            after_id = boundary_id
        else:
            after_id = arabic_letters.index(text[position + 1])

        neighbour_pairs.append([before_id, after_id])

    return neighbour_pairs

In [7]:
# SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
# `with` closes the file handle as soon as the table is loaded.
with open("./assets/prob_dict.pkl", "rb") as prob_file:
    prob_per_char = pickle.load(prob_file)

# For every letter of every word, look up the entry of `prob_per_char` keyed by
# the letter's position in `arabic_letters` (already a list, so no copy is
# needed for each lookup).
df["prob_per_char"] = df["tokenized_cleaned"].apply(
    lambda words: [[prob_per_char[arabic_letters.index(char)] for char in word] for word in words]
)
df.head()

Unnamed: 0,0,tokenized,tokenized_cleaned,diacritics,prob_per_char
0,قَوْلُهُ وَلَا تُكْرَهُ ضِيَافَتُهُ .,"[قَوْلُهُ, وَلَا, تُكْرَهُ, ضِيَافَتُهُ]","[قوله, ولا, تكره, ضيافته]","[[0, 6, 2, 2], [0, 0, 14], [2, 6, 0, 2], [4, 0...","[[[0.3664743069249507, 0.011585082937014268, 0..."
1,الْفَرْقُ الثَّالِثُ وَالثَّلَاثُونَ بَيْنَ قَ...,"[الْفَرْقُ, الثَّالِثُ, وَالثَّلَاثُونَ, بَيْن...","[الفرق, الثالث, والثلاثون, بين, قاعدة, تقدم, ا...","[[14, 6, 0, 6, 2], [14, 14, 8, 14, 4, 2], [0, ...","[[[0.4327186818027109, 0.007387831100967025, 0..."
2,قَوْلُهُ وَهُوَ أَيْ الْبَيْعُ بِالْمَعْنَى ال...,"[قَوْلُهُ, وَهُوَ, أَيْ, الْبَيْعُ, بِالْمَعْن...","[قوله, وهو, أي, البيع, بالمعنى, الثاني, الذي, ...","[[0, 6, 2, 2], [0, 2, 0], [0, 6], [14, 6, 0, 6...","[[[0.3664743069249507, 0.011585082937014268, 0..."
3,إذْ الْمُقَابَلَةُ لَا تَصْدُقُ عَلَى الْعَقْد...,"[إذْ, الْمُقَابَلَةُ, لَا, تَصْدُقُ, عَلَى, ال...","[إذ, المقابلة, لا, تصدق, على, العقد, فكان, الم...","[[14, 6], [14, 6, 2, 0, 14, 0, 0, 2], [0, 14],...","[[[0.25205920929361064, 0.0018512272812441447,..."
4,وَقَدْ يُجْعَلُ كَلَامُهُ عَلَى حَذْفِ مُضَافٍ...,"[وَقَدْ, يُجْعَلُ, كَلَامُهُ, عَلَى, حَذْفِ, م...","[وقد, يجعل, كلامه, على, حذف, مضاف, أي, ذو, مقا...","[[0, 0, 6], [2, 6, 0, 2], [0, 0, 14, 2, 2], [0...","[[[0.4627733688738111, 0.005755134185497062, 0..."


In [8]:
# For every word of every sentence, store the [before_id, after_id] pairs
# produced by create_bef_after.
df["bef_after"] = df["tokenized_cleaned"].apply(lambda x: [create_bef_after(word) for word in x])

In [9]:
df.head()

Unnamed: 0,0,tokenized,tokenized_cleaned,diacritics,prob_per_char,bef_after
0,قَوْلُهُ وَلَا تُكْرَهُ ضِيَافَتُهُ .,"[قَوْلُهُ, وَلَا, تُكْرَهُ, ضِيَافَتُهُ]","[قوله, ولا, تكره, ضيافته]","[[0, 6, 2, 2], [0, 0, 14], [2, 6, 0, 2], [4, 0...","[[[0.3664743069249507, 0.011585082937014268, 0...","[[[36, 31], [6, 19], [31, 3], [19, 36]], [[36,..."
1,الْفَرْقُ الثَّالِثُ وَالثَّلَاثُونَ بَيْنَ قَ...,"[الْفَرْقُ, الثَّالِثُ, وَالثَّلَاثُونَ, بَيْن...","[الفرق, الثالث, والثلاثون, بين, قاعدة, تقدم, ا...","[[14, 6, 0, 6, 2], [14, 14, 8, 14, 4, 2], [0, ...","[[[0.4327186818027109, 0.007387831100967025, 0...","[[[36, 19], [25, 10], [19, 27], [10, 6], [27, ..."
2,قَوْلُهُ وَهُوَ أَيْ الْبَيْعُ بِالْمَعْنَى ال...,"[قَوْلُهُ, وَهُوَ, أَيْ, الْبَيْعُ, بِالْمَعْن...","[قوله, وهو, أي, البيع, بالمعنى, الثاني, الذي, ...","[[0, 6, 2, 2], [0, 2, 0], [0, 6], [14, 6, 0, 6...","[[[0.3664743069249507, 0.011585082937014268, 0...","[[[36, 31], [6, 19], [31, 3], [19, 36]], [[36,..."
3,إذْ الْمُقَابَلَةُ لَا تَصْدُقُ عَلَى الْعَقْد...,"[إذْ, الْمُقَابَلَةُ, لَا, تَصْدُقُ, عَلَى, ال...","[إذ, المقابلة, لا, تصدق, على, العقد, فكان, الم...","[[14, 6], [14, 6, 2, 0, 14, 0, 0, 2], [0, 14],...","[[[0.25205920929361064, 0.0018512272812441447,...","[[[36, 7], [0, 36]], [[36, 19], [25, 11], [19,..."
4,وَقَدْ يُجْعَلُ كَلَامُهُ عَلَى حَذْفِ مُضَافٍ...,"[وَقَدْ, يُجْعَلُ, كَلَامُهُ, عَلَى, حَذْفِ, م...","[وقد, يجعل, كلامه, على, حذف, مضاف, أي, ذو, مقا...","[[0, 0, 6], [2, 6, 0, 2], [0, 0, 14, 2, 2], [0...","[[[0.4627733688738111, 0.005755134185497062, 0...","[[[36, 6], [31, 13], [6, 36]], [[36, 22], [20,..."


#### Feature Concatenation

In [10]:
import numpy as np
def concat_features(tokenized_cleaned, prob, bef_after):
    """Assemble one flat feature vector per letter for every word of a sentence.

    Args:
        tokenized_cleaned: the undiacritized words of one sentence.
        prob: ``prob[w][l]`` is the sequence of values appended after the
            one-hot part for letter ``l`` of word ``w``.
        bef_after: ``bef_after[w][l]`` is the sequence of values appended last
            for letter ``l`` of word ``w``.

    Returns:
        A list (one entry per word) of lists (one entry per letter) of feature
        lists.  Each feature list is a 36-slot one-hot encoding of the letter's
        position in ``arabic_letters``, followed by the ``prob`` values and
        then the ``bef_after`` values for that letter.
    """
    sentence_features = []
    for word_pos, word in enumerate(tokenized_cleaned):
        letters_of_word = []
        for letter_pos, letter in enumerate(word):
            one_hot = np.zeros(36)
            one_hot[arabic_letters.index(letter)] = 1
            letters_of_word.append(
                [*one_hot, *prob[word_pos][letter_pos], *bef_after[word_pos][letter_pos]]
            )
        sentence_features.append(letters_of_word)
    return sentence_features

# Row-wise apply (axis=1): build the per-letter feature vectors of each
# sentence from its three per-letter columns.
df["features"] = df.apply(lambda x: concat_features(x["tokenized_cleaned"], x["prob_per_char"], x["bef_after"]), axis=1)
df.head()

Unnamed: 0,0,tokenized,tokenized_cleaned,diacritics,prob_per_char,bef_after,features
0,قَوْلُهُ وَلَا تُكْرَهُ ضِيَافَتُهُ .,"[قَوْلُهُ, وَلَا, تُكْرَهُ, ضِيَافَتُهُ]","[قوله, ولا, تكره, ضيافته]","[[0, 6, 2, 2], [0, 0, 14], [2, 6, 0, 2], [4, 0...","[[[0.3664743069249507, 0.011585082937014268, 0...","[[[36, 31], [6, 19], [31, 3], [19, 36]], [[36,...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0..."
1,الْفَرْقُ الثَّالِثُ وَالثَّلَاثُونَ بَيْنَ قَ...,"[الْفَرْقُ, الثَّالِثُ, وَالثَّلَاثُونَ, بَيْن...","[الفرق, الثالث, والثلاثون, بين, قاعدة, تقدم, ا...","[[14, 6, 0, 6, 2], [14, 14, 8, 14, 4, 2], [0, ...","[[[0.4327186818027109, 0.007387831100967025, 0...","[[[36, 19], [25, 10], [19, 27], [10, 6], [27, ...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
2,قَوْلُهُ وَهُوَ أَيْ الْبَيْعُ بِالْمَعْنَى ال...,"[قَوْلُهُ, وَهُوَ, أَيْ, الْبَيْعُ, بِالْمَعْن...","[قوله, وهو, أي, البيع, بالمعنى, الثاني, الذي, ...","[[0, 6, 2, 2], [0, 2, 0], [0, 6], [14, 6, 0, 6...","[[[0.3664743069249507, 0.011585082937014268, 0...","[[[36, 31], [6, 19], [31, 3], [19, 36]], [[36,...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0..."
3,إذْ الْمُقَابَلَةُ لَا تَصْدُقُ عَلَى الْعَقْد...,"[إذْ, الْمُقَابَلَةُ, لَا, تَصْدُقُ, عَلَى, ال...","[إذ, المقابلة, لا, تصدق, على, العقد, فكان, الم...","[[14, 6], [14, 6, 2, 0, 14, 0, 0, 2], [0, 14],...","[[[0.25205920929361064, 0.0018512272812441447,...","[[[36, 7], [0, 36]], [[36, 19], [25, 11], [19,...","[[[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
4,وَقَدْ يُجْعَلُ كَلَامُهُ عَلَى حَذْفِ مُضَافٍ...,"[وَقَدْ, يُجْعَلُ, كَلَامُهُ, عَلَى, حَذْفِ, م...","[وقد, يجعل, كلامه, على, حذف, مضاف, أي, ذو, مقا...","[[0, 0, 6], [2, 6, 0, 2], [0, 0, 14, 2, 2], [0...","[[[0.4627733688738111, 0.005755134185497062, 0...","[[[36, 6], [31, 13], [6, 36]], [[36, 22], [20,...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."


In [11]:
df.features[0][0]

[[0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.3664743069249507,
  0.011585082937014268,
  0.2506089780767892,
  0.00929416541004524,
  0.09916193017051386,
  0.010671615821830413,
  0.14548776244055214,
  0.0003334879944322004,
  0.08128407377334416,
  4.349843405637397e-05,
  0.014803967057185941,
  0.00013049530216912192,
  0.0061042802459111475,
  7.249739009395662e-05,
  0.00394385802111124,
  36,
  31],
 [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.4627733688738111,
  0.005755134185497062,
  0.15102683709941236,
  0.004058884109771612,
  0.13836554189131883,
  0.008723

## Save Processed data

In [12]:
# Write the frame to CSV without the index column.
# NOTE(review): the input read at the top of the notebook is val.txt while this
# file is named "train_..." -- confirm the name is intended.
# NOTE(review): the list-valued columns are presumably stored as their string
# representation and need parsing when read back; the "processed/" directory
# presumably has to exist already -- verify both.
df.to_csv("processed/train_shwya.csv", index=False)

In [40]:
df.head()

Unnamed: 0,0,tokenized,tokenized_cleaned,diacritics,prob_per_char,bef_after,features
0,- حَدَّثَنَا أَحْمَدُ بْنُ عَبْدِ الْمَلِكِ حَ...,"[حَدَّثَنَا, أَحْمَدُ, بْنُ, عَبْدِ, الْمَلِكِ...","[حدثنا, أحمد, بن, عبد, الملك, حدثنا, محمد, بن,...","[[0, 8, 0, 0, 14], [0, 6, 0, 2], [6, 2], [0, 6...","[[[0.2501540319400631, 0.006689093717139853, 0...","[[[36, 11], [2, 0], [11, 29], [0, 24], [29, 36...","[[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
1,/ وَقَالَ خَلِيفَةُ عَاشَ إِحْدَى وَخَمْسِين...,"[وَقَالَ, خَلِيفَةُ, عَاشَ, إِحْدَى, وَخَمْسِي...","[وقال, خليفة, عاش, إحدى, وخمسين, سنة]","[[0, 0, 14, 0], [0, 4, 14, 0, 2], [0, 14, 0], ...","[[[0.35878333333333334, 0.05773333333333333, 0...","[[[36, 18], [15, 24], [18, 13], [24, 36]], [[3...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
2,- حَدَّثَنَا سُفْيَانُ حَدَّثَنِي مُسْلِمُ بْن...,"[حَدَّثَنَا, سُفْيَانُ, حَدَّثَنِي, مُسْلِمُ, ...","[حدثنا, سفيان, حدثني, مسلم, بن, أبي, مريم, عن,...","[[0, 8, 0, 0, 14], [2, 6, 0, 14, 2], [0, 8, 0,...","[[[0.2501540319400631, 0.006689093717139853, 0...","[[[36, 11], [2, 0], [11, 29], [0, 24], [29, 36...","[[[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
3,"لِأَنَّهُ مَعْصِيَةٌ ,","[لِأَنَّهُ, مَعْصِيَةٌ]","[لأنه, معصية]","[[4, 0, 8, 2], [0, 6, 4, 0, 3]]","[[[0.003846002388984419, 0.0, 2.62525760340233...","[[[36, 19], [13, 29], [19, 32], [29, 36]], [[3...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."
4,وَكَذَلِكَ إذَا اسْتَأْجَرَهَا ذِمِّيٌّ مِنْ ذ...,"[وَكَذَلِكَ, إذَا, اسْتَأْجَرَهَا, ذِمِّيٌّ, م...","[وكذلك, إذا, استأجرها, ذمي, من, ذمي]","[[0, 0, 0, 4, 0], [14, 0, 14], [14, 6, 0, 6, 0...","[[[0.35878333333333334, 0.05773333333333333, 0...","[[[36, 9], [15, 23], [9, 13], [23, 9], [13, 36...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0..."


In [3]:
import pickle

# SECURITY: pickle.load can execute arbitrary code -- only load trusted files.
# `with` closes the file handle as soon as the mapping is loaded.
with open("./assets/diacritic2id.pickle", "rb") as diacritic_file:
    diacritic_to_id = pickle.load(diacritic_file)

# Show every key next to its position in the dict.  A space is prepended so the
# combining mark has a base character to render on.  The number displayed is
# the enumeration index, not the value stored in the dict.
[(" " + x, idx) for idx, x in enumerate(diacritic_to_id.keys())]

[(' َ', 0),
 (' ً', 1),
 (' ُ', 2),
 (' ٌ', 3),
 (' ِ', 4),
 (' ٍ', 5),
 (' ْ', 6),
 (' ّ', 7),
 (' َّ', 8),
 (' ًّ', 9),
 (' ُّ', 10),
 (' ٌّ', 11),
 (' ِّ', 12),
 (' ٍّ', 13),
 (' ', 14)]

In [3]:
#### Extra features

In [16]:
import torch
import torch.nn as nn
import numpy as np
from tqdm import tqdm
import pandas as pd
import itertools
from transformers import AutoTokenizer, AutoModel

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

# ignore
class Model(nn.Module):
    """Single-layer LSTM followed by a linear layer applied to every time step.

    NOTE(review): the notebook later unpickles ./models/context_model.pkl and
    accesses its ``.lstm`` attribute, so that file presumably holds an instance
    of this class -- keep the class name and the attribute names ``lstm`` and
    ``linear`` unchanged.

    Args:
        input_size: size of each input vector.
        hidden_size: size of the LSTM hidden state.
        n_classes: size of the linear layer's output.
    """

    def __init__(self, input_size=768, hidden_size=50, n_classes=100):
        super().__init__()

        # No ``dropout=`` argument: nn.LSTM applies dropout only between stacked
        # layers, so with the default single layer a non-zero value has no
        # effect and only triggers a UserWarning at construction time.
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)

        self.linear = nn.Linear(hidden_size, n_classes)

    def forward(self, X, hidden=None):
        """Run the LSTM, then project each of its outputs with the linear layer.

        Args:
            X: input tensor handed to the LSTM (batch-first layout).
            hidden: optional ``(h, c)`` state to start from; ``None`` lets the
                LSTM start from its default initial state.

        Returns:
            ``(final_output, hidden)``: the projected LSTM outputs and the
            ``(h, c)`` state returned by the LSTM.
        """
        final_output, hidden = self.lstm(X, hidden)
        final_output = self.linear(final_output)

        return final_output, hidden

In [17]:
import pickle
# SECURITY: pickle.load can execute arbitrary code -- only load model files
# that come from a trusted source.
# `with` closes the file handle as soon as the model is loaded.
with open("./models/context_model.pkl", "rb") as model_file:
    context_model = pickle.load(model_file).to(device)
context_model.lstm.flatten_parameters()

# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-ca')
embedder = AutoModel.from_pretrained('CAMeL-Lab/bert-base-arabic-camelbert-ca').to(device)

# Registers DataFrame/Series.progress_apply, which is used further down.
tqdm.pandas()

In [18]:
def get_contextualized_embeddings(sent):
    """Pair every word of a sentence with the LSTM hidden state reached at that word.

    Args:
        sent: list of words of one sentence.

    Returns:
        ``[]`` for an empty sentence; otherwise a list of
        ``(word, hidden_state)`` tuples in sentence order, where
        ``hidden_state`` is the first element of the state returned by
        ``context_model`` after it has consumed that word.

    The words are passed to ``tokenizer`` as one padded batch and the vector at
    position 1 of each encoded sequence is used as that word's embedding
    (presumably the first word-piece after the leading special token --
    confirm).  The vectors are then fed to ``context_model`` one at a time,
    carrying the recurrent state from word to word.
    """
    if len(sent) == 0:
        return []

    tokens = tokenizer(sent, return_tensors="pt", padding=True).to(device)

    hidden_layers = []
    # Inference only: run the encoder *and* the LSTM without autograd, so that
    # no computation graph is built and kept alive for the encoder pass.
    with torch.no_grad():
        embeddings = embedder(**tokens).last_hidden_state[:, 1, :]
        hidden = None
        for embedding in embeddings:
            _, hidden = context_model(embedding.unsqueeze(0), hidden)
            hidden_layers.append(hidden[0])

    return list(zip(sent, hidden_layers))

# progress_apply behaves like apply but shows a tqdm progress bar; it is
# available because tqdm.pandas() was called when the models were loaded.
df["tokens_with_context"] = df["tokenized_cleaned"].progress_apply(get_contextualized_embeddings)

  0%|          | 0/347 [00:00<?, ?it/s]

100%|██████████| 347/347 [00:21<00:00, 15.88it/s]


In [20]:
df.head()

Unnamed: 0,0,tokenized,tokenized_cleaned,diacritics,prob_per_char,bef_after,features,tokens_with_context
0,قَوْلُهُ وَلَا تُكْرَهُ ضِيَافَتُهُ .,"[قَوْلُهُ, وَلَا, تُكْرَهُ, ضِيَافَتُهُ]","[قوله, ولا, تكره, ضيافته]","[[0, 6, 2, 2], [0, 0, 14], [2, 6, 0, 2], [4, 0...","[[[0.3664743069249507, 0.011585082937014268, 0...","[[[36, 28], [6, 35], [28, 11], [35, 36]], [[36...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0...","[(قوله, [tensor([-5.7860e-05, 7.6122e-01, 3...."
1,الْفَرْقُ الثَّالِثُ وَالثَّلَاثُونَ بَيْنَ قَ...,"[الْفَرْقُ, الثَّالِثُ, وَالثَّلَاثُونَ, بَيْن...","[الفرق, الثالث, والثلاثون, بين, قاعدة, تقدم, ا...","[[14, 6, 0, 6, 2], [14, 14, 8, 14, 4, 2], [0, ...","[[[0.4327186818027109, 0.007387831100967025, 0...","[[[36, 35], [25, 4], [35, 21], [4, 6], [21, 36...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[(الفرق, [tensor([-8.2446e-06, 7.6140e-01, 2..."
2,قَوْلُهُ وَهُوَ أَيْ الْبَيْعُ بِالْمَعْنَى ال...,"[قَوْلُهُ, وَهُوَ, أَيْ, الْبَيْعُ, بِالْمَعْن...","[قوله, وهو, أي, البيع, بالمعنى, الثاني, الذي, ...","[[0, 6, 2, 2], [0, 2, 0], [0, 6], [14, 6, 0, 6...","[[[0.3664743069249507, 0.011585082937014268, 0...","[[[36, 28], [6, 35], [28, 11], [35, 36]], [[36...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0...","[(قوله, [tensor([-5.7860e-05, 7.6122e-01, 3...."
3,إذْ الْمُقَابَلَةُ لَا تَصْدُقُ عَلَى الْعَقْد...,"[إذْ, الْمُقَابَلَةُ, لَا, تَصْدُقُ, عَلَى, ال...","[إذ, المقابلة, لا, تصدق, على, العقد, فكان, الم...","[[14, 6], [14, 6, 2, 0, 14, 0, 0, 2], [0, 14],...","[[[0.6461516586641, 0.003685920222248958, 0.05...","[[[36, 32], [3, 36]], [[36, 35], [25, 19], [35...","[[[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[(إذ, [tensor([-2.3214e-05, 7.6133e-01, 4.03..."
4,وَقَدْ يُجْعَلُ كَلَامُهُ عَلَى حَذْفِ مُضَافٍ...,"[وَقَدْ, يُجْعَلُ, كَلَامُهُ, عَلَى, حَذْفِ, م...","[وقد, يجعل, كلامه, على, حذف, مضاف, أي, ذو, مقا...","[[0, 0, 6], [2, 6, 0, 2], [0, 0, 14, 2, 2], [0...","[[[0.412407786468069, 0.006399887474505943, 0....","[[[36, 6], [28, 16], [6, 36]], [[36, 34], [5, ...","[[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[(وقد, [tensor([-3.2933e-04, 7.6013e-01, 7.6..."
