# **DadmaTools:  A Python NLP Library for Persian**
1. Download the toolkit via `pip`

In [None]:
!pip install dadmatools

DadmaTools has different NLP models: *normalizer, tokenizer, lemmatizer, pos tagger, dependency parser, and constituency parser.*

The normalizer can be used with the code below:

In [17]:
from dadmatools.normalizer import Normalizer

# Build a normalizer. Judging by the argument names, the boolean flags toggle
# individual cleaning steps and the replace_*_with arguments give the text
# substituted for each kind of entity.
# NOTE(review): None presumably means "leave this entity unchanged" — confirm
# against the Normalizer documentation.
normalizer = Normalizer(
    full_cleaning=False,
    unify_chars=True,
    refine_punc_spacing=True,
    remove_extra_space=True,
    remove_puncs=False,
    remove_html=False,
    remove_stop_word=False,
    replace_email_with="<EMAIL>",
    replace_number_with="عدد",
    replace_url_with="",
    replace_mobile_number_with=None,
    replace_emoji_with=None,
    replace_home_number_with=None
)

# NOTE(review): the sample sentence contains neither an e-mail address nor a
# URL, and in the recorded output the normalized text is identical to the
# input (the "12" attached to the preceding word is not replaced), so the
# second label below does not describe what this example demonstrates.
text = "من دیروز12 مدرسه رفتم"
print('input text : ', text)
print('output text when replace emails and remove urls : ', normalizer.normalize(text))

input text :  من دیروز12 مدرسه رفتم
output text when replace emails and remove urls :  من دیروز12 مدرسه رفتم


**Other NLP models can be used via the pipeline. Each task has its own abbreviation (for example `lem`, `pos`, `ner`).**

In [3]:
from dadmatools.pipeline.informal2formal.main import Informal2Formal
# NOTE: in the recorded run this cell downloaded the model resources,
# including a 2.30GB `3gram.bin` file, before producing its result.
translator = Informal2Formal()

# Translate an informal (colloquial) Persian sentence and print the result.
print(translator.translate('اینو اگه خواستین میتونین واسه تبدیل تست کنین '))

Downloading file cache/dadmatools/fa_tokenizer.pt: : 639kB [00:01, 493kB/s]                           


3gram.bin: 2.30GB [01:17, 31.9MB/s]
assets.pkl: 3.14MB [00:00, 19.3MB/s]
irregular_verb_mapper.csv: 100%|██████████| 1.57k/1.57k [00:00<00:00, 4.90MB/s]
verbs.csv: 100%|██████████| 39.4k/39.4k [00:00<00:00, 206kB/s]
Model fa_tokenizer exists in cache/dadmatools/fa_tokenizer.pt
 این را اگر خواستید می‌توانید برای تبدیل تست بکنید


In [4]:
import dadmatools.pipeline.language as language

# The tokenizer is the default tool: it is loaded even when it is not listed
# in the pipeline string.
# Several task abbreviations can be combined in one comma-separated string, e.g.:
# pips = 'lem,pos,ner,dep,spellchecker,kasreh,sent,itf'

Downloading file cache/dadmatools/fa_tokenizer.pt: : 639kB [00:02, 242kB/s]                         


In [17]:
# Build a pipeline for the 'lem' (lemmatizer) task. The recorded log shows the
# tokenizer, tagger and multi-word expander being loaded alongside it.
pips = 'lem'
nlp_lem = language.Pipeline(pips)

Loading pretrained XLM-Roberta, this may take a while...
Model fa_tokenizer exists in cache/dadmatools/fa_tokenizer.pt
Loading tokenizer for persian
Loading tagger for persian
Loading multi-word expander for persian
Loading lemmatizer for persian
Active language: persian


In [27]:
# Run the lemmatizer pipeline on a single word. The displayed result is a dict
# with 'sentences' and 'lang'; each token carries a 'lemma' field.
nlp_lem('فهمیدم')

{'sentences': [{'id': 1,
   'tokens': [{'id': 1,
     'text': 'فهمیدم',
     'upos': 'NOUN',
     'xpos': 'N_SING',
     'feats': 'Number=Sing',
     'head': 0,
     'deprel': 'root',
     'lemma': 'فهمیدم'}]}],
 'lang': 'persian'}

In [28]:
# Build a pipeline for the 'pos' (part-of-speech tagger) task.
pips = 'pos'
nlp_pos = language.Pipeline(pips)

Loading pretrained XLM-Roberta, this may take a while...
Model fa_tokenizer exists in cache/dadmatools/fa_tokenizer.pt
Loading tokenizer for persian
Loading tagger for persian
Loading multi-word expander for persian
Active language: persian


In [30]:
# Tag a short sentence. Each token in the displayed result carries
# 'upos', 'xpos', 'feats', 'head' and 'deprel' fields.
nlp_pos('ناگهان باران بارید')

{'sentences': [{'id': 1,
   'tokens': [{'id': 1,
     'text': 'ناگهان',
     'upos': 'NOUN',
     'xpos': 'N_SING',
     'feats': 'Number=Sing',
     'head': 0,
     'deprel': 'root'},
    {'id': 2,
     'text': 'باران',
     'upos': 'NOUN',
     'xpos': 'N_SING',
     'feats': 'Number=Sing',
     'head': 1,
     'deprel': 'nsubj'},
    {'id': 3,
     'text': 'بارید',
     'upos': 'NOUN',
     'xpos': 'N_SING',
     'feats': 'Number=Sing',
     'head': 1,
     'deprel': 'root'}]}],
 'lang': 'persian'}

In [34]:
# Build a pipeline for the 'ner' (named-entity recognition) task.
pips = 'ner'
nlp_ner = language.Pipeline(pips)

Loading pretrained XLM-Roberta, this may take a while...
Model fa_tokenizer exists in cache/dadmatools/fa_tokenizer.pt
Loading tokenizer for persian
Loading multi-word expander for persian
Loading NER tagger for persian
Active language: persian


In [41]:
# Run NER on a sentence. Each token in the displayed result carries a 'ner'
# tag (the recorded output contains O, S-PER, B-ORG/E-ORG and B-LOC/E-LOC).
nlp_ner('دکتر احمدی، رئیس دانشگاه تهران، در کنفرانس هوش مصنوعی که در هتل آزادی برگزار شد، سخنرانی کرد.')

{'sentences': [{'id': 1,
   'tokens': [{'id': 1, 'text': 'دکتر', 'ner': 'O'},
    {'id': 2, 'text': 'احمدی', 'ner': 'S-PER'},
    {'id': 3, 'text': '،', 'ner': 'O'},
    {'id': 4, 'text': 'رئیس', 'ner': 'O'},
    {'id': 5, 'text': 'دانشگاه', 'ner': 'B-ORG'},
    {'id': 6, 'text': 'تهران', 'ner': 'E-ORG'},
    {'id': 7, 'text': '،', 'ner': 'O'},
    {'id': 8, 'text': 'در', 'ner': 'O'},
    {'id': 9, 'text': 'کنفرانس', 'ner': 'O'},
    {'id': 10, 'text': 'هوش', 'ner': 'O'},
    {'id': 11, 'text': 'مصنوعی', 'ner': 'O'},
    {'id': 12, 'text': 'که', 'ner': 'O'},
    {'id': 13, 'text': 'در', 'ner': 'O'},
    {'id': 14, 'text': 'هتل', 'ner': 'B-LOC'},
    {'id': 15, 'text': 'آزادی', 'ner': 'E-LOC'},
    {'id': 16, 'text': 'برگزار', 'ner': 'O'},
    {'id': 17, 'text': 'شد', 'ner': 'O'},
    {'id': 18, 'text': '،', 'ner': 'O'},
    {'id': 19, 'text': 'سخنرانی', 'ner': 'O'},
    {'id': 20, 'text': 'کرد', 'ner': 'O'},
    {'id': 21, 'text': '.', 'ner': 'O'}]}],
 'lang': 'persian'}

In [None]:
# Build a pipeline for the 'dep' task (presumably the dependency parser —
# the result of calling it carries 'head' and 'deprel' per token).
# NOTE(review): this cell has no execution count and no recorded output, yet
# `nlp_dep` is used by the next cell — re-run the notebook top to bottom.
pips = 'dep'
nlp_dep = language.Pipeline(pips)

In [6]:
# Parse a three-word phrase. Each token in the displayed result carries its
# 'head' index and 'deprel' relation label.
nlp_dep('حراست دانشگاه سمنان')

{'sentences': [{'id': 1,
   'tokens': [{'id': 1,
     'text': 'حراست',
     'upos': 'NOUN',
     'xpos': 'N_SING',
     'feats': 'Number=Sing',
     'head': 0,
     'deprel': 'root'},
    {'id': 2,
     'text': 'دانشگاه',
     'upos': 'NOUN',
     'xpos': 'N_SING',
     'feats': 'Number=Sing',
     'head': 1,
     'deprel': 'nmod:poss'},
    {'id': 3,
     'text': 'سمنان',
     'upos': 'NOUN',
     'xpos': 'N_SING',
     'feats': 'Number=Sing',
     'head': 1,
     'deprel': 'nmod:poss'}]}],
 'lang': 'persian'}

In [9]:
# Build a pipeline for the 'spellchecker' task. In the recorded run this
# downloaded the spell-checker resources (e.g. a 959M state_dict_nevise.pt).
pips = 'spellchecker'
nlp_spell = language.Pipeline(pips)

Loading pretrained XLM-Roberta, this may take a while...
Model fa_tokenizer exists in cache/dadmatools/fa_tokenizer.pt
Loading tokenizer for persian
Loading multi-word expander for persian


config.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

state_dict_nevise.pt:   0%|          | 0.00/959M [00:00<?, ?B/s]

vocab.pkl:   0%|          | 0.00/3.80M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

Active language: persian


In [14]:
# Spell-check a sentence containing typos. The displayed result has a
# 'spellchecker' entry holding the original text, the corrected text and the
# list of (misspelled, corrected) word pairs, next to the tokenized sentences.
nlp_spell('دیشپ گذفتار بودم نتونستم به دیرنت بیام')

1it [00:00,  2.76it/s]


{'spellchecker': {'orginal': 'دیشپ گذفتار بودم نتونستم به دیرنت بیام',
  'corrected': 'پیش گرفتار بودم نتونستم به دیرنت بیام',
  'checked_words': [('دیشپ', 'پیش'), ('گذفتار', 'گرفتار')]},
 'sentences': [{'id': 1,
   'tokens': [{'id': 1, 'text': 'دیشپ'},
    {'id': 2, 'text': 'گذفتار'},
    {'id': 3, 'text': 'بودم'},
    {'id': 4, 'text': 'نتونستم'},
    {'id': 5, 'text': 'به'},
    {'id': 6, 'text': 'دیرنت'},
    {'id': 7, 'text': 'بیام'}]}],
 'lang': 'persian'}

In [10]:
# Build a pipeline for the 'kasreh' task (the log reports
# "Loading Kasreh tagger for persian").
pips = 'kasreh'
nlp_kasreh = language.Pipeline(pips)

persian.kasreh.mdl:   0%|          | 0.00/10.4M [00:00<?, ?B/s]

persian.kasreh-vocab.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Loading pretrained XLM-Roberta, this may take a while...
Model fa_tokenizer exists in cache/dadmatools/fa_tokenizer.pt
Loading tokenizer for persian
Loading multi-word expander for persian
Loading Kasreh tagger for persian
Active language: persian


In [21]:
# Run the kasreh tagger on a sentence. Each token in the displayed result
# carries a 'kasreh' label ('S-kasreh' or 'O' in the recorded output).
nlp_kasreh('کلید در را برداشتم و به حراست دانشگاه علم و صنعت  تحویل دادم.')

{'sentences': [{'id': 1,
   'tokens': [{'id': 1, 'text': 'کلید', 'kasreh': 'O'},
    {'id': 2, 'text': 'در', 'kasreh': 'O'},
    {'id': 3, 'text': 'را', 'kasreh': 'O'},
    {'id': 4, 'text': 'برداشتم', 'kasreh': 'O'},
    {'id': 5, 'text': 'و', 'kasreh': 'O'},
    {'id': 6, 'text': 'به', 'kasreh': 'O'},
    {'id': 7, 'text': 'حراست', 'kasreh': 'S-kasreh'},
    {'id': 8, 'text': 'دانشگاه', 'kasreh': 'O'},
    {'id': 9, 'text': 'علم', 'kasreh': 'O'},
    {'id': 10, 'text': 'و', 'kasreh': 'O'},
    {'id': 11, 'text': 'صنعت', 'kasreh': 'O'},
    {'id': 12, 'text': 'تحویل', 'kasreh': 'O'},
    {'id': 13, 'text': 'دادم', 'kasreh': 'O'},
    {'id': 14, 'text': '.', 'kasreh': 'O'}]}],
 'lang': 'persian'}

In [22]:
# Build a pipeline for the 'sent' task (its result is read from the
# 'sentiment' key in the next cell). In the recorded run this downloaded
# several model files, including a 1.11G pytorch_model.bin.
pips = 'sent'
nlp_sent = language.Pipeline(pips)

persian.sent.mdl:   0%|          | 0.00/10.4M [00:00<?, ?B/s]

Loading pretrained XLM-Roberta, this may take a while...
Model fa_tokenizer exists in cache/dadmatools/fa_tokenizer.pt
Loading tokenizer for persian
Loading multi-word expander for persian


config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Active language: persian


In [28]:
# Analyse a sentence and show only the 'sentiment' entry of the result:
# a list of {'label', 'score'} dicts in the recorded output.
nlp_sent('با اینکه کتاب بسیار خوب نگارش شده بود و از نظر خیلی‌ها شاهکاری در ادبیات بود، نتوانستم با آن ارتباط بگیرم.')['sentiment']

[{'label': 'positive', 'score': 0.7725887298583984}]

In [29]:
# Build a pipeline for the 'itf' task (presumably informal-to-formal: the
# recorded run downloads the same 3gram.bin / verbs.csv resources as the
# Informal2Formal translator — confirm).
pips = 'itf'
nlp_itf = language.Pipeline(pips)

Loading pretrained XLM-Roberta, this may take a while...
Model fa_tokenizer exists in cache/dadmatools/fa_tokenizer.pt
Loading tokenizer for persian
Loading multi-word expander for persian
3gram.bin: 2.30GB [02:12, 18.6MB/s]
assets.pkl: 3.14MB [00:01, 2.60MB/s]
irregular_verb_mapper.csv: 100%|██████████| 1.57k/1.57k [00:00<00:00, 546kB/s]
verbs.csv: 100%|██████████| 39.4k/39.4k [00:00<00:00, 926kB/s]
Model fa_tokenizer exists in cache/dadmatools/fa_tokenizer.pt
Active language: persian


In [32]:
# Run the 'itf' pipeline on an informal sentence. The displayed result has an
# 'itf' entry with the rewritten text, next to the tokenized sentences.
nlp_itf('خیلی خستم نمیتونم کار کنم.')

{'itf': ' خیلی خستم نمی\u200cتوانم کار بکنم .',
 'sentences': [{'id': 1,
   'tokens': [{'id': 1, 'text': 'خیلی'},
    {'id': 2, 'text': 'خستم'},
    {'id': 3, 'text': 'نمیتونم'},
    {'id': 4, 'text': 'کار'},
    {'id': 5, 'text': 'کنم'},
    {'id': 6, 'text': '.'}]}],
 'lang': 'persian'}

In [5]:
# Build one pipeline that runs every task at once. `nlp` must be created here:
# no earlier cell defines it, so on a fresh kernel `nlp(text)` would raise
# NameError. Task abbreviations are combined in a comma-separated string.
pips = 'lem,pos,ner,dep,spellchecker,kasreh,sent,itf'
nlp = language.Pipeline(pips)

# Run the full pipeline on two sentences. The result is a dict holding the
# document-level entries (e.g. 'spellchecker', 'itf') and, per token, the
# annotations of the token-level tasks ('lemma', 'ner', 'kasreh', ...).
text = 'من صادق جعفری‌زاده به عنوان توسعه‌دهنده دادماتولز از شرکت دادماتک هستم. من به لوزامبورگ خواهم رفت.'
doc = nlp(text)
doc

1it [00:00,  2.29it/s]


{'spellchecker': {'orginal': 'من صادق جعفری\u200cزاده به عنوان توسعه\u200cدهنده دادماتولز از شرکت دادماتک هستم. من به لوزامبورگ خواهم رفت.',
  'corrected': 'من صادق جعفری\u200cزاده به عنوان توسعه\u200cدهنده دادماتولز از شرکت دادماتک هستم.من به لوکزامبورگ خواهم رفت.',
  'checked_words': [('لوزامبورگ', 'لوکزامبورگ')]},
 'itf': ' من صادق جعفری زاده به عنوان توسعه\u200cدهنده دادماتولز از شرکت دادماتک هستم . من به لوزامبورگ خواهم رفت .',
 'sentences': [{'id': 1,
   'tokens': [{'id': 1,
     'text': 'من',
     'upos': 'PRON',
     'xpos': 'PRO',
     'feats': 'Number=Sing|Person=1|PronType=Prs',
     'head': 2,
     'deprel': 'nsubj',
     'lemma': 'من',
     'ner': 'O',
     'kasreh': 'O'},
    {'id': 2,
     'text': 'صادق',
     'upos': 'NOUN',
     'xpos': 'N_SING',
     'feats': 'Number=Sing',
     'head': 0,
     'deprel': 'root',
     'lemma': 'صادق',
     'ner': 'B-PER',
     'kasreh': 'S-kasreh'},
    {'id': 3,
     'text': 'جعفری\u200cزاده',
     'upos': 'NOUN',
     'xpos': 'N_SING

# Loading Persian NLP Datasets

In [6]:
from dadmatools.datasets import get_all_datasets_info, get_dataset_info
from dadmatools.datasets import ARMAN
from dadmatools.datasets import TEP
from dadmatools.datasets import PerSentLexicon
from dadmatools.datasets import FaSpell
from dadmatools.datasets import WikipediaCorpus
from dadmatools.datasets import PersianNer
from dadmatools.datasets import PersianNews
from dadmatools.datasets import PnSummary
from dadmatools.datasets import FarsTail
from dadmatools.datasets import SnappfoodSentiment
from dadmatools.datasets import get_all_datasets_info
from dadmatools.datasets import Peyma
from dadmatools.datasets import PerUDT
from dadmatools.datasets import PersianTweets
from pprint import pprint

In [7]:
# Pretty-print the metadata of the available datasets, restricted to the
# given tasks. Each entry shows description, filenames, size, splits, task
# and version.
pprint(get_all_datasets_info(tasks=['NER', 'Sentiment-Analysis']))

{'ARMAN': {'description': 'ARMAN dataset holds 7,682 sentences with 250,015 '
                          'sentences tagged over six different classes.\n'
                          '\n'
                          'Organization\n'
                          'Location\n'
                          'Facility\n'
                          'Event\n'
                          'Product\n'
                          'Person',
           'filenames': ['train_fold1.txt',
                         'train_fold2.txt',
                         'train_fold3.txt',
                         'test_fold1.txt',
                         'test_fold2.txt',
                         'test_fold3.txt'],
           'name': 'ARMAN',
           'size': {'test': 7680, 'train': 15361},
           'splits': ['train', 'test'],
           'task': 'NER',
           'version': '1.0.0'},
 'PersianNer': {'description': 'source: '
                               'https://github.com/Text-Mining/Persian-NER',
                'filenames'

In [8]:
# Pretty-print the metadata of a single dataset, looked up by name.
pprint(get_dataset_info('PerUDT'))

{'description': 'The Persian Universal Dependency Treebank (PerUDT) is the '
                'result of automatic coversion of Persian Dependency Treebank '
                '(PerDT) with extensive manual corrections',
 'filenames': ['fa_perdt-ud-train.conllu',
               'fa_perdt-ud-dev.conllu',
               'fa_perdt-ud-test.conllu'],
 'name': 'PerUDT',
 'size': {'dev': 1456, 'test': 1455, 'train': 26196},
 'splits': ['train', 'test', 'dev'],
 'task': 'Treebank',
 'version': '1.0.0'}


In [9]:
# Load the Wikipedia corpus, then print its length, one sample and its info.
# NOTE(review): the recorded run failed with gdown's FileURLRetrievalError
# right after the header was printed (the Google Drive file could not be
# retrieved), so none of the lines after the header produced output. It is
# not visible whether the constructor or the first access to `wiki.data`
# triggered the download.
print('*** WikipediaCorpus dataset ****')
print()
wiki = WikipediaCorpus()
print('len data ', len(wiki.data))
print()
# `wiki.data` is consumed as an iterator here.
print('sample: ', next(wiki.data))
print()
print('****** dataset details:********\n ')
print(wiki.info)

*** WikipediaCorpus dataset ****



FileURLRetrievalError: Failed to retrieve file url:

	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses.
	Check FAQ in https://github.com/wkentaro/gdown?tab=readme-ov-file#faq.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=1jHje8Q07tQWEpt8cEpFR_TOuqjFs79Vb

but Gdown can't. Please check connections and permissions.

In [10]:
# Load the ARMAN NER dataset (the recorded run downloaded
# ArmanPersoNERCorpus.zip) and inspect it.
arman = ARMAN()
print('**** Arman dataset **** ')
print('splits: ', arman.info.splits)
# Number of items in the training split (15361 in the recorded run).
print(len(arman.train))
# First test item: a list of {'token', 'tag'} dicts, one per token.
print(next(arman.test))

ArmanPersoNERCorpus.zip: 100%|██████████| 1.84M/1.84M [00:00<00:00, 8.70MB/s]
**** Arman dataset **** 
splits:  ['train', 'test']
15361
[{'token': 'به', 'tag': 'O'}, {'token': 'عنوان', 'tag': 'O'}, {'token': 'مثال', 'tag': 'O'}, {'token': 'وقتی', 'tag': 'O'}, {'token': 'نشریات', 'tag': 'O'}, {'token': 'مدافع', 'tag': 'O'}, {'token': 'اصول', 'tag': 'O'}, {'token': 'و', 'tag': 'O'}, {'token': 'ارزشها', 'tag': 'O'}, {'token': 'و', 'tag': 'O'}, {'token': 'منادی', 'tag': 'O'}, {'token': 'انقلاب', 'tag': 'O'}, {'token': 'و', 'tag': 'O'}, {'token': 'اسلام', 'tag': 'O'}, {'token': 'در', 'tag': 'O'}, {'token': 'بالاترین', 'tag': 'O'}, {'token': 'درجه', 'tag': 'O'}, {'token': '،', 'tag': 'O'}, {'token': 'اولین', 'tag': 'O'}, {'token': 'و', 'tag': 'O'}, {'token': 'درشت\u200cترین', 'tag': 'O'}, {'token': 'تیتر', 'tag': 'O'}, {'token': 'نشریه', 'tag': 'O'}, {'token': 'خود', 'tag': 'O'}, {'token': 'را', 'tag': 'O'}, {'token': 'در', 'tag': 'O'}, {'token': 'صدر', 'tag': 'O'}, {'token': 'صفحه', 'tag': 

# Using Pre-Trained Persian Word Embeddings

In [11]:
from dadmatools.embeddings import get_embedding, get_all_embeddings_info, get_embedding_info

In [12]:
# Pretty-print the metadata of every available pre-trained embedding
# (algorithm, corpus, dimension, filename, format, download URL).
# `pprint` comes from the dataset import cell earlier in the notebook.
pprint(get_all_embeddings_info())

{'fasttext-commoncrawl-bin': {'algorithm': 'fasttext',
                              'corpus': 'CommonCrawl',
                              'desc': '',
                              'dim': 300,
                              'filename': 'cc.fa.300.bin',
                              'format': 'bin',
                              'url': 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.bin.gz'},
 'fasttext-commoncrawl-vec': {'algorithm': 'fasttext',
                              'corpus': 'CommonCrawl',
                              'desc': '',
                              'dim': 300,
                              'filename': 'cc.fa.300.vec',
                              'format': 'vec',
                              'url': 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fa.300.vec.gz'},
 'glove-wiki': {'algorithm': 'glove',
                'corpus': 'wikipedia',
                'desc': 'source: https://github.com/Text-Mining',
                'dim': 50,
      

In [None]:
# Pretty-print the metadata of a single embedding, looked up by name.
pprint(get_embedding_info('glove-wiki'))

{'algorithm': 'glove',
 'corpus': 'wikipedia',
 'desc': 'source: https://github.com/Text-Mining',
 'dim': 50,
 'filename': 'vectors.txt',
 'format': 'txt',
 'url': 'https://raw.githubusercontent.com/Text-Mining/Persian-Wikipedia-Corpus/master/models/glove/vectors.zip'}


In [13]:
# Load the 'glove-wiki' embedding (the recorded run downloaded a 45.9M
# vectors.zip) and print the vector of a single word.
embedding = get_embedding('glove-wiki')
print(embedding['ابزار'])

vectors.zip: 100%|██████████| 45.9M/45.9M [00:01<00:00, 48.0MB/s]
[-0.308614 -0.168945 -2.576352  0.877447 -0.348502  0.582602  0.602845
  0.471903  0.533526  0.906185  0.907475 -0.167968 -0.095735 -0.475923
  0.276284  0.010084 -0.926263 -1.124971 -0.443414 -0.447227  0.259192
  0.078348  0.916888 -0.061847 -0.853357  0.996823 -0.26386   0.621702
  0.768682  0.250663  0.358242  0.571274 -0.321239  0.012563 -0.567481
  0.560345 -0.206234 -0.187835 -0.665903  0.234979 -0.442619  0.164727
 -0.262    -0.172979 -0.393394 -0.474647  0.480312  1.106502  0.767303
  0.046918]


In [14]:
# Print the single vector that the embedding produces for a multi-word text.
print(embedding.embedding_text('ابزار پردازش متن فارسی'))

[ 8.2652763e-02  3.8418624e-01 -1.8762367e+00 -8.6866260e-02
  3.6461627e-01  7.5215775e-01  3.6994025e-01  4.9959701e-01
  1.2264743e-02  3.3335799e-01  5.5867076e-01  3.5873100e-01
  4.2627126e-01 -8.8378501e-01 -1.2670399e-01 -7.0495725e-01
 -6.2538046e-01 -5.5862820e-01 -3.2012752e-01 -1.8887758e-02
  2.8124401e-01  1.6167176e-01  5.9974694e-01  3.4806246e-01
 -1.4647543e-03  7.3103124e-01  1.9454075e-01  3.4274727e-01
  5.1055348e-01  5.3316355e-01  5.8826029e-01  1.2634257e+00
 -1.2206910e+00 -4.0682977e-01 -2.4609923e-01  6.5093577e-01
 -2.5686526e-01 -4.0690476e-01  4.8100728e-01  4.8069999e-02
 -6.2497050e-01 -2.3815494e-02  2.1647224e-01 -2.1010575e-01
 -8.5227352e-01 -4.0755576e-01  8.1856251e-02  1.1975710e+00
  5.1946604e-01  5.7960773e-01]


In [15]:
# Similarity score between two words (a single float in the recorded output).
embedding.similarity('کتاب', 'کتب')

0.77167135

In [16]:
# The 10 nearest words to the query word, displayed as (word, score) tuples
# in descending score order in the recorded output.
embedding.top_nearest('کتاب', 10)

[('کتابی', 0.9353402256965637),
 ('کتاب\u200cهای', 0.859483540058136),
 ('جلد', 0.8522471785545349),
 ('تالیف', 0.8399883508682251),
 ('نوشته', 0.8382429480552673),
 ('مقاله', 0.8335505127906799),
 ('نوشته\u200cاست', 0.8273731470108032),
 ('شرح', 0.8273376822471619),
 ('ترجمه', 0.8256694078445435),
 ('می\u200cنویسد', 0.8014416694641113)]