In [205]:
import re

def tokenize(text):
    """Split Arabic text into word tokens.

    A token starts with a word character (letter, digit or underscore) and
    continues over further word characters. The Arabic vowel marks
    U+064B..U+0652 are combining characters that the regex word class does
    not match, so they are accepted explicitly inside a token; without that
    a vocalised word would be cut into one token per letter.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens in order of appearance (empty if there are none).
    """
    return re.findall(r'\w[\w\u064B-\u0652]*', text)

def remove_diacritics(text):
    """Unify alef variants and delete vowel marks from Arabic text.

    The three alef forms listed below are first rewritten as the bare alef,
    then every combining mark in ``vowel_marks`` is removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; characters not listed below are left as they are.
    """
    alef_variants = ('أ', 'إ', 'آ')
    vowel_marks = ('َ', 'ُ', 'ِ', 'ْ', 'ّ', 'ٌ', 'ً', 'ٍ')

    # One ordered table of (old, new) pairs: alef forms first, then marks.
    replacements = [(variant, 'ا') for variant in alef_variants]
    replacements += [(mark, '') for mark in vowel_marks]

    for old, new in replacements:
        text = text.replace(old, new)
    return text


def normalize(word):
    """Strip common Arabic prefixes and suffixes from a single word.

    Each affix is tried once, walking the tables below from last entry to
    first. Stripping is cumulative: after one affix is removed, the
    remaining affixes are tested against the shortened word. Finally every
    alef maqsura is rewritten as yaa.

    Args:
        word: The word to normalise.

    Returns:
        The stripped word, which may be the empty string.
    """
    prefixes = ('ال', 'و', 'ف', 'ب', 'ك', 'ل', 'ت')
    suffixes = ('ة', 'ي', 'ه', 'نا', 'كم', 'هما', 'كن', 'ن', 'ا', 'ت')

    for affix in reversed(prefixes):
        if word.startswith(affix):
            word = word[len(affix):]

    for affix in reversed(suffixes):
        if word.endswith(affix):
            word = word[:len(word) - len(affix)]

    return word.replace('ى', 'ي')

def apply_prefix_rules(word):
    """Remove a further round of prefixes from a word.

    Three rules are applied in order, each to the result of the previous:

    1. A leading waw is dropped when the next letter is lam, seen, yaa or
       noon.
    2. A leading 'الم' or, failing that, 'ال' is dropped.
    3. A leading 'لل' is dropped.

    Args:
        word: The word to process.

    Returns:
        The word with the matching prefixes removed (possibly empty).
    """
    if word.startswith('و') and word[1:].startswith(('ل', 'س', 'ي', 'ن')):
        word = word[1:]

    # Try the longer prefix first and cut exactly what matched; a fixed
    # three-character cut would also remove the first stem letter after 'ال'.
    for prefix in ('الم', 'ال'):
        if word.startswith(prefix):
            word = word[len(prefix):]
            break

    if word.startswith('لل'):
        word = word[2:]
    return word

def apply_suffix_rules(word):
    """Trim trailing letters from a word according to four ordered rules.

    Each rule inspects the word as left by the previous one:

    1. Ending in any entry of ``trim_one``: drop the last character.
    2. Ending in 'ان': drop two characters if the word is longer than four
       characters, otherwise one.
    3. Ending in any entry of ``trim_two``: drop the last two characters.
    4. Ending in any entry of ``trim_last``: drop the last character.

    NOTE(review): rules 1, 3 and 4 remove a fixed number of characters even
    when the matched ending is longer, so part of the ending can remain.
    Confirm whether that is intended before changing it.

    Args:
        word: The word to process.

    Returns:
        The trimmed word (possibly empty).
    """
    trim_one = ('ات', 'ون', 'ين', 'تن', 'يه', 'ة', 'و')
    trim_two = ('تما', 'تان', 'كما', 'هما', 'نا')
    trim_last = ('وا', 'يا', 'ا')

    if word.endswith(trim_one):
        word = word[:-1]

    if word.endswith('ان'):
        cut = 2 if len(word) > 4 else 1
        word = word[:-cut]

    if word.endswith(trim_two):
        word = word[:-2]

    if word.endswith(trim_last):
        word = word[:-1]

    return word


def stem(text):
    """Stem every word of a text and return the stems joined by spaces.

    Each token from ``tokenize`` goes through ``remove_diacritics``,
    ``normalize`` and ``apply_prefix_rules``. ``apply_suffix_rules`` is
    applied afterwards unless the raw token ends with alef maqsura.

    Args:
        text: The text to stem.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when the
        text yields no tokens.
    """
    stemmed_tokens = []
    for raw_token in tokenize(text):
        # Decided on the raw token, before any cleaning changes its ending.
        skip_suffix_rules = raw_token[-1] == 'ى'

        stemmed = apply_prefix_rules(normalize(remove_diacritics(raw_token)))
        if not skip_suffix_rules:
            stemmed = apply_suffix_rules(stemmed)

        stemmed_tokens.append(stemmed)

    return ' '.join(stemmed_tokens)

In [207]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
arabic_stop_words = set(stopwords.words('arabic'))

text = 'الإيمان  ... وأثره على الصحة النفسية'
words = text.split()

# Stem every word that is not a stop word. Words that stem to '' (for
# example punctuation-only words) are dropped by building a new list:
# removing items from a list while iterating over it skips the element that
# follows each removal, so some empty strings could be left behind.
stemmed_words = (stem(word) for word in words if word not in arabic_stop_words)
filtered_words = [stemmed for stemmed in stemmed_words if stemmed]

print(filtered_words)

['ايم', 'اثر', 'صح', 'نفسي']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
