# Arabic grammar word

In [1]:
class ArabicMorphologicalAnalyzer:
    def __init__(self):
        # Basic patterns (أوزان) for trilateral roots
        self.patterns = {
            'regular': {
                'فَعَلَ': {'past': 'فَعَلَ', 'present': 'يَفْعَلُ', 'masdar': 'فَعْل'},
                'فَعِلَ': {'past': 'فَعِلَ', 'present': 'يَفْعَلُ', 'masdar': 'فَعَل'},
                'فَعُلَ': {'past': 'فَعُلَ', 'present': 'يَفْعُلُ', 'masdar': 'فُعْل'},
                'فَعَّلَ': {'past': 'فَعَّلَ', 'present': 'يُفَعِّلُ', 'masdar': 'تَفْعِيل'},
                'فَاعَلَ': {'past': 'فَاعَلَ', 'present': 'يُفَاعِلُ', 'masdar': 'مُفَاعَلَة'},
                'أَفْعَلَ': {'past': 'أَفْعَلَ', 'present': 'يُفْعِلُ', 'masdar': 'إِفْعَال'},
                'تَفَعَّلَ': {'past': 'تَفَعَّلَ', 'present': 'يَتَفَعَّلُ', 'masdar': 'تَفَعُّل'},
                'تَفَاعَلَ': {'past': 'تَفَاعَلَ', 'present': 'يَتَفَاعَلُ', 'masdar': 'تَفَاعُل'},
                'اِنْفَعَلَ': {'past': 'اِنْفَعَلَ', 'present': 'يَنْفَعِلُ', 'masdar': 'اِنْفِعَال'},
                'اِفْتَعَلَ': {'past': 'اِفْتَعَلَ', 'present': 'يَفْتَعِلُ', 'masdar': 'اِفْتِعَال'}
            },
            'irregular': {
                'hollow': {
                    'قَالَ': {'past': 'قَالَ', 'present': 'يَقُولُ', 'masdar': 'قَوْل'},
                    'بَاعَ': {'past': 'بَاعَ', 'present': 'يَبِيعُ', 'masdar': 'بَيْع'}
                },
                'hamzated': {
                    'أَكَلَ': {'past': 'أَكَلَ', 'present': 'يَأْكُلُ', 'masdar': 'أَكْل'},
                    'سَأَلَ': {'past': 'سَأَلَ', 'present': 'يَسْأَلُ', 'masdar': 'سُؤَال'}
                },
                'doubled': {
                    'مَدَّ': {'past': 'مَدَّ', 'present': 'يَمُدُّ', 'masdar': 'مَدّ'},
                    'فَرَّ': {'past': 'فَرَّ', 'present': 'يَفِرُّ', 'masdar': 'فِرَار'}
                }
            }
        }
        
        # Case endings (إعراب)
        self.case_endings = {
            'rafa': {
                'singular': 'ُ',
                'dual': 'َانِ',
                'plural_masc': 'ُونَ',
                'plural_fem': 'َاتُ'
            },
            'nasb': {
                'singular': 'َ',
                'dual': 'َيْنِ',
                'plural_masc': 'ِينَ',
                'plural_fem': 'َاتِ'
            },
            'jarr': {
                'singular': 'ِ',
                'dual': 'َيْنِ',
                'plural_masc': 'ِينَ',
                'plural_fem': 'َاتِ'
            }
        }
        
        # Pronouns and their conjugations
        self.pronouns = {
            'singular': {
                'first': 'أَنَا',
                'second_masc': 'أَنْتَ',
                'second_fem': 'أَنْتِ',
                'third_masc': 'هُوَ',
                'third_fem': 'هِيَ'
            },
            'dual': {
                'second': 'أَنْتُمَا',
                'third_masc': 'هُمَا',
                'third_fem': 'هُمَا'
            },
            'plural': {
                'first': 'نَحْنُ',
                'second_masc': 'أَنْتُمْ',
                'second_fem': 'أَنْتُنَّ',
                'third_masc': 'هُمْ',
                'third_fem': 'هُنَّ'
            }
        }

    def get_pattern(self, word: str) -> dict:
        """
        Identify the morphological pattern of a given word
        """
        # Check regular patterns
        for pattern, forms in self.patterns['regular'].items():
            if self._matches_pattern(word, pattern):
                return {'type': 'regular', 'pattern': pattern, 'forms': forms}
        
        # Check irregular patterns
        for irreg_type, patterns in self.patterns['irregular'].items():
            for pattern, forms in patterns.items():
                if self._matches_pattern(word, pattern):
                    return {'type': 'irregular', 'subtype': irreg_type, 'pattern': pattern, 'forms': forms}
        
        return None

    def _matches_pattern(self, word: str, pattern: str) -> bool:
        """
        Check if a word matches a given pattern
        """
        # Remove diacritics for comparison
        word_clean = self._remove_diacritics(word)
        pattern_clean = self._remove_diacritics(pattern)
        
        if len(word_clean) != len(pattern_clean):
            return False
            
        # Extract root letters from pattern (typically ف ع ل)
        root_positions = [i for i, char in enumerate(pattern_clean) 
                         if char in 'فعل']
        
        # Check if the word follows the pattern structure
        return all(word_clean[i] == pattern_clean[i] 
                  for i in range(len(pattern_clean)) 
                  if i not in root_positions)

    def add_case_ending(self, word: str, case: str, number: str) -> str:
        """
        Add the appropriate case ending to a word
        """
        if case not in self.case_endings or number not in self.case_endings[case]:
            return word
        
        return word + self.case_endings[case][number]

    def conjugate(self, root: str, pattern: str, pronoun: str) -> str:
        """
        Conjugate a verb according to its pattern and pronoun
        """
        if pattern not in self.patterns['regular']:
            return None
            
        pattern_forms = self.patterns['regular'][pattern]
        
        # Basic conjugation logic (simplified)
        if pronoun in self.pronouns['singular']:
            return pattern_forms['present']  # This is simplified
        elif pronoun in self.pronouns['dual']:
            return pattern_forms['present'] + 'ان'
        elif pronoun in self.pronouns['plural']:
            return pattern_forms['present'] + 'ون'
            
        return None

    @staticmethod
    def _remove_diacritics(text: str) -> str:
        """
        Remove Arabic diacritical marks from text
        """
        diacritics = ['َ', 'ُ', 'ِ', 'ّ', 'ْ', 'ً', 'ٌ', 'ٍ']
        return ''.join(char for char in text if char not in diacritics)

    def analyze_word(self, word: str) -> dict:
        """
        Perform complete morphological analysis of a word
        """
        analysis = {
            'original': word,
            'without_diacritics': self._remove_diacritics(word),
            'pattern': self.get_pattern(word),
            'possible_cases': {}
        }
        
        # Add possible case endings
        for case in self.case_endings:
            analysis['possible_cases'][case] = {
                'singular': self.add_case_ending(word, case, 'singular'),
                'dual': self.add_case_ending(word, case, 'dual'),
                'plural_masc': self.add_case_ending(word, case, 'plural_masc'),
                'plural_fem': self.add_case_ending(word, case, 'plural_fem')
            }
            
        return analysis

# Example usage
if __name__ == "__main__":
    analyzer = ArabicMorphologicalAnalyzer()
    
    # Example analysis
    word = "كَتَبَ"
    analysis = analyzer.analyze_word(word)
    print(f"Analysis of {word}:")
    print(f"Pattern: {analysis['pattern']}")
    print("\nPossible cases:")
    for case, forms in analysis['possible_cases'].items():
        print(f"\n{case.upper()}:")
        for number, form in forms.items():
            print(f"  {number}: {form}")

Analysis of كَتَبَ:
Pattern: {'type': 'regular', 'pattern': 'فَعَلَ', 'forms': {'past': 'فَعَلَ', 'present': 'يَفْعَلُ', 'masdar': 'فَعْل'}}

Possible cases:

RAFA:
  singular: كَتَبَُ
  dual: كَتَبََانِ
  plural_masc: كَتَبَُونَ
  plural_fem: كَتَبََاتُ

NASB:
  singular: كَتَبََ
  dual: كَتَبََيْنِ
  plural_masc: كَتَبَِينَ
  plural_fem: كَتَبََاتِ

JARR:
  singular: كَتَبَِ
  dual: كَتَبََيْنِ
  plural_masc: كَتَبَِينَ
  plural_fem: كَتَبََاتِ


# Word and sentences analysis

In [4]:
class ArabicMorphologicalAnalyzer:
    def __init__(self):
        # Previous initializations remain (patterns, case_endings, pronouns)
        # Adding ranking points for different features
        self.rank_points = {
            'case': {
                'rafa': 3,    # مرفوع
                'nasb': 2,    # منصوب
                'jarr': 1     # مجرور
            },
            'tense': {
                'past': 1,    # ماضي
                'present': 2, # مضارع
                'future': 3   # مستقبل
            },
            'number': {
                'singular': 1, # مفرد
                'dual': 2,    # مثنى
                'plural': 3   # جمع
            },
            'gender': {
                'masculine': 1, # مذكر
                'feminine': 2  # مؤنث
            }
        }
        
        # Adding markers for different features
        self.markers = {
            'future': 'س',  # Future marker س
            'dual_markers': ['ان', 'ين'],
            'plural_markers': {
                'masculine': ['ون', 'ين'],
                'feminine': ['ات']
            },
            'feminine_markers': ['ة', 'ى', 'اء']
        }
        
        # Previous patterns, case_endings, and pronouns code remains here...

    def analyze_sentence(self, sentence: str) -> dict:
        """
        Analyze a complete sentence with rankings for each word
        """
        words = sentence.split()
        analysis = {
            'full_sentence': sentence,
            'word_count': len(words),
            'words': [],
            'total_rank': 0
        }
        
        for word in words:
            word_analysis = self.analyze_word_comprehensive(word)
            analysis['words'].append(word_analysis)
            analysis['total_rank'] += word_analysis['total_rank']
            
        return analysis

    def analyze_word_comprehensive(self, word: str) -> dict:
        """
        Comprehensive word analysis with ranking points
        """
        analysis = {
            'word': word,
            'features': {
                'case': self._determine_case(word),
                'tense': self._determine_tense(word),
                'number': self._determine_number(word),
                'gender': self._determine_gender(word)
            },
            'rankings': {},
            'total_rank': 0
        }
        
        # Calculate rankings for each feature
        for feature, value in analysis['features'].items():
            if value in self.rank_points[feature]:
                rank = self.rank_points[feature][value]
                analysis['rankings'][feature] = rank
                analysis['total_rank'] += rank
        
        return analysis

    def _determine_case(self, word: str) -> str:
        """
        Determine the grammatical case of a word
        """
        word = self._remove_diacritics(word)
        
        # Case determination rules
        if word.endswith('ُ') or word.endswith('ُن') or word.endswith('ون'):
            return 'rafa'
        elif word.endswith('َ') or word.endswith('ًا') or word.endswith('ين'):
            return 'nasb'
        elif word.endswith('ِ') or word.endswith('ٍ'):
            return 'jarr'
        
        # Default case based on position and context would go here
        return 'rafa'  # Default case

    def _determine_tense(self, word: str) -> str:
        """
        Determine the tense of a word
        """
        # Remove diacritics for easier matching
        word = self._remove_diacritics(word)
        
        # Future tense check
        if word.startswith('س') or word.startswith('سوف'):
            return 'future'
            
        # Present tense markers (ي، ت، أ، ن)
        present_markers = ['ي', 'ت', 'أ', 'ن']
        if any(word.startswith(marker) for marker in present_markers):
            return 'present'
            
        # Past tense typically ends with appropriate suffixes
        past_endings = ['ت', 'تما', 'تم', 'تن', 'نا']
        if any(word.endswith(ending) for ending in past_endings):
            return 'past'
            
        return 'present'  # Default tense

    def _determine_number(self, word: str) -> str:
        """
        Determine if word is singular, dual, or plural
        """
        word = self._remove_diacritics(word)
        
        # Check dual markers
        if any(word.endswith(marker) for marker in self.markers['dual_markers']):
            return 'dual'
            
        # Check plural markers
        if (any(word.endswith(marker) for marker in self.markers['plural_markers']['masculine']) or
            any(word.endswith(marker) for marker in self.markers['plural_markers']['feminine'])):
            return 'plural'
            
        return 'singular'  # Default number

    def _determine_gender(self, word: str) -> str:
        """
        Determine if word is masculine or feminine
        """
        word = self._remove_diacritics(word)
        
        # Check feminine markers
        if any(word.endswith(marker) for marker in self.markers['feminine_markers']):
            return 'feminine'
            
        return 'masculine'  # Default gender

    def analyze_phrase(self, phrase: str) -> dict:
        """
        Analyze a phrase (smaller than sentence) with context
        """
        words = phrase.split()
        analysis = {
            'phrase': phrase,
            'word_count': len(words),
            'words': [],
            'phrase_type': self._determine_phrase_type(words),
            'total_rank': 0
        }
        
        for i, word in enumerate(words):
            word_analysis = self.analyze_word_comprehensive(word)
            # Add contextual analysis based on position in phrase
            word_analysis['position'] = self._analyze_position(i, len(words))
            analysis['words'].append(word_analysis)
            analysis['total_rank'] += word_analysis['total_rank']
            
        return analysis

    def _determine_phrase_type(self, words: list) -> str:
        """
        Determine the type of phrase (nominal or verbal)
        """
        if not words:
            return 'unknown'
            
        first_word = self._remove_diacritics(words[0])
        
        # Check if phrase starts with a verb
        if self._determine_tense(first_word) in ['past', 'present', 'future']:
            return 'verbal_phrase'
            
        return 'nominal_phrase'

    def _analyze_position(self, index: int, total_words: int) -> str:
        """
        Analyze word position in phrase/sentence
        """
        if index == 0:
            return 'initial'
        elif index == total_words - 1:
            return 'final'
        else:
            return 'middle'

    def get_total_score(self, text: str) -> dict:
        """
        Get comprehensive score for text (word/phrase/sentence)
        """
        words = text.split()
        if len(words) == 1:
            return self.analyze_word_comprehensive(text)
        elif len(words) <= 3:
            return self.analyze_phrase(text)
        else:
            return self.analyze_sentence(text)
        
    @staticmethod
    def _remove_diacritics(text: str) -> str:
        """
        Remove Arabic diacritical marks from text
        """
        diacritics = ['َ', 'ُ', 'ِ', 'ّ', 'ْ', 'ً', 'ٌ', 'ٍ']
        return ''.join(char for char in text if char not in diacritics)

    def analyze_word(self, word: str) -> dict:
        """
        Perform complete morphological analysis of a word
        """
        analysis = {
            'original': word,
            'without_diacritics': self._remove_diacritics(word),
            'pattern': self.get_pattern(word),
            'possible_cases': {}
        }
        
        # Add possible case endings
        for case in self.case_endings:
            analysis['possible_cases'][case] = {
                'singular': self.add_case_ending(word, case, 'singular'),
                'dual': self.add_case_ending(word, case, 'dual'),
                'plural_masc': self.add_case_ending(word, case, 'plural_masc'),
                'plural_fem': self.add_case_ending(word, case, 'plural_fem')
            }
            
        return analysis

# Example usage
if __name__ == "__main__":
    analyzer = ArabicMorphologicalAnalyzer()
    
    # Example analyses
    word = "يَكْتُبُ"  # Present tense, masculine, singular
    phrase = "الكتابُ الجديدُ"  # Nominal phrase
    sentence = "كَتَبَ الطالبُ الدرسَ"  # Verbal sentence
    
    # Single word analysis
    word_analysis = analyzer.get_total_score(word)
    print(f"\nWord Analysis for '{word}':")
    print(f"Features: {word_analysis['features']}")
    print(f"Rankings: {word_analysis['rankings']}")
    print(f"Total Rank: {word_analysis['total_rank']}")
    
    # Phrase analysis
    phrase_analysis = analyzer.get_total_score(phrase)
    print(f"\nPhrase Analysis for '{phrase}':")
    print(f"Phrase Type: {phrase_analysis['phrase_type']}")
    print(f"Total Rank: {phrase_analysis['total_rank']}")
    
    # Sentence analysis
    sentence_analysis = analyzer.get_total_score(sentence)
    print(f"\nSentence Analysis for '{sentence}':")
    print(f"Word Count: {sentence_analysis['word_count']}")
    print(f"Total Rank: {sentence_analysis['total_rank']}")


Word Analysis for 'يَكْتُبُ':
Features: {'case': 'rafa', 'tense': 'present', 'number': 'singular', 'gender': 'masculine'}
Rankings: {'case': 3, 'tense': 2, 'number': 1, 'gender': 1}
Total Rank: 7

Phrase Analysis for 'الكتابُ الجديدُ':
Phrase Type: verbal_phrase
Total Rank: 14

Sentence Analysis for 'كَتَبَ الطالبُ الدرسَ':
Word Count: 3
Total Rank: 21


# Token generation

In [13]:
# class ArabicTokenGenerator:
#     def __init__(self):
#         # Token types with their respective features
#         self.token_types = {
#             'WORD': 1,
#             'PUNCTUATION': 2,
#             'NUMBER': 3,
#             'SYMBOL': 4
#         }
        
#         # Token features based on grammatical rules
#         self.token_features = {
#             'case': ['rafa', 'nasb', 'jarr'],
#             'tense': ['past', 'present', 'future'],
#             'number': ['singular', 'dual', 'plural'],
#             'gender': ['masculine', 'feminine']
#         }
        
#         # Special characters and their token types
#         self.special_chars = {
#             '.': 'PUNCTUATION',
#             '،': 'PUNCTUATION',
#             '؟': 'PUNCTUATION',
#             '!': 'PUNCTUATION',
#             '@': 'SYMBOL',
#             '#': 'SYMBOL'
#         }

# class ArabicMorphologicalAnalyzer:
#     def __init__(self):
#         # Previous initializations remain...
#         self.token_generator = ArabicTokenGenerator()
        
#         # Token position markers
#         self.position_markers = {
#             'START': 'S',
#             'MIDDLE': 'M',
#             'END': 'E',
#             'SINGLE': 'SI'
#         }

#     def generate_tokens(self, text: str) -> list:
#         """
#         Generate tokens for given text based on morphological rules
#         """
#         analysis = self.get_total_score(text)
#         tokens = []
        
#         if isinstance(analysis['words'], list):
#             # For phrases and sentences
#             for i, word_analysis in enumerate(analysis['words']):
#                 token = self._create_token(word_analysis, i, len(analysis['words']))
#                 tokens.append(token)
#         else:
#             # For single words
#             tokens.append(self._create_token(analysis, 0, 1))
            
#         return tokens

#     def _create_token(self, word_analysis: dict, position: int, total_words: int) -> dict:
#         """
#         Create a token with all relevant features
#         """
#         # Generate unique token ID
#         token_id = self._generate_token_id(word_analysis['word'], position)
        
#         # Determine position marker
#         position_marker = self._get_position_marker(position, total_words)
        
#         return {
#             'token_id': token_id,
#             'word': word_analysis['word'],
#             'features': word_analysis['features'],
#             'position': position_marker,
#             'rank': word_analysis['total_rank'],
#             'type': self._determine_token_type(word_analysis['word']),
#             'metadata': {
#                 'position_index': position,
#                 'total_positions': total_words,
#                 'normalized_form': self._normalize_word(word_analysis['word'])
#             }
#         }

#     def _generate_token_id(self, word: str, position: int) -> str:
#         """
#         Generate unique token ID based on word and position
#         """
#         normalized_word = self._normalize_word(word)
#         return f"TK_{normalized_word}_{position}"

#     def _get_position_marker(self, position: int, total_words: int) -> str:
#         """
#         Determine position marker for token
#         """
#         if total_words == 1:
#             return self.position_markers['SINGLE']
#         elif position == 0:
#             return self.position_markers['START']
#         elif position == total_words - 1:
#             return self.position_markers['END']
#         else:
#             return self.position_markers['MIDDLE']

#     def _determine_token_type(self, word: str) -> str:
#         """
#         Determine token type based on word characteristics
#         """
#         if word in self.token_generator.special_chars:
#             return self.token_generator.special_chars[word]
#         elif word.isdigit():
#             return 'NUMBER'
#         else:
#             return 'WORD'

#     def generate_sentence_tokens(self, sentence: str) -> list:
#         """
#         Generate tokens for a complete sentence with context
#         """
#         words = sentence.split()
#         tokens = []
        
#         for i, word in enumerate(words):
#             # Get word analysis
#             word_analysis = self.analyze_word_comprehensive(word)
            
#             # Create base token
#             token = self._create_token(word_analysis, i, len(words))
            
#             # Add contextual information
#             token['context'] = {
#                 'previous': words[i-1] if i > 0 else None,
#                 'next': words[i+1] if i < len(words)-1 else None,
#                 'sentence_position': self._get_sentence_position(i, len(words))
#             }
            
#             tokens.append(token)
            
#         return tokens

#     def _get_sentence_position(self, index: int, total_words: int) -> str:
#         """
#         Get detailed sentence position information
#         """
#         if total_words <= 1:
#             return 'COMPLETE'
#         elif index == 0:
#             return 'BEGINNING'
#         elif index == total_words - 1:
#             return 'END'
#         else:
#             return f'MIDDLE_{index}/{total_words}'

#     def _normalize_word(self, word: str) -> str:
#         """
#         Normalize word for token ID generation
#         """
#         # Remove diacritics and special characters
#         normalized = self._remove_diacritics(word)
#         # Replace spaces and special characters
#         normalized = normalized.replace(' ', '_')
#         return normalized

#     def tokenize_text(self, text: str) -> dict:
#         """
#         Complete tokenization of text with analysis
#         """
#         return {
#             'original_text': text,
#             'tokens': self.generate_tokens(text),
#             'token_count': len(text.split()),
#             'analysis': self.get_total_score(text),
#             'metadata': {
#                 'has_punctuation': any(char in text for char in self.token_generator.special_chars),
#                 'normalized_text': self._normalize_word(text)
#             }
#         }

# # Example usage
# if __name__ == "__main__":
#     analyzer = ArabicMorphologicalAnalyzer()
    
#     # Example text
#     word = "يَكْتُبُ"
#     phrase = "الكتابُ الجديدُ"
#     sentence = "كَتَبَ الطالبُ الدرسَ"
    
#     # Generate tokens for different text types
#     word_tokens = analyzer.tokenize_text(word)
#     phrase_tokens = analyzer.tokenize_text(phrase)
#     sentence_tokens = analyzer.tokenize_text(sentence)
    
#     # Print results
#     print("\nWord Tokens:")
#     print(word_tokens)
    
#     print("\nPhrase Tokens:")
#     print(phrase_tokens)
    
#     print("\nSentence Tokens:")
#     print(sentence_tokens)

In [14]:
class UnifiedArabicAnalyzer:
    def __init__(self):
        # Patterns (أوزان) for trilateral roots
        self.patterns = {
            'regular': {
                'فَعَلَ': {'past': 'فَعَلَ', 'present': 'يَفْعَلُ', 'masdar': 'فَعْل'},
                'فَعِلَ': {'past': 'فَعِلَ', 'present': 'يَفْعَلُ', 'masdar': 'فَعَل'},
                'فَعُلَ': {'past': 'فَعُلَ', 'present': 'يَفْعُلُ', 'masdar': 'فُعْل'},
                'فَعَّلَ': {'past': 'فَعَّلَ', 'present': 'يُفَعِّلُ', 'masdar': 'تَفْعِيل'},
                'فَاعَلَ': {'past': 'فَاعَلَ', 'present': 'يُفَاعِلُ', 'masdar': 'مُفَاعَلَة'}
            },
            'irregular': {
                'hollow': {
                    'قَالَ': {'past': 'قَالَ', 'present': 'يَقُولُ', 'masdar': 'قَوْل'},
                    'بَاعَ': {'past': 'بَاعَ', 'present': 'يَبِيعُ', 'masdar': 'بَيْع'}
                },
                'hamzated': {
                    'أَكَلَ': {'past': 'أَكَلَ', 'present': 'يَأْكُلُ', 'masdar': 'أَكْل'},
                    'سَأَلَ': {'past': 'سَأَلَ', 'present': 'يَسْأَلُ', 'masdar': 'سُؤَال'}
                }
            }
        }
        
        # Ranking points for different features
        self.rank_points = {
            'case': {'rafa': 3, 'nasb': 2, 'jarr': 1},
            'tense': {'future': 3, 'present': 2, 'past': 1},
            'number': {'plural': 3, 'dual': 2, 'singular': 1},
            'gender': {'feminine': 2, 'masculine': 1}
        }
        
        # Token types and markers
        self.token_types = {
            'WORD': 1,
            'PUNCTUATION': 2,
            'NUMBER': 3,
            'SYMBOL': 4
        }
        
        # Position markers
        self.position_markers = {
            'START': 'S',
            'MIDDLE': 'M',
            'END': 'E',
            'SINGLE': 'SI'
        }
        
        # Feature markers
        self.markers = {
            'future': 'س',
            'dual_markers': ['ان', 'ين'],
            'plural_markers': {
                'masculine': ['ون', 'ين'],
                'feminine': ['ات']
            },
            'feminine_markers': ['ة', 'ى', 'اء']
        }

    def analyze(self, text: str) -> dict:
        """
        Unified analysis method that combines morphological analysis, ranking, and tokenization
        """
        words = text.split()
        
        result = {
            'original_text': text,
            'word_count': len(words),
            'analysis_type': self._determine_analysis_type(words),
            'morphological_analysis': [],
            'tokens': [],
            'total_rank': 0,
            'metadata': {
                'normalized_text': self._normalize_text(text),
                'has_diacritics': self._has_diacritics(text)
            }
        }
        
        # Analyze each word
        for i, word in enumerate(words):
            word_analysis = self._analyze_word(word, i, len(words))
            result['morphological_analysis'].append(word_analysis['morphology'])
            result['tokens'].append(word_analysis['token'])
            result['total_rank'] += word_analysis['rank']
            
        return result

    def _analyze_word(self, word: str, position: int, total_words: int) -> dict:
        """
        Comprehensive word analysis including morphology, ranking, and tokenization
        """
        # Morphological analysis
        morphology = {
            'word': word,
            'pattern': self._get_pattern(word),
            'features': self._get_features(word),
            'root': self._extract_root(word)
        }
        
        # Calculate rank
        features = morphology['features']
        rank = sum(self.rank_points[feature][value] 
                  for feature, value in features.items() 
                  if feature in self.rank_points and value in self.rank_points[feature])
        
        # Generate token
        token = {
            'token_id': f"TK_{self._normalize_text(word)}_{position}",
            'word': word,
            'type': self._determine_token_type(word),
            'position': self._get_position_marker(position, total_words),
            'features': features
        }
        
        return {
            'morphology': morphology,
            'token': token,
            'rank': rank
        }

    def _get_features(self, word: str) -> dict:
        """
        Extract all grammatical features from a word
        """
        return {
            'case': self._determine_case(word),
            'tense': self._determine_tense(word),
            'number': self._determine_number(word),
            'gender': self._determine_gender(word)
        }

    def _determine_case(self, word: str) -> str:
        """
        Determine grammatical case
        """
        word = self._remove_diacritics(word)
        if word.endswith(('ُ', 'ُن', 'ون')):
            return 'rafa'
        elif word.endswith(('َ', 'ًا', 'ين')):
            return 'nasb'
        elif word.endswith(('ِ', 'ٍ')):
            return 'jarr'
        return 'rafa'

    def _determine_tense(self, word: str) -> str:
        """
        Determine verb tense
        """
        if word.startswith(self.markers['future']):
            return 'future'
        elif any(word.startswith(m) for m in ['ي', 'ت', 'أ', 'ن']):
            return 'present'
        return 'past'

    def _determine_number(self, word: str) -> str:
        """
        Determine grammatical number
        """
        word = self._remove_diacritics(word)
        if any(word.endswith(m) for m in self.markers['dual_markers']):
            return 'dual'
        elif (any(word.endswith(m) for m in self.markers['plural_markers']['masculine']) or
              any(word.endswith(m) for m in self.markers['plural_markers']['feminine'])):
            return 'plural'
        return 'singular'

    def _determine_gender(self, word: str) -> str:
        """
        Determine grammatical gender
        """
        if any(word.endswith(m) for m in self.markers['feminine_markers']):
            return 'feminine'
        return 'masculine'

    def _get_pattern(self, word: str) -> dict:
        """
        Match word to morphological pattern
        """
        for pattern_type, patterns in self.patterns.items():
            for pattern, forms in patterns.items():
                if self._matches_pattern(word, pattern):
                    return {'type': pattern_type, 'pattern': pattern, 'forms': forms}
        return None

    def _matches_pattern(self, word: str, pattern: str) -> bool:
        """
        Check if word matches given pattern
        """
        word_clean = self._remove_diacritics(word)
        pattern_clean = self._remove_diacritics(pattern)
        
        if len(word_clean) != len(pattern_clean):
            return False
            
        root_positions = [i for i, char in enumerate(pattern_clean) if char in 'فعل']
        return all(word_clean[i] == pattern_clean[i] for i in range(len(pattern_clean)) 
                  if i not in root_positions)

    def _extract_root(self, word: str) -> str:
        """
        Extract root letters from word
        """
        # Simplified root extraction
        word = self._remove_diacritics(word)
        # Remove common prefixes and suffixes
        prefixes = ['است', 'ست', 'ان', 'ال', 'مت', 'مس', 'م']
        suffixes = ['ون', 'ات', 'ان', 'ين', 'ة', 'ه', 'ي', 'ت']
        
        for prefix in prefixes:
            if word.startswith(prefix):
                word = word[len(prefix):]
                break
                
        for suffix in suffixes:
            if word.endswith(suffix):
                word = word[:-len(suffix)]
                break
                
        return word

    @staticmethod
    def _remove_diacritics(text: str) -> str:
        """
        Remove Arabic diacritical marks
        """
        diacritics = ['َ', 'ُ', 'ِ', 'ّ', 'ْ', 'ً', 'ٌ', 'ٍ']
        return ''.join(char for char in text if char not in diacritics)

    @staticmethod
    def _normalize_text(text: str) -> str:
        """
        Normalize Arabic text
        """
        text = text.strip()
        return ''.join(char for char in text if char.isalnum() or char.isspace())

    def _determine_analysis_type(self, words: list) -> str:
        """
        Determine type of text being analyzed
        """
        if len(words) == 0:
            return 'empty'
        elif len(words) == 1:
            return 'word'
        elif len(words) <= 3:
            return 'phrase'
        else:
            return 'sentence'

    def _get_position_marker(self, position: int, total_words: int) -> str:
        """
        Get position marker for token
        """
        if total_words == 1:
            return self.position_markers['SINGLE']
        elif position == 0:
            return self.position_markers['START']
        elif position == total_words - 1:
            return self.position_markers['END']
        return self.position_markers['MIDDLE']

    def _determine_token_type(self, word: str) -> int:
        """
        Determine token type
        """
        if word.isdigit():
            return self.token_types['NUMBER']
        elif any(char in word for char in '،.؟!@#'):
            return self.token_types['PUNCTUATION']
        return self.token_types['WORD']

    @staticmethod
    def _has_diacritics(text: str) -> bool:
        """
        Check if text contains diacritics
        """
        diacritics = ['َ', 'ُ', 'ِ', 'ّ', 'ْ', 'ً', 'ٌ', 'ٍ']
        return any(char in text for char in diacritics)

# Example usage
if __name__ == "__main__":
    analyzer = UnifiedArabicAnalyzer()
    
    # Test different types of text
    word = "يَكْتُبُ"
    phrase = "الكتابُ الجديدُ"
    sentence = "كَتَبَ الطالبُ الدرسَ"
    
    # Analyze each
    word_analysis = analyzer.analyze(word)
    phrase_analysis = analyzer.analyze(phrase)
    sentence_analysis = analyzer.analyze(sentence)
    
    print("\nWord Analysis:")
    print(word_analysis)
    
    print("\nPhrase Analysis:")
    print(phrase_analysis)
    
    print("\nSentence Analysis:")
    print(sentence_analysis)


Word Analysis:
{'original_text': 'يَكْتُبُ', 'word_count': 1, 'analysis_type': 'word', 'morphological_analysis': [{'word': 'يَكْتُبُ', 'pattern': None, 'features': {'case': 'rafa', 'tense': 'present', 'number': 'singular', 'gender': 'masculine'}, 'root': 'يكتب'}], 'tokens': [{'token_id': 'TK_يكتب_0', 'word': 'يَكْتُبُ', 'type': 1, 'position': 'SI', 'features': {'case': 'rafa', 'tense': 'present', 'number': 'singular', 'gender': 'masculine'}}], 'total_rank': 7, 'metadata': {'normalized_text': 'يكتب', 'has_diacritics': True}}

Phrase Analysis:
{'original_text': 'الكتابُ الجديدُ', 'word_count': 2, 'analysis_type': 'phrase', 'morphological_analysis': [{'word': 'الكتابُ', 'pattern': None, 'features': {'case': 'rafa', 'tense': 'past', 'number': 'singular', 'gender': 'masculine'}, 'root': 'كتاب'}, {'word': 'الجديدُ', 'pattern': None, 'features': {'case': 'rafa', 'tense': 'past', 'number': 'singular', 'gender': 'masculine'}, 'root': 'جديد'}], 'tokens': [{'token_id': 'TK_الكتاب_0', 'word': 'ال