<a href="https://colab.research.google.com/github/YUJIHARISH/NLP/blob/main/22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import spacy

class ReferenceResolver:
    """
    A class to resolve references (pronouns, demonstratives, etc.) to their antecedents in text.
    """

    def __init__(self):
        # Load spaCy English language model
        self.nlp = spacy.load("en_core_web_sm")

        # Define pronoun sets
        self.personal_pronouns = {
            'male': {'he', 'him', 'his', 'himself'},
            'female': {'she', 'her', 'hers', 'herself'},
            'neutral': {'it', 'its', 'itself'},
            'plural': {'they', 'them', 'their', 'theirs', 'themselves'}
        }

        self.demonstratives = {'this', 'that', 'these', 'those'}

    def preprocess_text(self, text):
        """Process text with spaCy to get linguistic features."""
        return self.nlp(text)

    def extract_candidate_mentions(self, doc):
        """Extract all noun phrases and named entities as potential antecedents."""
        candidates = {}

        # Extract named entities
        for ent in doc.ents:
            candidates[ent.text] = {
                'span': ent,
                'type': ent.label_,
                'gender': self._infer_gender(ent),
                'is_plural': self._is_plural(ent),
                'position': ent.start
            }

        # Extract noun phrases
        for chunk in doc.noun_chunks:
            if chunk.text not in candidates:
                candidates[chunk.text] = {
                    'span': chunk,
                    'type': 'NP',
                    'gender': self._infer_gender(chunk),
                    'is_plural': self._is_plural(chunk),
                    'position': chunk.start
                }

        return candidates

    def _infer_gender(self, span):
        """Attempt to infer gender of an entity or noun phrase."""
        # This is a simplified version - in practice would use more sophisticated methods
        text = span.text.lower()

        # Check for gendered pronouns within the span
        if any(word in text for word in self.personal_pronouns['male']):
            return 'male'
        if any(word in text for word in self.personal_pronouns['female']):
            return 'female'

        # Check for gendered words
        male_indicators = ['man', 'boy', 'father', 'brother', 'son', 'uncle']
        female_indicators = ['woman', 'girl', 'mother', 'sister', 'daughter', 'aunt']

        if any(word in text for word in male_indicators):
            return 'male'
        if any(word in text for word in female_indicators):
            return 'female'

        # Default to unknown
        return 'unknown'

    def _is_plural(self, span):
        """Determine if the noun phrase is plural."""
        # Check the root of the span
        for token in span:
            if token.pos_ == "NOUN" or token.pos_ == "PROPN":
                return token.tag_ == "NNS" or token.tag_ == "NNPS"

        # Default assumption
        return False

    def identify_references(self, doc):
        """Identify all references (pronouns, etc.) in the text."""
        references = []

        for token in doc:
            if token.text.lower() in [p for pronoun_set in self.personal_pronouns.values() for p in pronoun_set]:
                references.append({
                    'text': token.text,
                    'position': token.i,
                    'type': 'personal_pronoun',
                    'gender': self._get_pronoun_gender(token.text.lower()),
                    'is_plural': token.text.lower() in self.personal_pronouns['plural']
                })
            elif token.text.lower() in self.demonstratives:
                references.append({
                    'text': token.text,
                    'position': token.i,
                    'type': 'demonstrative',
                    'is_plural': token.text.lower() in ['these', 'those']
                })

        return references

    def _get_pronoun_gender(self, pronoun):
        """Get the gender of a pronoun."""
        for gender, pronouns in self.personal_pronouns.items():
            if pronoun in pronouns:
                return gender
        return 'unknown'

    def resolve_references(self, text):
        """Resolve references in the given text."""
        doc = self.preprocess_text(text)
        candidates = self.extract_candidate_mentions(doc)
        references = self.identify_references(doc)

        resolutions = {}

        for ref in references:
            antecedent = self._find_antecedent(ref, candidates, doc)
            if antecedent:
                resolutions[ref['text'] + '_' + str(ref['position'])] = {
                    'reference': ref['text'],
                    'ref_position': ref['position'],
                    'antecedent': antecedent,
                    'sentence': self._get_sentence(doc, ref['position'])
                }
            else:
                resolutions[ref['text'] + '_' + str(ref['position'])] = {
                    'reference': ref['text'],
                    'ref_position': ref['position'],
                    'antecedent': None,
                    'sentence': self._get_sentence(doc, ref['position'])
                }

        return resolutions

    def _find_antecedent(self, reference, candidates, doc):
        """Find the most likely antecedent for a reference."""
        # Get candidate antecedents that appear before the reference
        potential_antecedents = {}
        for text, info in candidates.items():
            if info['position'] < reference['position']:
                # Calculate recency score (higher for more recent mentions)
                recency_score = 1.0 / (reference['position'] - info['position'])

                # Calculate gender/number agreement score
                agreement_score = 0

                if reference['type'] == 'personal_pronoun':
                    # Check gender agreement
                    if reference['gender'] == info['gender'] or info['gender'] == 'unknown' or reference['gender'] == 'unknown':
                        agreement_score += 1

                    # Check number agreement
                    if reference['is_plural'] == info['is_plural']:
                        agreement_score += 1

                # Calculate final score
                final_score = recency_score * (1 + agreement_score)
                potential_antecedents[text] = final_score

        # Get the most likely antecedent (highest score)
        if potential_antecedents:
            return max(potential_antecedents.items(), key=lambda x: x[1])[0]

        return None

    def _get_sentence(self, doc, token_position):
        """Get the sentence containing the token at the given position."""
        for sent in doc.sents:
            if sent.start <= token_position < sent.end:
                return sent.text
        return ""

    def format_output(self, resolutions):
        """Format the resolutions for display."""
        result = []
        for ref_id, info in resolutions.items():
            if info['antecedent']:
                result.append(f"Reference: '{info['reference']}' -> Antecedent: '{info['antecedent']}'")
                result.append(f"    Sentence: \"{info['sentence']}\"\n")
            else:
                result.append(f"Reference: '{info['reference']}' -> No clear antecedent found")
                result.append(f"    Sentence: \"{info['sentence']}\"\n")

        return "\n".join(result)


def main():
    # Example usage
    resolver = ReferenceResolver()

    sample_text = """
    John went to the store because he needed groceries.
    Mary saw him there while she was shopping.
    They decided to have coffee together.
    The coffee shop was busy but they found a table.
    This made their day much better.
    """

    print("Analyzing text:\n" + sample_text + "\n")

    resolutions = resolver.resolve_references(sample_text)
    formatted_output = resolver.format_output(resolutions)

    print("Reference Resolution Results:")
    print(formatted_output)


if __name__ == "__main__":
    main()

Analyzing text:

    John went to the store because he needed groceries.
    Mary saw him there while she was shopping.
    They decided to have coffee together.
    The coffee shop was busy but they found a table.
    This made their day much better.
    

Reference Resolution Results:
Reference: 'he' -> Antecedent: 'the store'
    Sentence: "
    John went to the store because he needed groceries.
    "

Reference: 'him' -> Antecedent: 'Mary'
    Sentence: "Mary saw him there while she was shopping.
    "

Reference: 'she' -> Antecedent: 'him'
    Sentence: "Mary saw him there while she was shopping.
    "

Reference: 'They' -> Antecedent: 'groceries'
    Sentence: "They decided to have coffee together.
    "

Reference: 'they' -> Antecedent: 'coffee'
    Sentence: "The coffee shop was busy but they found a table.
    "

Reference: 'This' -> Antecedent: 'a table'
    Sentence: "This made their day much better.
    "

Reference: 'their' -> Antecedent: 'This'
    Sentence: "This made t