# Install and Import

In [1]:
!pip install pandas numpy openpyxl spacy matplotlib seaborn networkx plotly
!pip install transformers torch datasets
!pip install scikit-learn

!python -m spacy download en_core_web_trf
!pip install huggingface_hub
!pip install pyvis #network graph
!pip install dash #interactive dashboard
!pip install rapidfuzz


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m457.4

In [2]:
import pandas as pd
import numpy as np
import spacy
import re
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch
from rapidfuzz import fuzz
from collections import Counter

# Load the transformer-based English spaCy pipeline once; the tokenisation,
# entity and relationship functions below all call this module-level `nlp`.
nlp = spacy.load("en_core_web_trf")

  model.load_state_dict(torch.load(filelike, map_location=device))


# Load Data

In [3]:
import pandas as pd

# Read the parsed news excerpts and keep only the first 20 rows (positional slice).
data = pd.read_excel("Documents/news_excerpts_parsed.xlsx").iloc[:20]

In [8]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,Link,Text,tokens,relationships
0,https://edition.cnn.com/2023/09/29/business/st...,Starbucks violated federal labor law when it i...,"[starbucks, violate, federal, labor, law, incr...",[]
1,https://www.channelnewsasia.com/singapore/su-w...,The first suspect to plead guilty in Singapore...,"[suspect, plead, guilty, singapores, large, mo...",[]
2,https://edition.cnn.com/2023/05/22/tech/meta-f...,Meta has been fined a record-breaking €1.2 bil...,"[meta, fine, recordbreaking, billion, billion,...","[{'source': 'chapter', 'relation': 'set', 'tar..."
3,https://www.channelnewsasia.com/singapore/bill...,SINGAPORE: A 45-year-old man linked to Singapo...,"[singapore, yearold, man, link, singapores, la...",[]
4,https://edition.cnn.com/2024/03/05/politics/li...,The Department of Education imposed a record $...,"[department, education, impose, record, millio...",[]


# Clean Data

In [41]:
# Keep only rows that have a 'Text' value, then keep the first occurrence of
# each distinct text so duplicated rows are removed.
data = data.dropna(subset=['Text']).drop_duplicates(subset=['Text'])

# Testing section (extracting non-grammatical relationships) -START-

In [47]:
def advanced_tokenization(text):
    """Return the lemmas of the content words found in ``text``.

    The text is lower-cased and stripped, whitespace runs are collapsed to a
    single space, and every character that is not an ASCII letter or
    whitespace is deleted before the result is parsed with ``nlp``. A lemma is
    kept only when its token is alphabetic, is neither a stop word nor
    punctuation, and the lemma is longer than two characters.
    """
    normalized = re.sub(r'\s+', ' ', text.lower().strip())
    letters_only = re.sub(r'[^a-zA-Z\s]', '', normalized)

    kept_lemmas = []
    for tok in nlp(letters_only):
        # Skip stop words, punctuation and anything that is not purely alphabetic.
        if tok.is_stop or tok.is_punct or not tok.is_alpha:
            continue
        if len(tok.lemma_) > 2:
            kept_lemmas.append(tok.lemma_)

    return kept_lemmas

def extract_advanced_entities(text):
    """Build a set of entity strings (surface forms plus lemmas) for matching.

    Entities labelled ORG, GPE, PERSON, LAW, NORP, FAC, MONEY or EVENT are
    collected from ``nlp(text)``. The returned set contains each entity's
    stripped text together with every lemma that ``advanced_tokenization``
    produces for that text.
    """
    wanted_labels = ['ORG', 'GPE', 'PERSON', 'LAW', 'NORP', 'FAC', 'MONEY', 'EVENT']

    # Distinct entity texts, in order of first appearance.
    surface_forms = []
    for span in nlp(text).ents:
        if span.label_ not in wanted_labels:
            continue
        name = span.text.strip()
        if name not in surface_forms:
            surface_forms.append(name)

    entity_tokens = set(surface_forms)
    for name in surface_forms:
        entity_tokens.update(advanced_tokenization(name))

    return entity_tokens

def extract_advanced_relationships(text):
    """Extract de-duplicated relationship dicts from ``text``.

    The text is parsed with ``nlp`` and every token of every sentence is
    checked against four dependency-based patterns:

    * ``SVO`` - an ``nsubj`` token whose head is a verb that also has a
      ``dobj``/``attr``/``pobj`` child; the relation is the verb lemma.
    * ``Attribution`` - a ``ccomp`` token whose head carries an ORG/PERSON/LAW
      entity type and whose head text is a known entity (relation ``'ruled'``).
    * ``Causal`` - an ``advcl``/``ccomp`` token with a verb head where either
      the head text or the token lemma is a known entity (relation ``'caused'``).
    * ``Possession`` - a ``poss``/``nmod``/``prep`` token with a ``pobj`` child
      where both the head text and that child are known entities
      (relation ``'owns'``).

    "Known entities" are the strings returned by
    ``extract_advanced_entities(text)``.

    Returns
    -------
    list of dict
        Dicts with keys ``'source'``, ``'relation'``, ``'target'`` and
        ``'type'``; only the first dict for each
        ``(source, relation, target)`` triple is kept.
    """
    doc = nlp(text)
    relationships = []
    entity_set = extract_advanced_entities(text)

    for sent in doc.sents:
        for token in sent:
            token_lemma = token.lemma_

            # **Core Subject-Verb-Object (SVO) Relationships**
            if token.dep_ == 'nsubj' and token.head.pos_ == 'VERB':
                obj = [child for child in token.head.children if child.dep_ in ['dobj', 'attr', 'pobj']]
                if obj:
                    # Keep the surface form for known entities, otherwise fall back to the lemma.
                    source = token.text if token.text in entity_set else token_lemma
                    target = obj[0].text if obj[0].text in entity_set else obj[0].lemma_

                    relationships.append({
                        'source': source,
                        'relation': token.head.lemma_,
                        'target': target,
                        'type': 'SVO'
                    })

            # **Attribution (Authority → Action)**
            if token.dep_ == 'ccomp' and token.head.ent_type_ in ['ORG', 'PERSON', 'LAW']:
                if token.head.text in entity_set:
                    relationships.append({
                        'source': token.head.text,
                        'relation': 'ruled',
                        'target': token_lemma,
                        'type': 'Attribution'
                    })

            # **Causal Relationships (Why something happened)**
            if token.dep_ in ['advcl', 'ccomp'] and token.head.pos_ == 'VERB':
                if token.head.text in entity_set or token_lemma in entity_set:
                    relationships.append({
                        'source': token.head.text,
                        'relation': 'caused',
                        'target': token_lemma,
                        'type': 'Causal'
                    })

            # **Possession & Ownership**
            if token.dep_ in ['poss', 'nmod', 'prep']:
                poss_obj = [child for child in token.children if child.dep_ == 'pobj']
                if poss_obj and token.head.text in entity_set and poss_obj[0].text in entity_set:
                    relationships.append({
                        'source': token.head.text,
                        'relation': 'owns',
                        'target': poss_obj[0].text,
                        'type': 'Possession'
                    })

    # **Fallback Strategy: If No Relationships Found, Infer from Context**
    if not relationships:
        # `infer_relationships_from_context` is not defined in the visible part
        # of this notebook, so calling it unconditionally raises NameError on a
        # fresh kernel. Use it only when it has actually been defined.
        fallback = globals().get('infer_relationships_from_context')
        if callable(fallback):
            inferred_relationships = fallback(doc, entity_set)
            if inferred_relationships:
                relationships.extend(inferred_relationships)

    # Remove duplicates, keeping the first occurrence of each triple
    unique_relationships = []
    seen = set()
    for rel in relationships:
        key = (rel['source'], rel['relation'], rel['target'])
        if key not in seen:
            unique_relationships.append(rel)
            seen.add(key)

    return unique_relationships

# Run both extractors over every article text; each call parses the text with
# `nlp` again, so this cell is the expensive step of the testing section.
data['entities'] = data['Text'].apply(extract_advanced_entities)
data['relationships'] = data['Text'].apply(extract_advanced_relationships)


In [48]:
# Spot-check the relationships extracted for the row with index label 1.
print(data['relationships'][1])

[]


In [49]:
# Preview the frame with the new 'entities' and 'relationships' columns.
data.head()

Unnamed: 0,Link,Text,tokens,relationships,entities
0,https://edition.cnn.com/2023/09/29/business/st...,Starbucks violated federal labor law when it i...,"[starbucks, violate, federal, labor, law, incr...","[{'source': 'Starbucks', 'relation': 'violate'...","{labor, Mara-Louise Anzalone, NLRB, relations,..."
1,https://www.channelnewsasia.com/singapore/su-w...,The first suspect to plead guilty in Singapore...,"[suspect, plead, guilty, singapores, large, mo...",[],"{US$2.2 billion, road, Singapore, Su, cambodia..."
2,https://edition.cnn.com/2023/05/22/tech/meta-f...,Meta has been fined a record-breaking €1.2 bil...,"[meta, fine, recordbreaking, billion, billion,...","[{'source': 'Board', 'relation': 'announce', '...","{The European Data Protection Board, states, t..."
3,https://www.channelnewsasia.com/singapore/bill...,SINGAPORE: A 45-year-old man linked to Singapo...,"[singapore, yearold, man, link, singapores, la...",[],"{Singapore, around S$118 million, ruijin, Zhan..."
4,https://edition.cnn.com/2024/03/05/politics/li...,The Department of Education imposed a record $...,"[department, education, impose, record, millio...","[{'source': 'Department', 'relation': 'impose'...","{liberty, Liberty University, an additional $2..."


# Testing section -END-

# Tokenisation

In [None]:
# Tokenize the 'Text' column into raw spaCy token texts (no filtering or
# lemmatisation) and store them in 'tokens_1'.
data['tokens_1'] = data['Text'].apply(lambda x: [token.text for token in nlp(x)])

In [None]:
# Spot-check the raw tokens of the row with index label 3.
print(data['tokens_1'][3])

In [None]:
def advanced_tokenization(text):
    """Tokenize ``text`` into filtered lemmas.

    NOTE: this repeats the definition given earlier in the notebook's
    "Testing section"; running this cell rebinds the name.

    Steps: lower-case and strip the text, collapse whitespace runs, delete
    every character that is not an ASCII letter or whitespace, parse the
    result with ``nlp``, and keep the lemma of each token that is alphabetic,
    is not a stop word or punctuation, and has a lemma longer than two
    characters.
    """
    # Preprocessing
    cleaned = text.lower().strip()
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = re.sub(r'[^a-zA-Z\s]', '', cleaned)

    def is_content_token(tok):
        """True when the token's lemma should be kept."""
        if tok.is_stop or tok.is_punct:
            return False
        return tok.is_alpha and len(tok.lemma_) > 2

    # SpaCy tokenization + enhanced token filtering
    return [tok.lemma_ for tok in nlp(cleaned) if is_content_token(tok)]

# Store the filtered lemmas of each article in 'tokens' (replacing any
# existing column of that name).
data['tokens'] = data['Text'].apply(advanced_tokenization)

In [None]:
# Spot-check the filtered lemmas of the row with index label 3.
print(data['tokens'][3])

# Extract Entities

Method 1

In [None]:
def extract_entities_spacy(text):
    """Return every entity spaCy finds in ``text`` as a ``(text, label)`` tuple.

    Leading and trailing whitespace is stripped from the entity text; no
    filtering by label happens here.
    """
    pairs = []
    for span in nlp(text).ents:
        pairs.append((span.text.strip(), span.label_))
    return pairs

# Apply to the dataset: 'entities1' holds one list of (text, label) tuples per article.
data['entities1'] = data['Text'].apply(extract_entities_spacy)

# Keep only the tuples whose label is ORG, GPE or PERSON.
data['filtered_entities'] = data['entities1'].apply(
    lambda entities: [ent for ent in entities if ent[1] in ['ORG', 'GPE', 'PERSON']]
)

In [None]:
# Spot-check the filtered entities of the row with index label 4.
print(data['filtered_entities'][4])

In [None]:
import re
from rapidfuzz import process, fuzz

def advanced_entity_deduplication(entities):
    """
    Advanced entity deduplication with smart merging strategies.

    Names are grouped greedily: the first remaining name absorbs every later
    name whose normalised form is more than 90% similar (``fuzz.ratio``) or
    that contains / is contained in it. One representative is kept per group.

    Parameters
    ----------
    entities : list of str, or list of tuple
        Entity names, or tuples whose first item is the entity name (e.g.
        ``(text, label)``). The list is never modified.

    Returns
    -------
    list of str
        One representative name per group, in order of first appearance.
    """
    def normalize_entity(entity):
        # Comprehensive normalization: case, quotes and the article 'the'.
        normalized = entity.lower()
        normalized = re.sub(r'[\'"]', '', normalized)  # Remove quotes
        normalized = re.sub(r'\bthe\b', '', normalized)  # Remove 'the'
        # Collapse whitespace last so that removing 'the' leaves no stray spaces.
        return re.sub(r'\s+', ' ', normalized).strip()

    def choose_best_entity(candidates):
        # Preference rules, tried in order; the longest candidate that
        # satisfies the first applicable rule wins.
        preferences = [
            # Prefer comprehensive names. Compared case-insensitively because
            # entity names are normally capitalised ("Board", "Relations").
            lambda x: any(word in x.lower() for word in ('global', 'board', 'relations')),
            lambda x: len(x.split()) > 1,  # Prefer longer (multi-word) names
            lambda x: not x.isdigit(),  # Prefer non-numeric
        ]

        for pref in preferences:
            matches = [e for e in candidates if pref(e)]
            if matches:
                return max(matches, key=len)
        # Fallback: no rule applied, keep the longest candidate.
        return max(candidates, key=len)

    def is_same_entity(norm_a, norm_b):
        if fuzz.ratio(norm_a, norm_b) > 90:
            return True
        # An empty string is a substring of everything; without this guard a
        # name that normalises to '' (e.g. "The") would swallow every entity.
        if not norm_a or not norm_b:
            return False
        return norm_a in norm_b or norm_b in norm_a

    if not entities:
        return []

    # Extract entity names from tuples if needed; otherwise copy the list so
    # the caller's list is left untouched.
    if isinstance(entities[0], tuple):
        names = [ent[0] for ent in entities]
    else:
        names = list(entities)

    # Normalise each name once instead of once per comparison.
    remaining = [(name, normalize_entity(name)) for name in names]

    # Advanced fuzzy matching and grouping
    deduplicated = []
    while remaining:
        current, norm_current = remaining.pop(0)

        group = [current]
        unmatched = []
        for name, norm in remaining:
            if is_same_entity(norm, norm_current):
                group.append(name)
            else:
                unmatched.append((name, norm))
        remaining = unmatched

        deduplicated.append(choose_best_entity(group))

    # dict.fromkeys removes repeats while keeping a deterministic order
    # (set iteration order for strings can change between runs).
    return list(dict.fromkeys(deduplicated))

# Apply to your dataframe: each list of (text, label) tuples in
# 'filtered_entities' becomes a list of representative entity names.
data['deduplicated_entities'] = data['filtered_entities'].apply(advanced_entity_deduplication)

In [None]:
# Spot-check the de-duplicated entity names of the row with index label 4.
print(data['deduplicated_entities'][4])

Method 2

In [None]:
def extract_advanced_entities(text):
    """Extract unique ORG/GPE/PERSON entities from ``text`` as dicts.

    NOTE: this rebinds the name used by the earlier "Testing section"
    function, which returned a set of strings instead.

    Returns
    -------
    list of dict
        Dicts with keys ``'text'`` (stripped entity text), ``'label'`` and
        ``'confidence'``, ordered by descending confidence. Entities whose
        text is equal ignoring case and whitespace runs are reported once,
        keeping the most confident occurrence.
    """
    default_confidence = 0.8  # used when no confidence score is available

    doc = nlp(text)

    # Extract entities with confidence scoring
    entities = []
    for ent in doc.ents:
        if ent.label_ not in ['ORG', 'GPE', 'PERSON']:  # See whether to include FAC, etc
            continue

        # `hasattr(ent, '_.confidence')` is always False (hasattr does not
        # follow dotted paths), so ask the extension namespace directly.
        score = ent._.confidence if ent._.has('confidence') else None
        entities.append({
            'text': ent.text.strip(),
            'label': ent.label_,
            'confidence': round(score, 2) if score is not None else default_confidence
        })

    # Remove duplicates while preserving most confident entries
    unique_entities = []
    seen = set()
    for entity in sorted(entities, key=lambda x: x['confidence'], reverse=True):
        normalized = re.sub(r'\s+', ' ', entity['text'].lower())
        if normalized not in seen:
            unique_entities.append(entity)
            seen.add(normalized)

    return unique_entities

# Store one list of entity dicts per article in 'entities' (replacing any
# existing column of that name).
data['entities'] = data['Text'].apply(extract_advanced_entities)

In [None]:
# Spot-check the entity dicts of the row with index label 4.
print(data['entities'][4])

# Extract Relationships

With spaCy

In [None]:
def extract_relationships(text):
    """List head/dependent pairs from ``text`` that touch a named entity.

    A token contributes a dict when its dependency label is one of the tracked
    labels and either its own text or its head's text equals the text of some
    entity in the document (all entity types). Each dict holds ``'source'``
    (head text), ``'target'`` (token text), ``'relation'`` (dependency label)
    and ``'sentence'`` (text of the containing sentence).
    """
    doc = nlp(text)

    # Extract all entities, not just limiting to specific types
    entity_texts = [span.text for span in doc.ents]
    # Broader dependency relations
    tracked_deps = ['nsubj', 'dobj', 'pobj', 'attr', 'agent', 'nmod', 'conj', 'ROOT']

    found = []
    for sent in doc.sents:
        for tok in sent:
            if tok.dep_ not in tracked_deps:
                continue
            head_text = tok.head.text
            tok_text = tok.text
            # Skip pairs where neither side is an entity text.
            if head_text not in entity_texts and tok_text not in entity_texts:
                continue
            found.append({
                'source': head_text,
                'target': tok_text,
                'relation': tok.dep_,
                'sentence': sent.text
            })

    return found

# Apply relationship extraction to every article and keep the result in
# 'relationships1'.
data['relationships1'] = data['Text'].apply(extract_relationships)

# Print relationships to debug (the seventh row by position)
print(data['relationships1'].iloc[6])

In [None]:
def extract_advanced_relationships(text):
    """Return one dict per token of ``text`` that has a tracked dependency label.

    NOTE: this rebinds the name used by the earlier "Testing section"
    function; the dicts produced here use a different schema.

    Each dict holds ``'source'`` (head text), ``'target'`` (token text),
    ``'relation'`` (dependency label), ``'pos_source'`` and ``'pos_target'``
    (part-of-speech tags of head and token). No entity filtering is applied.
    """
    # Broader semantic relationship extraction
    tracked_deps = ['nsubj', 'dobj', 'attr', 'agent', 'nmod', 'conj', 'ROOT']
    doc = nlp(text)

    return [
        {
            'source': token.head.text,
            'target': token.text,
            'relation': token.dep_,
            'pos_source': token.head.pos_,
            'pos_target': token.pos_
        }
        for sent in doc.sents
        for token in sent
        if token.dep_ in tracked_deps
    ]

# Store the dependency-based relationship dicts in 'relationships' (replacing
# any existing column of that name).
data['relationships'] = data['Text'].apply(extract_advanced_relationships)

In [None]:
# Spot-check the relationships of the row with index label 0.
print(data['relationships'][0])

# Validate Data

In [None]:
def validate_data_quality(data):
    """Summarise how many documents yielded entities and relationships.

    Parameters
    ----------
    data : sequence of mapping
        One record per document (e.g. ``DataFrame.to_dict('records')``). Each
        record must provide sized ``'entities'`` and ``'relationships'`` values.

    Returns
    -------
    dict
        ``total_documents``, ``documents_with_entities``,
        ``documents_with_relationships``, ``avg_entities_per_doc`` and
        ``avg_relationships_per_doc``. Both averages are ``0.0`` when ``data``
        is empty.
    """
    # Measure each record once and reuse the counts for all statistics.
    entity_counts = [len(row['entities']) for row in data]
    relationship_counts = [len(row['relationships']) for row in data]

    validation_report = {
        'total_documents': len(data),
        'documents_with_entities': sum(count > 0 for count in entity_counts),
        'documents_with_relationships': sum(count > 0 for count in relationship_counts),
        # np.mean([]) is nan and emits a RuntimeWarning, so report 0.0 instead.
        'avg_entities_per_doc': np.mean(entity_counts) if entity_counts else 0.0,
        'avg_relationships_per_doc': np.mean(relationship_counts) if relationship_counts else 0.0
    }

    return validation_report


# Convert the frame to a list of per-row dicts, which is what
# `validate_data_quality` iterates over, then print each statistic.
validation_results = validate_data_quality(data.to_dict('records'))
print("Data Validation Report:")
for key, value in validation_results.items():
    print(f"{key}: {value}")

# Plot Entity Network

In [None]:
def plot_entity_network(relationships, seed=42):
    """Draw a directed graph of the given relationships.

    Parameters
    ----------
    relationships : iterable of dict
        Each dict must have ``'source'``, ``'target'`` and ``'relation'``
        keys; the relation is stored on the edge as the ``type`` attribute.
    seed : int or None, optional
        Seed passed to ``nx.spring_layout`` so the node positions are the same
        on every run. Pass ``None`` for a random layout.
    """
    G = nx.DiGraph()

    # Add edges based on relationships
    for rel in relationships:
        G.add_edge(rel['source'], rel['target'], type=rel['relation'])

    plt.figure(figsize=(12, 8))
    # The spring layout starts from random positions; seeding makes the figure reproducible.
    pos = nx.spring_layout(G, k=0.5, iterations=50, seed=seed)
    nx.draw_networkx_nodes(G, pos, node_color='lightblue', node_size=500, alpha=0.8)
    nx.draw_networkx_edges(G, pos, edge_color='gray', alpha=0.5)
    nx.draw_networkx_labels(G, pos, font_size=10)
    plt.title("Entity Relationship Network")
    plt.axis('off')
    plt.tight_layout()
    plt.show()

In [None]:
# Plot the relationship graph of the row with index label 5.
plot_entity_network(data['relationships'][5]) 

In [None]:
def plot_entity_distribution(entities):
    """Show a bar chart of how many entities carry each label.

    ``entities`` is an iterable of dicts that each have a ``'label'`` key.
    """
    # Count entities by type
    label_counts = Counter([item['label'] for item in entities])
    labels = list(label_counts.keys())
    counts = list(label_counts.values())

    plt.figure(figsize=(10, 6))
    plt.bar(labels, counts)
    plt.title('Entity Type Distribution')
    plt.xlabel('Entity Type')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Plot the entity-label distribution of the row with index label 0.
plot_entity_distribution(data['entities'][0])

# Insights

In [None]:
def extract_key_insights(data):
    """Count the most frequent entity texts and relation labels.

    ``data['entities']`` must iterate over per-document collections of dicts
    with a ``'text'`` key, and ``data['relationships']`` over per-document
    collections of dicts with a ``'relation'`` key.

    Returns a dict with ``'top_entities'`` (the 10 most common entity texts)
    and ``'most_common_relationships'`` (the 5 most common relation labels),
    each as a list of ``(value, count)`` pairs.
    """
    entity_counts = Counter()
    for doc_entities in data['entities']:
        entity_counts.update(entity['text'] for entity in doc_entities)

    relation_counts = Counter()
    for doc_relationships in data['relationships']:
        relation_counts.update(rel['relation'] for rel in doc_relationships)

    return {
        'top_entities': entity_counts.most_common(10),
        'most_common_relationships': relation_counts.most_common(5)
    }

# Summarise the whole frame: uses the current 'entities' and 'relationships'
# columns, so it expects entity dicts with a 'text' key.
key_insights = extract_key_insights(data)
print("\nKey Insights:")
print("Top Entities:", key_insights['top_entities'])
print("Most Common Relationship Types:", key_insights['most_common_relationships'])