In [33]:
import pandas as pd
# Load the articles dataset (decoded as Latin-1).
data = pd.read_csv("Articles.csv", encoding='latin1')
# Take the 'Article' column as a plain list and keep only its first 50 entries.
news_articles = data['Article'].tolist()[:50]

In [34]:
import functools
import json
import os
import warnings

import boto3
from dotenv import load_dotenv
from google.cloud import language_v1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_watson.natural_language_understanding_v1 import Features, EntitiesOptions

# Load variables from a .env file into the process environment before reading them.
load_dotenv()

# IBM Watson NLU credentials (each value is None when the variable is unset).
IBM_API_KEY = os.environ.get("IBM_API_KEY")
IBM_URL = os.environ.get("IBM_URL")

# Google Cloud: path to the service-account JSON file used to build the client.
GOOGLE_API_CREDENTIALS_PATH = os.environ.get("GOOGLE_API_CREDENTIALS_PATH")

# AWS Comprehend credentials and region.
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_KEY")
AWS_REGION = os.environ.get("AWS_REGION")

In [35]:
# IBM Watson NLU Entity Extraction
@functools.lru_cache(maxsize=None)
def _get_ibm_client():
    """Build the Watson NLU client once and hand back the same instance afterwards.

    Creating the authenticator and service object is loop-invariant work, so it
    is cached instead of being repeated for every article. Call
    ``_get_ibm_client.cache_clear()`` after changing ``IBM_API_KEY``/``IBM_URL``.
    """
    authenticator = IAMAuthenticator(IBM_API_KEY)
    nlu = NaturalLanguageUnderstandingV1(
        version='2021-08-01',
        authenticator=authenticator
    )
    nlu.set_service_url(IBM_URL)
    return nlu


def analyze_entities_ibm(article, client=None):
    """Extract named entities from ``article`` with IBM Watson NLU.

    Args:
        article: Text to analyse.
        client: Optional pre-built client exposing
            ``analyze(text=..., features=...)``. When omitted, a shared client
            built from ``IBM_API_KEY`` and ``IBM_URL`` is used.

    Returns:
        A list of ``(text, type)`` tuples, one per entity in the response;
        empty when the response carries no ``'entities'`` key.
    """
    nlu = _get_ibm_client() if client is None else client

    # Request entity extraction only.
    response = nlu.analyze(
        text=article,
        features=Features(entities=EntitiesOptions())
    ).get_result()

    entities = response.get('entities', [])
    return [(entity['text'], entity['type']) for entity in entities]


# Google Cloud NLP Entity Extraction
@functools.lru_cache(maxsize=None)
def _get_google_client():
    """Build the Google Cloud Natural Language client once and reuse it.

    Loading the service-account file and constructing the client is
    loop-invariant, so it is cached rather than repeated per article. Call
    ``_get_google_client.cache_clear()`` after changing
    ``GOOGLE_API_CREDENTIALS_PATH``.
    """
    return language_v1.LanguageServiceClient.from_service_account_json(GOOGLE_API_CREDENTIALS_PATH)


def analyze_entities_google(article, client=None):
    """Extract named entities from ``article`` with Google Cloud Natural Language.

    Args:
        article: Text to analyse; sent as a plain-text document.
        client: Optional pre-built client exposing
            ``analyze_entities(document=...)``. When omitted, a shared client
            built from ``GOOGLE_API_CREDENTIALS_PATH`` is used.

    Returns:
        A list of ``(name, type_name)`` tuples, one per entity in the response,
        where ``type_name`` is the name of the ``language_v1.Entity.Type`` member.
    """
    if client is None:
        client = _get_google_client()

    document = language_v1.Document(content=article, type_=language_v1.Document.Type.PLAIN_TEXT)
    response = client.analyze_entities(document=document)

    # Convert the entity type value to its enum member name.
    return [(entity.name, language_v1.Entity.Type(entity.type_).name) for entity in response.entities]

# AWS Comprehend Entity Extraction
@functools.lru_cache(maxsize=None)
def _get_aws_client():
    """Build the AWS Comprehend client once and reuse it.

    Constructing the boto3 client is loop-invariant, so it is cached rather
    than repeated per article. Call ``_get_aws_client.cache_clear()`` after
    changing ``AWS_ACCESS_KEY``, ``AWS_SECRET_KEY`` or ``AWS_REGION``.
    """
    return boto3.client(
        'comprehend',
        aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_KEY,
        region_name=AWS_REGION
    )


def analyze_entities_aws(article, client=None):
    """Extract named entities from ``article`` with AWS Comprehend.

    Args:
        article: Text to analyse; submitted with ``LanguageCode='en'``.
        client: Optional pre-built client exposing
            ``detect_entities(Text=..., LanguageCode=...)``. When omitted, a
            shared client built from the ``AWS_*`` settings is used.

    Returns:
        A list of ``(Text, Type)`` tuples, one per item in the response's
        ``'Entities'`` list.
    """
    if client is None:
        client = _get_aws_client()

    response = client.detect_entities(
        Text=article,
        LanguageCode='en'
    )

    return [(entity['Text'], entity['Type']) for entity in response['Entities']]

In [40]:
def normalize_text(text):
    """Return ``text`` lower-cased with surrounding whitespace removed.

    Values that are not ``str`` are handed back unchanged.
    """
    if not isinstance(text, str):
        return text
    lowered = text.lower()
    return lowered.strip()

def evaluate_results(extracted_results, ground_truth):
    """Score extracted entities against reference entities, as a percentage.

    Although the callers label the value "accuracy", it is a recall-style
    measure: the share of distinct reference entity strings (per article,
    after ``normalize_text``) that also appear among the extracted entity
    texts. Entity types are ignored and extra extracted entities are not
    penalised.

    Args:
        extracted_results: One list per article of ``(text, type)`` tuples;
            only element 0 of each tuple is used.
        ground_truth: One list per article of reference entity strings.

    Returns:
        A float in ``[0.0, 100.0]``; ``0.0`` when there are no reference
        entities to match.

    Warns:
        RuntimeWarning: when the two inputs differ in length. Only the
            overlapping leading articles are scored in that case (for example
            when a collection loop was interrupted part-way).
    """
    extracted_results = list(extracted_results)
    ground_truth = list(ground_truth)

    # zip() would silently drop the unmatched tail, so surface the mismatch.
    if len(extracted_results) != len(ground_truth):
        warnings.warn(
            f"evaluate_results: got {len(extracted_results)} extracted result lists "
            f"but {len(ground_truth)} ground-truth lists; "
            f"scoring only the first {min(len(extracted_results), len(ground_truth))}.",
            RuntimeWarning,
            stacklevel=2,
        )

    total_correct = 0
    total_ground_truth = 0

    for extracted, truth in zip(extracted_results, ground_truth):
        # Compare on normalized entity text only (element 0 of each tuple).
        extracted_set = {normalize_text(e[0]) for e in extracted}
        truth_set = {normalize_text(t) for t in truth}

        total_correct += len(extracted_set & truth_set)
        total_ground_truth += len(truth_set)

    # Avoid division by zero; return a float so the result type is consistent.
    if total_ground_truth == 0:
        return 0.0

    return (total_correct / total_ground_truth) * 100

In [37]:
# Run every article through IBM Watson NLU, keeping results in article order.
ibm_results = []
for article in news_articles:
    ibm_results.append(analyze_entities_ibm(article))


KeyboardInterrupt: 

In [41]:
# Run every article through Google Cloud NLP, keeping results in article order.
google_results = []
for article in news_articles:
    google_results.append(analyze_entities_google(article))

KeyboardInterrupt: 

In [23]:
# Run every article through AWS Comprehend, keeping results in article order.
aws_results = []
for article in news_articles:
    aws_results.append(analyze_entities_aws(article))

In [31]:
import spacy

# spaCy pipeline used to produce the reference entities for each article.
nlp = spacy.load("en_core_web_sm")

# One list per article: the texts of all ORG entities followed by all GPE entities.
ground_truth = []
for article in news_articles:
    doc = nlp(article)
    combined_entities = [
        ent.text
        for wanted_label in ("ORG", "GPE")
        for ent in doc.ents
        if ent.label_ == wanted_label
    ]
    ground_truth.append(combined_entities)


[['KARACHI', 'Sindh', 'Geo News', 'Karachi Transport Ittehad', 'KTI', 'KTI', 'Compressed Natural Gas', 'Karachi'], ['the People´s Insurance Company (Group', 'China Ltd.', 'CSR Corp.', 'China CNR Corp.', 'CNR', 'PMI', 'the National Bureau of Statistics', 'NBS', 'NBS', 'euro', 'the European Central Bank', 'ECB', 'Brent', 'Phillip Futures', 'the US Energy Information Administration', 'CMC Markets', 'Indo Tambangraya Megah', 'Public Bank', 'Top Globe', 'Wilmar International', 'Sg$3.27', 'Capitaland', 'Sg$3.30.-- Mumbai', '27,887.90.Housing Development Finance Corp', 'Mahindra & Mahindra', 'AFP', 'HONG KONG', 'Hong Kong', 'Beijing', 'Hong Kong', 'Sydney', 'China', 'Japan', 'Taiwan', 'New Zealand', 'Philippines', 'Thailand', 'Hong Kong', 'Beijing', 'China', 'China', 'US', 'US', 'US', 'US', 'Singapore', 'Sydney', 'Jakarta', 'Singapore'], ['HONG KONG', 'Hong Kong'], ['IMF-EU', 'IG Ltd.', 'Bloomberg News', 'Nasdaq', 'the European Central Bank', 'Brent', 'the Organization of the Petroleum Export

In [42]:
# Score each service's extracted entities against the same reference lists.
ibm_accuracy = evaluate_results(extracted_results=ibm_results, ground_truth=ground_truth)
google_accuracy = evaluate_results(extracted_results=google_results, ground_truth=ground_truth)
aws_accuracy = evaluate_results(extracted_results=aws_results, ground_truth=ground_truth)

In [43]:
# Report one line per service, formatted to two decimal places.
print(
    f"IBM Watson NLU Accuracy: {ibm_accuracy:.2f}%",
    f"Google Cloud NLP Accuracy: {google_accuracy:.2f}%",
    f"AWS Comprehend Accuracy: {aws_accuracy:.2f}%",
    sep="\n",
)

IBM Watson NLU Accuracy: 43.71%
Google Cloud NLP Accuracy: 66.34%
AWS Comprehend Accuracy: 70.65%
