In [9]:
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_watson.natural_language_understanding_v1 import Features, KeywordsOptions
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from google.cloud import language_v1
import boto3
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment, then read
# each provider's settings from there. A setting that is not defined is None.
load_dotenv()

# IBM Watson Natural Language Understanding: API key and service URL.
IBM_API_KEY = os.environ.get("IBM_API_KEY")
IBM_URL = os.environ.get("IBM_URL")

# Google Cloud Natural Language: path of the service-account JSON file.
GOOGLE_API_CREDENTIALS_PATH = os.environ.get("GOOGLE_API_CREDENTIALS_PATH")

# AWS Comprehend: access key pair and region.
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_KEY")
AWS_REGION = os.environ.get("AWS_REGION")

In [1]:
import pandas as pd
# Read the tweet dataset and keep the first 50 entries of its 'content' column
# as a plain Python list.
# NOTE(review): the displayed output contains sequences such as 'â\x80\x99',
# which suggests the file is UTF-8 being decoded as latin1 — confirm the
# encoding before relying on non-ASCII characters in the text.
data = pd.read_csv("tweets.csv", encoding='latin1')
tweets = data['content'].head(50).tolist()
tweets

['Police: Airline pilot found passed out in cockpit was drunk https://t.co/gCqwIOESWJ https://t.co/2agimSVzmK',
 'Dashcam video shows man launching himself onto police cruiser https://t.co/2ZGpBAu4ey https://t.co/B2fKNFpY5V',
 'Man Arrested For Setting Fire To South SF Medical Clinic New Yearâ\x80\x99s Day https://t.co/MaQc2gZVhv https://t.co/wgxyh0iFTO',
 "Practical tips to achieving your New Year's resolutions in 2017 https://t.co/LdtEP7Z3JO https://t.co/r5Qvfu06ex",
 'Couple disappears during trip to Big Sur https://t.co/81xYH8ULy2 https://t.co/QbF2bOQnOR',
 'Vacaville Police Arrest Man Suspected Of Welfare Fraud, Running Drug House https://t.co/UJuyVi5SCZ https://t.co/uCNkrKDUVD',
 'VIDEO: Female Lyft driver violently attacked over parking spot in Santa Ana https://t.co/5oebOWZUAO https://t.co/gM3RsW4vkA',
 'Sex offender arrested for allegedly exposing himself to 2 women in SSF https://t.co/7w0u1siMv8 https://t.co/2TnkMEYqUb',
 '#7Things to know before you go from ABC7 News: Monday

In [2]:
import spacy

nlp = spacy.load("en_core_web_sm")

# Reference entities for every tweet, in tweet order: all ORG mentions found by
# the spaCy pipeline first, followed by all GPE mentions.
ground_truth = []
for tweet in tweets:
    entities = nlp(tweet).ents
    texts_by_label = {
        label: [ent.text for ent in entities if ent.label_ == label]
        for label in ("ORG", "GPE")
    }
    ground_truth.append(texts_by_label["ORG"] + texts_by_label["GPE"])



In [6]:
# IBM Watson NLU Keyword Extraction
_IBM_NLU_CLIENT = None  # built on first use, then shared by every call


def _get_ibm_nlu_client():
    """Return the shared Watson NLU service object, creating it on first use."""
    global _IBM_NLU_CLIENT
    if _IBM_NLU_CLIENT is None:
        authenticator = IAMAuthenticator(IBM_API_KEY)
        nlu = NaturalLanguageUnderstandingV1(
            version="2021-08-01",
            authenticator=authenticator
        )
        nlu.set_service_url(IBM_URL)
        _IBM_NLU_CLIENT = nlu
    return _IBM_NLU_CLIENT


def analyze_keywords_ibm(tweet, limit=5):
    """Extract keywords from one tweet with IBM Watson NLU.

    The authenticator and service object are created once and reused, instead
    of being rebuilt for every tweet. Re-run this cell after changing
    IBM_API_KEY or IBM_URL so that the shared client is rebuilt.

    Args:
        tweet: Text to analyse.
        limit: Maximum number of keywords requested from the service
            (default 5).

    Returns:
        List of keyword strings taken from the 'keywords' entries of the
        response. Blank text yields an empty list without calling the service.
    """
    if not tweet.strip():
        return []

    response = _get_ibm_nlu_client().analyze(
        text=tweet,
        features=Features(keywords=KeywordsOptions(limit=limit))
    ).get_result()

    return [kw['text'] for kw in response['keywords']]

# Google Cloud NLP Keyword Extraction
_GOOGLE_LANGUAGE_CLIENT = None  # built on first use, then shared by every call


def _get_google_language_client():
    """Return the shared Google language client, creating it on first use."""
    global _GOOGLE_LANGUAGE_CLIENT
    if _GOOGLE_LANGUAGE_CLIENT is None:
        _GOOGLE_LANGUAGE_CLIENT = language_v1.LanguageServiceClient.from_service_account_json(
            GOOGLE_API_CREDENTIALS_PATH
        )
    return _GOOGLE_LANGUAGE_CLIENT


def analyze_keywords_google(tweet):
    """Extract entity names from one tweet with Google Cloud Natural Language.

    The client is created from the service-account file once and reused,
    instead of being rebuilt for every tweet. Re-run this cell after changing
    GOOGLE_API_CREDENTIALS_PATH so that the shared client is rebuilt.

    Args:
        tweet: Text to analyse, sent as a plain-text document.

    Returns:
        List with the name of every entity in the response. Blank text yields
        an empty list without calling the service.
    """
    if not tweet.strip():
        return []

    document = language_v1.Document(content=tweet, type_=language_v1.Document.Type.PLAIN_TEXT)
    response = _get_google_language_client().analyze_entities(document=document)

    return [entity.name for entity in response.entities]

# AWS Comprehend Keyword Extraction (via entities)
_COMPREHEND_CLIENT = None  # built on first use, then shared by every call


def _get_comprehend_client():
    """Return the shared Comprehend client, creating it on first use."""
    global _COMPREHEND_CLIENT
    if _COMPREHEND_CLIENT is None:
        _COMPREHEND_CLIENT = boto3.client(
            "comprehend",
            aws_access_key_id=AWS_ACCESS_KEY,
            aws_secret_access_key=AWS_SECRET_KEY,
            region_name=AWS_REGION
        )
    return _COMPREHEND_CLIENT


def analyze_keywords_aws(tweet):
    """Extract entity texts from one tweet with AWS Comprehend.

    The boto3 client is created once and reused, instead of being rebuilt for
    every tweet. Re-run this cell after changing AWS_ACCESS_KEY, AWS_SECRET_KEY
    or AWS_REGION so that the shared client is rebuilt.

    Args:
        tweet: Text to analyse; it is submitted with LanguageCode "en".

    Returns:
        List with the 'Text' of every item under 'Entities' in the response.
        Blank text yields an empty list without calling the service.
    """
    if not tweet.strip():
        return []

    response = _get_comprehend_client().detect_entities(
        Text=tweet,
        LanguageCode="en"
    )

    return [entity['Text'] for entity in response['Entities']]

In [7]:
# Evaluate Results
def evaluate_keywords(extracted_keywords, ground_truth):
    """Score extracted keywords against reference entities, as a percentage.

    Matching is case-insensitive and exact: a reference entity counts as found
    when the same lowercased string appears among the keywords extracted for
    the same tweet. The score is the number of reference entities found
    divided by the number of distinct reference entities, i.e. it measures
    how much of the reference was recovered and does not penalise extra
    keywords.

    Args:
        extracted_keywords: One list of keyword strings per tweet.
        ground_truth: One list of reference entity strings per tweet, in the
            same order as `extracted_keywords`.

    Returns:
        Percentage in the range 0-100 as a float; 0.0 when the ground truth
        contains no entities at all.
    """
    # De-duplicate the reference per tweet the same way matches are counted,
    # so a repeated entity cannot inflate the denominator and 100% stays
    # reachable.
    truth_sets = [{kw.lower() for kw in truth} for truth in ground_truth]
    total = sum(len(truth_set) for truth_set in truth_sets)
    if total == 0:
        # Nothing to find: report 0 rather than dividing by zero.
        return 0.0

    # Tweets without a matching extraction (shorter `extracted_keywords`)
    # contribute no matches but still count in `total`.
    correct = sum(
        len({kw.lower() for kw in extracted} & truth_set)
        for extracted, truth_set in zip(extracted_keywords, truth_sets)
    )

    return (correct / total) * 100

In [10]:
# Send every tweet to IBM Watson NLU, then score the keyword lists against the reference.
ibm_results = list(map(analyze_keywords_ibm, tweets))
ibm_accuracy = evaluate_keywords(ibm_results, ground_truth)

In [11]:
# Send every tweet to Google Cloud NLP, then score the entity names against the reference.
google_results = list(map(analyze_keywords_google, tweets))
google_accuracy = evaluate_keywords(google_results, ground_truth)

In [12]:
# Send every tweet to AWS Comprehend, then score the entity texts against the reference.
aws_results = list(map(analyze_keywords_aws, tweets))
aws_accuracy = evaluate_keywords(aws_results, ground_truth)

In [13]:
# Report each provider's score, one per line, with two decimals.
print(
    f"IBM Watson NLU Accuracy: {ibm_accuracy:.2f}%",
    f"Google Cloud NLP Accuracy: {google_accuracy:.2f}%",
    f"AWS Comprehend Accuracy: {aws_accuracy:.2f}%",
    sep="\n",
)

IBM Watson NLU Accuracy: 50.00%
Google Cloud NLP Accuracy: 59.52%
AWS Comprehend Accuracy: 73.81%
