# Day 7 Lab 1: Amazon Comprehend - NLP for Banking

## 🎯 Learning Objectives
- Understand Amazon Comprehend capabilities
- Perform sentiment analysis on customer feedback
- Extract entities and key phrases
- Detect language and PII
- Build banking NLP applications

## 🏦 Banking Use Case
Analyze **customer feedback and reviews** to understand sentiment, extract insights, and improve banking services.

## ⏱️ Duration: 30 minutes
## 💰 Cost: ~$0.10 (Comprehend API calls)

## Setup

In [None]:
import boto3
import json
import pandas as pd
from typing import List, Dict

# Build the shared Amazon Comprehend client; the analysis helpers in the
# later cells call the service through this module-level handle.
comprehend = boto3.client(
    service_name='comprehend',
    region_name='us-east-1',
)

print("‚úÖ Amazon Comprehend client initialized")

## Part 1: Customer Feedback Dataset

In [None]:
# Sample SecureBank feedback. Each source pair is (text, channel); ids are
# assigned 1..N in listing order when the records are built.
customer_feedback = [
    {"id": number, "text": body, "channel": source}
    for number, (body, source) in enumerate(
        (
            (
                "I absolutely love the new mobile banking app! It's so easy to transfer money and check my balance. The interface is intuitive and fast.",
                "App Store Review",
            ),
            (
                "Terrible experience at the branch today. Waited 45 minutes just to deposit a check. The staff was unhelpful and rude.",
                "Survey",
            ),
            (
                "The loan application process was straightforward. I applied online and got approved within 24 hours. Great service!",
                "Email",
            ),
            (
                "I'm disappointed with the high fees on my checking account. $15/month is too much compared to other banks.",
                "Social Media",
            ),
            (
                "The customer service team helped me resolve my issue quickly. They were professional and knowledgeable.",
                "Phone Survey",
            ),
            (
                "My credit card was declined at the store even though I had available credit. Very embarrassing situation.",
                "Complaint",
            ),
        ),
        start=1,
    )
]

# Echo how many records were loaded, then a 60-character preview of each
print(f"üìä Loaded {len(customer_feedback)} customer feedback items\n")
for feedback in customer_feedback:
    print("ID {}: {}...".format(feedback['id'], feedback['text'][:60]))

## Part 2: Sentiment Analysis

In [None]:
def analyze_sentiment(text: str) -> Dict:
    """
    Run Amazon Comprehend sentiment detection on one piece of text.

    The request is always sent with LanguageCode 'en'. The returned dict
    holds the overall label under 'sentiment' plus the four class scores
    under 'positive', 'negative', 'neutral' and 'mixed'.
    """
    result = comprehend.detect_sentiment(Text=text, LanguageCode='en')
    scores = result['SentimentScore']

    summary = {'sentiment': result['Sentiment']}
    # Response score keys are capitalised; the output uses lower-case keys
    for label in ('Positive', 'Negative', 'Neutral', 'Mixed'):
        summary[label.lower()] = scores[label]
    return summary

# Score every feedback record, print a per-item report, and collect one
# summary row per item for the DataFrame built at the end.
print("üîç Analyzing Sentiment...\n")
print("="*80)

results = []
for feedback in customer_feedback:
    sentiment = analyze_sentiment(feedback['text'])

    # Per-item report: header, truncated text, label, then the four scores
    print(f"\nID {feedback['id']}: {feedback['channel']}")
    print(f"Text: {feedback['text'][:80]}...")
    print(f"\nüìä Sentiment: {sentiment['sentiment']}")
    for label in ('Positive', 'Negative', 'Neutral', 'Mixed'):
        print(f"  {label}: {sentiment[label.lower()]:.2%}")
    print("="*80)

    # Confidence is the largest of the four class scores
    summary_row = {key: feedback[key] for key in ('id', 'channel')}
    summary_row['sentiment'] = sentiment['sentiment']
    summary_row['confidence'] = max(
        sentiment[name] for name in ('positive', 'negative', 'neutral', 'mixed')
    )
    results.append(summary_row)

# Tally how many items landed in each sentiment class
df_results = pd.DataFrame(results)
print("\nüìà Sentiment Summary:")
print(df_results['sentiment'].value_counts())

## Part 3: Entity Extraction

In [None]:
def extract_entities(text: str) -> List[Dict]:
    """
    Detect entities in text with Amazon Comprehend.

    The request is always sent with LanguageCode 'en'. Each returned dict
    carries the entity's 'text', its 'type' and the service's 'score', in
    the order the response lists them.
    """
    detected = comprehend.detect_entities(Text=text, LanguageCode='en')['Entities']
    return [
        {'text': item['Text'], 'type': item['Type'], 'score': item['Score']}
        for item in detected
    ]

# Extract entities from a sample loan-application sentence, list each one,
# then print how many entities of each type were found.
from collections import Counter

# Same two-line text as before, spelled with an explicit "\n" so the
# trailing space on the first line cannot be lost to whitespace trimming.
sample_text = (
    "John Smith applied for a $50,000 personal loan at SecureBank's \n"
    "New York branch on January 15, 2024. He works at Microsoft and earns $120,000 annually."
)

print("üîç Extracting Entities...\n")
print(f"Text: {sample_text}\n")

entities = extract_entities(sample_text)

print("üìä Extracted Entities:\n")
for entity in entities:
    print(f"  {entity['type']:15} | {entity['text']:20} | Confidence: {entity['score']:.2%}")

# Entity types: count in a single pass instead of rescanning the entity
# list once per type.
print("\nüí° Entity Types Detected:")
type_counts = Counter(e['type'] for e in entities)
entity_types = set(type_counts)
# Sort the type names so the listing is identical on every run; iterating
# the bare set would follow string-hash order, which varies per process.
for etype in sorted(entity_types):
    count = type_counts[etype]
    print(f"  - {etype}: {count}")

## Part 4: Key Phrase Extraction

In [None]:
def extract_key_phrases(text: str) -> List[Dict]:
    """
    Detect key phrases in text with Amazon Comprehend.

    The request is always sent with LanguageCode 'en'. Returns one dict
    per phrase with 'text' and 'score', ordered from highest score to
    lowest.
    """
    found = comprehend.detect_key_phrases(Text=text, LanguageCode='en')['KeyPhrases']
    phrases = [{'text': item['Text'], 'score': item['Score']} for item in found]
    # Highest-scoring phrases first
    phrases.sort(key=lambda entry: entry['score'], reverse=True)
    return phrases

# Pull key phrases out of the first feedback record (the app-store review)
positive_feedback = customer_feedback[0]['text']

print("üîë Extracting Key Phrases...\n")
print(f"Text: {positive_feedback}\n")

key_phrases = extract_key_phrases(positive_feedback)

# Show at most ten phrases, numbered from 1
print("üìä Key Phrases (Top 10):\n")
for i, phrase in zip(range(1, 11), key_phrases):
    print(f"  {i}. {phrase['text']:30} | Confidence: {phrase['score']:.2%}")

print("\nüí° Use Case: Identify common themes in customer feedback")

## Part 5: PII Detection (Personally Identifiable Information)

In [None]:
def detect_pii(text: str) -> List[Dict]:
    """
    Detect PII entities in text with Amazon Comprehend.

    The request is always sent with LanguageCode 'en'. Each returned dict
    gives the PII 'type', the service's 'score', and the 'begin'/'end'
    offsets of the match, copied from the response's BeginOffset and
    EndOffset fields.
    """
    matches = comprehend.detect_pii_entities(Text=text, LanguageCode='en')['Entities']
    return [
        {
            'type': match['Type'],
            'score': match['Score'],
            'begin': match['BeginOffset'],
            'end': match['EndOffset'],
        }
        for match in matches
    ]

# Sample text seeded with several kinds of PII. Built from adjacent string
# literals with explicit "\n" so the trailing spaces on the first two lines
# are visible in the source.
pii_text = (
    "Customer John Doe (SSN: 123-45-6789) called from phone number 555-123-4567. \n"
    "His email is john.doe@email.com and he lives at 123 Main Street, New York, NY 10001. \n"
    "His credit card ending in 4532 was used for the transaction."
)

print("üîí Detecting PII...\n")
print(f"Text: {pii_text}\n")

pii_entities = detect_pii(pii_text)

print("‚ö†Ô∏è PII Detected:\n")
for pii in pii_entities:
    # Recover the matched substring from the reported offsets
    extracted_text = pii_text[slice(pii['begin'], pii['end'])]
    print(f"  {pii['type']:20} | {extracted_text:25} | Confidence: {pii['score']:.2%}")

print("\nüí° Use Case: Redact PII before storing customer feedback")
print("\nüîê Compliance: GDPR, CCPA, PCI-DSS require PII protection")

## Part 6: Language Detection

In [None]:
def detect_language(text: str) -> List[Dict]:
    """
    Detect the candidate languages of text with Amazon Comprehend.

    Returns one dict per language in the response, each with the language
    'code' and the service's 'score', ordered from highest score to
    lowest, so the first element is the dominant language. The list is
    empty if the response contains no languages.
    """
    response = comprehend.detect_dominant_language(Text=text)

    languages = [
        {'code': lang['LanguageCode'], 'score': lang['Score']}
        for lang in response['Languages']
    ]

    # Highest-scoring language first
    return sorted(languages, key=lambda x: x['score'], reverse=True)

# The same short review written in English, Spanish, French and German
multilingual_feedback = [
    "I love this bank! Great service.",
    "Me encanta este banco. Excelente servicio.",
    "J'adore cette banque. Service excellent.",
    "Ich liebe diese Bank. Toller Service.",
]

print("üåç Detecting Languages...\n")

for text in multilingual_feedback:
    languages = detect_language(text)
    # Results arrive sorted by score, so the first entry is the best match
    primary = languages[0]

    print("Text: {}".format(text))
    print("  Language: {} (Confidence: {:.2%})\n".format(primary['code'].upper(), primary['score']))

print("üí° Use Case: Route feedback to appropriate language support team")

## Part 7: Banking Insights Dashboard

In [None]:
# Text dashboard over the sentiment results: distribution, per-channel
# breakdown, recommended actions and headline metrics.
print("üìä SecureBank Customer Feedback Analysis Dashboard\n")
print("="*80)

# Sentiment breakdown
sentiment_counts = df_results['sentiment'].value_counts()
total = len(df_results)

print("\n1Ô∏è‚É£ Sentiment Distribution:")
for sentiment, count in sentiment_counts.items():
    percentage = (count / total) * 100
    # One full-block glyph (U+2588) per 5%, so a 100% bar is exactly the 20
    # columns reserved by the {bar:20} field below. The glyph is written as
    # an escape so it stays a single character whatever the file encoding.
    bar = '\u2588' * int(percentage / 5)
    print(f"  {sentiment:10} | {bar:20} | {count}/{total} ({percentage:.1f}%)")

# Channel analysis: rows are channels, columns are sentiment labels
print("\n2Ô∏è‚É£ Feedback by Channel:")
channel_sentiment = df_results.groupby(['channel', 'sentiment']).size().unstack(fill_value=0)
print(channel_sentiment)

# Action items
print("\n3Ô∏è‚É£ Recommended Actions:")
# .get(..., 0) covers the case where a label never occurred in the results
negative_count = sentiment_counts.get('NEGATIVE', 0)
positive_count = sentiment_counts.get('POSITIVE', 0)

if negative_count > 0:
    print(f"  ‚ö†Ô∏è {negative_count} negative feedback items require immediate attention")
    print("     - Review complaints and identify root causes")
    print("     - Contact customers for resolution")
    print("     - Implement process improvements")

if positive_count > 0:
    print(f"  ‚úÖ {positive_count} positive feedback items")
    print("     - Share success stories with team")
    print("     - Identify best practices to replicate")
    print("     - Use in marketing materials")

print("\n4Ô∏è‚É£ Key Metrics:")
# Satisfaction score here is the share of items labelled POSITIVE
print(f"  Customer Satisfaction Score: {(positive_count / total * 100):.1f}%")
print(f"  Response Required: {negative_count} items")
print(f"  Average Confidence: {df_results['confidence'].mean():.2%}")

# Closing rule: one blank line, then a single 80-character line of "=".
# ("\n=" * 80 would repeat the newline too and print 80 separate lines.)
print("\n" + "="*80)

## Part 8: Production Use Cases

In [None]:
# Reference table of production use cases, followed by rough cost and
# performance talking points.
print("üè≠ Production Use Cases for Amazon Comprehend in Banking:\n")

# The table is written row by row for readability and pivoted into the
# column-oriented dict that the DataFrame is built from.
_columns = ("Use Case", "Comprehend Feature", "Business Impact", "Cost Savings")
_rows = (
    ("Customer Feedback Analysis", "Sentiment Analysis", "Improve customer satisfaction", "$50K/year"),
    ("Complaint Prioritization", "Sentiment + Key Phrases", "Faster response times", "$30K/year"),
    ("Fraud Detection", "Entity Extraction", "Reduce fraud losses", "$200K/year"),
    ("Loan Application Processing", "Entity + PII Detection", "Automate document review", "$100K/year"),
    ("Compliance Monitoring", "PII Detection", "Ensure regulatory compliance", "Avoid fines"),
    ("Chatbot Intent Detection", "Key Phrases + Entities", "Better customer service", "$40K/year"),
)
use_cases = {
    column: [row[position] for row in _rows]
    for position, column in enumerate(_columns)
}

df_use_cases = pd.DataFrame(use_cases)
print(df_use_cases.to_string(index=False))

# Cost and performance notes, one printed line per entry
for line in (
    "\nüí∞ Cost Analysis:",
    "  - Comprehend: $0.0001 per unit (100 characters)",
    "  - 1M customer feedback items: ~$100",
    "  - Manual analysis cost: ~$50,000",
    "  - ROI: 500x",
    "\n‚ö° Performance:",
    "  - Latency: < 100ms per request",
    "  - Throughput: 1000s of requests/second",
    "  - Accuracy: 90-95% for sentiment",
    "  - Languages: 100+ supported",
):
    print(line)

## Summary

In [None]:
# Closing recap: what the lab covered, key takeaways, and next steps.
# Each entry is printed on its own line, in order.
summary_lines = (
    "\n‚úÖ Lab 1 Complete: Amazon Comprehend NLP\n",
    "üéì What You Learned:",
    "  1. Sentiment analysis for customer feedback",
    "  2. Entity extraction (people, places, organizations)",
    "  3. Key phrase extraction for themes",
    "  4. PII detection for compliance",
    "  5. Language detection for routing",
    "  6. Building insights dashboards",
    "\nüí° Key Takeaways:",
    "  - Comprehend is serverless and fully managed",
    "  - No ML expertise required",
    "  - Pay only for what you use",
    "  - Scales automatically",
    "  - 100+ languages supported",
    "\nüöÄ Next Steps:",
    "  - Integrate with your feedback systems",
    "  - Set up automated analysis pipelines",
    "  - Build real-time dashboards",
    "  - Implement PII redaction",
)
for summary_line in summary_lines:
    print(summary_line)