# Comprehensive NLP Pipeline with spaCy and NLTK

In [None]:

import spacy
import nltk
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.tree import Tree
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:

# Download required NLTK data (run once)
def download_nltk_data():
    """Download the NLTK resources used by the pipeline.

    One status line is printed per resource. ``nltk.download`` reports most
    failures (unknown resource id, network error) by returning ``False``
    instead of raising, so the return value is checked as well as catching
    exceptions; otherwise a failed download would be reported as a success.
    Failures are only reported, never propagated to the caller.
    """
    nltk_downloads = [
        'punkt',
        'averaged_perceptron_tagger',
        'averaged_perceptron_tagger_eng',
        'maxent_ne_chunker',
        'maxent_ne_chunker_tab',
        'words',
        'treebank',
        'punkt_tab'
    ]

    for item in nltk_downloads:
        try:
            succeeded = nltk.download(item, quiet=True)
        except Exception as e:
            # Best effort: keep going so one bad resource does not block the rest.
            print(f"✗ Failed to download {item}: {e}")
            continue

        if succeeded:
            print(f"✓ Downloaded {item}")
        else:
            print(f"✗ Failed to download {item}: nltk.download reported an error")


In [None]:

class NLPPipeline:
    """Run common NLP tasks on text with spaCy and/or NLTK.

    Results produced by spaCy are stored under ``spacy_*`` keys and results
    produced by NLTK under ``nltk_*`` keys, so a disabled backend simply
    contributes no keys. Dependency parsing and lemmatization are spaCy-only
    and return ``None`` when spaCy is disabled.
    """

    def __init__(self, use_spacy=True, use_nltk=True):
        """Set up the requested backends.

        Args:
            use_spacy: Load the ``en_core_web_sm`` spaCy model. If loading
                raises ``OSError`` an install hint is printed and spaCy is
                switched off for this instance.
            use_nltk: Enable the NLTK code paths and call
                ``download_nltk_data`` to fetch the required resources.
        """
        self.use_spacy = use_spacy
        self.use_nltk = use_nltk
        # Defined unconditionally so the attribute exists even when spaCy is
        # disabled or its model could not be loaded.
        self.nlp_spacy = None

        # Initialize spaCy model
        if self.use_spacy:
            try:
                self.nlp_spacy = spacy.load("en_core_web_sm")
                print("✓ spaCy model loaded successfully")
            except OSError:
                print("✗ spaCy model 'en_core_web_sm' not found. Install with: python -m spacy download en_core_web_sm")
                self.use_spacy = False

        # Download NLTK data
        if self.use_nltk:
            download_nltk_data()

    # ------------------------------------------------------------------
    # Shared analysis steps. Each returns None when its backend is disabled,
    # which the formatting helpers below treat as "contribute nothing".
    # ------------------------------------------------------------------

    def _spacy_doc(self, text):
        """Parse *text* with spaCy, or return None when spaCy is disabled."""
        return self.nlp_spacy(text) if self.use_spacy else None

    def _nltk_tokens(self, text):
        """Word-tokenize *text* with NLTK, or return None when NLTK is disabled."""
        return word_tokenize(text) if self.use_nltk else None

    @staticmethod
    def _nltk_tagged(tokens):
        """POS-tag NLTK *tokens*; None is passed through unchanged."""
        return None if tokens is None else pos_tag(tokens)

    # ------------------------------------------------------------------
    # Formatting helpers: turn already-computed analyses into result values.
    # ------------------------------------------------------------------

    @staticmethod
    def _tokenization(text, doc, tokens):
        """Build the tokenization result dict from a spaCy doc / NLTK tokens."""
        results = {}
        if doc is not None:
            results['spacy_tokens'] = [token.text for token in doc]
            results['spacy_sentences'] = [sent.text.strip() for sent in doc.sents]
        if tokens is not None:
            results['nltk_tokens'] = tokens
            results['nltk_sentences'] = sent_tokenize(text)
        return results

    @staticmethod
    def _pos(doc, tagged):
        """Build the POS-tagging result dict from a spaCy doc / NLTK tags."""
        results = {}
        if doc is not None:
            results['spacy_pos'] = [(token.text, token.pos_, token.tag_) for token in doc]
        if tagged is not None:
            results['nltk_pos'] = tagged
        return results

    @staticmethod
    def _entities(doc, tagged):
        """Build the NER result dict from a spaCy doc / NLTK tagged tokens."""
        results = {}
        if doc is not None:
            results['spacy_entities'] = [
                (ent.text, ent.label_, ent.start_char, ent.end_char)
                for ent in doc.ents
            ]
        if tagged is not None:
            # ne_chunk yields plain (word, tag) tuples for non-entities and
            # Tree nodes for recognised entities; only the latter are kept.
            results['nltk_entities'] = [
                (' '.join(word for word, _ in chunk.leaves()), chunk.label())
                for chunk in ne_chunk(tagged, binary=False)
                if isinstance(chunk, Tree)
            ]
        return results

    @staticmethod
    def _dependencies(doc):
        """Describe each token's dependency relation, or None without a doc."""
        if doc is None:
            return None
        return [
            {
                'text': token.text,
                'dep': token.dep_,
                'head': token.head.text,
                'children': [child.text for child in token.children],
            }
            for token in doc
        ]

    @staticmethod
    def _lemmas(doc):
        """Pair each token's text with its lemma, or None without a doc."""
        if doc is None:
            return None
        return [(token.text, token.lemma_) for token in doc]

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def tokenize_text(self, text):
        """Split *text* into tokens and sentences.

        Returns:
            dict with ``spacy_tokens`` / ``spacy_sentences`` and/or
            ``nltk_tokens`` / ``nltk_sentences`` (lists of strings),
            depending on which backends are enabled.
        """
        doc = self._spacy_doc(text)
        tokens = self._nltk_tokens(text)
        return self._tokenization(text, doc, tokens)

    def pos_tagging(self, text):
        """Tag every token of *text* with its part of speech.

        Returns:
            dict with ``spacy_pos`` (list of ``(text, pos_, tag_)`` tuples)
            and/or ``nltk_pos`` (the list returned by ``nltk.pos_tag``).
        """
        doc = self._spacy_doc(text)
        tagged = self._nltk_tagged(self._nltk_tokens(text))
        return self._pos(doc, tagged)

    def named_entity_recognition(self, text):
        """Extract named entities from *text*.

        Returns:
            dict with ``spacy_entities`` (list of
            ``(text, label, start_char, end_char)``) and/or ``nltk_entities``
            (list of ``(entity_text, label)``).
        """
        doc = self._spacy_doc(text)
        tagged = self._nltk_tagged(self._nltk_tokens(text))
        return self._entities(doc, tagged)

    def dependency_parsing(self, text):
        """Return per-token dependency info from spaCy.

        Returns:
            A list of dicts with keys ``text``, ``dep``, ``head`` and
            ``children``, or ``None`` when spaCy is disabled.
        """
        return self._dependencies(self._spacy_doc(text))

    def lemmatization(self, text):
        """Return ``(token_text, lemma)`` pairs from spaCy.

        Returns:
            A list of 2-tuples, or ``None`` when spaCy is disabled.
        """
        return self._lemmas(self._spacy_doc(text))

    def full_pipeline(self, text):
        """Run every analysis on *text* and collect the results in one dict.

        The spaCy model, the NLTK tokenizer and the NLTK tagger are each run
        once and their output is shared by all sections, instead of being
        recomputed for every individual task.

        Returns:
            dict with the keys ``original_text``, ``tokenization``,
            ``pos_tagging``, ``ner``, ``dependency_parsing`` and
            ``lemmatization``; each value has the same shape as the result
            of the corresponding single-task method.
        """
        doc = self._spacy_doc(text)
        tokens = self._nltk_tokens(text)
        tagged = self._nltk_tagged(tokens)
        return {
            'original_text': text,
            'tokenization': self._tokenization(text, doc, tokens),
            'pos_tagging': self._pos(doc, tagged),
            'ner': self._entities(doc, tagged),
            'dependency_parsing': self._dependencies(doc),
            'lemmatization': self._lemmas(doc)
        }


In [None]:

def display_results(results):
    """Print a human-readable report for one ``full_pipeline`` result.

    The report has up to five sections: tokenization, POS tagging, named
    entities, dependency parsing and lemmatization. Library-specific parts
    are printed only when the matching ``spacy_*`` / ``nltk_*`` key is
    present; the dependency and lemmatization sections are skipped when
    their value is empty or ``None``. Listings are cut to the first 5 or 10
    items, and the lemmatization section only shows tokens whose lemma
    differs from the surface form.

    Args:
        results: Mapping with the keys ``original_text``, ``tokenization``,
            ``pos_tagging``, ``ner``, ``dependency_parsing`` and
            ``lemmatization``.
    """
    print("\n📝 ORIGINAL TEXT:")
    print(f"{results['original_text']}\n")

    # --- tokenization -----------------------------------------------------
    print("🔤 TOKENIZATION RESULTS:")
    token_info = results['tokenization']
    if 'spacy_tokens' in token_info:
        spacy_tokens = token_info['spacy_tokens']
        print(f"spaCy Tokens ({len(spacy_tokens)}): {spacy_tokens[:10]}...")
        print(f"spaCy Sentences: {len(token_info['spacy_sentences'])}")
    if 'nltk_tokens' in token_info:
        nltk_tokens = token_info['nltk_tokens']
        print(f"NLTK Tokens ({len(nltk_tokens)}): {nltk_tokens[:10]}...")
        print(f"NLTK Sentences: {len(token_info['nltk_sentences'])}")

    # --- part-of-speech tags ----------------------------------------------
    print("\n🏷️  POS TAGGING RESULTS:")
    tag_info = results['pos_tagging']
    if 'spacy_pos' in tag_info:
        print("spaCy POS Tags (first 10):")
        for row in tag_info['spacy_pos'][:10]:
            word, coarse, fine = row
            print(f"  {word:15} -> {coarse:8} ({fine})")
    if 'nltk_pos' in tag_info:
        print("NLTK POS Tags (first 10):")
        for row in tag_info['nltk_pos'][:10]:
            word, tag = row
            print(f"  {word:15} -> {tag}")

    # --- named entities (spaCy first, then NLTK) --------------------------
    print("\n🎯 NAMED ENTITY RECOGNITION:")
    entity_info = results['ner']
    if 'spacy_entities' in entity_info:
        spacy_found = entity_info['spacy_entities']
        if spacy_found:
            for name, kind, _start, _end in spacy_found:
                print(f"  {name:20} -> {kind}")
    if 'nltk_entities' in entity_info:
        nltk_found = entity_info['nltk_entities']
        if nltk_found:
            for name, kind in nltk_found:
                print(f"  {name:20} -> {kind}")

    # --- spaCy-only sections ----------------------------------------------
    relations = results['dependency_parsing']
    if relations:
        print("\n🌳 DEPENDENCY PARSING (first 5 tokens):")
        for relation in relations[:5]:
            print(f"  {relation['text']:15} -> {relation['dep']:10} (head: {relation['head']})")

    lemma_pairs = results['lemmatization']
    if lemma_pairs:
        print("\n📖 LEMMATIZATION (first 10 tokens):")
        for surface, lemma in lemma_pairs[:10]:
            if surface == lemma:
                continue  # nothing interesting to show for unchanged tokens
            print(f"  {surface:15} -> {lemma}")


In [None]:

def analyze_sample_texts():
    """Run the full pipeline on a few demo texts and print each report.

    Three hard-coded English samples are analysed, plus (when the corpus can
    be read) the first 50 words of the NLTK Treebank corpus joined with
    spaces. A failure to read the corpus is reported and otherwise ignored.
    """
    nlp_pipeline = NLPPipeline()
    texts = [
        "Apple Inc. is planning to open a new store in New York City next month. CEO Tim Cook will attend the opening ceremony.",
        "The quick brown fox jumps over the lazy dog. This sentence contains every letter of the alphabet.",
        "Barack Obama was the 44th President of the United States. He served from 2009 to 2017 and was born in Hawaii."
    ]

    # The Treebank sample is optional: the corpus may not be available locally.
    try:
        leading_words = treebank.words()[:50]
        texts.append(' '.join(leading_words))
        print("✓ Added Treebank sample")
    except Exception as err:
        print(f"✗ Could not load Treebank sample: {err}")

    for sample in texts:
        display_results(nlp_pipeline.full_pipeline(sample))


In [None]:

def compare_libraries():
    """Print spaCy and NLTK output side by side for one fixed sentence.

    Two sections are shown, tokenization and named entities. A backend that
    produced no result for a section is printed as ``None``.
    """
    sentence = "Dr. John Smith from Harvard University published a paper about machine learning in Nature magazine."
    analysis = NLPPipeline().full_pipeline(sentence)

    # (heading, section key, spaCy result key, NLTK result key)
    comparisons = (
        ("📊 TOKENIZATION COMPARISON:", 'tokenization', 'spacy_tokens', 'nltk_tokens'),
        ("\n🎯 NER COMPARISON:", 'ner', 'spacy_entities', 'nltk_entities'),
    )
    for heading, section, spacy_key, nltk_key in comparisons:
        print(heading)
        print("spaCy:", analysis[section].get(spacy_key))
        print("NLTK:", analysis[section].get(nltk_key))


In [None]:

def create_pos_distribution_chart(results):
    """Show a bar chart of how often each spaCy POS tag occurs.

    Args:
        results: A ``full_pipeline`` result; only ``results['pos_tagging']``
            is read. Nothing is drawn when it has no ``spacy_pos`` entry.
    """
    tagging = results['pos_tagging']
    if 'spacy_pos' not in tagging:
        return

    # Tally the coarse-grained tag (middle element of each triple).
    frequency = defaultdict(int)
    for entry in tagging['spacy_pos']:
        _, coarse_tag, _ = entry
        frequency[coarse_tag] += 1

    plt.figure(figsize=(12, 6))
    plt.bar(
        frequency.keys(),
        frequency.values(),
        color='skyblue',
        edgecolor='navy',
    )
    plt.title('POS Tag Distribution (spaCy)')
    plt.xlabel('POS Tags')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    plt.show()


In [None]:

def visualize_entities():
    """Render the entities of a fixed sentence to an HTML file with displaCy.

    The markup is written to ``entity_visualization.html`` in the current
    working directory. This is best effort: any failure (spaCy not
    installed, model missing, file not writable, ...) is printed instead of
    being raised.
    """
    output_file = "entity_visualization.html"
    sentence = "Apple Inc. was founded by Steve Jobs in Cupertino, California."
    try:
        # Imported here so that a missing spaCy install is reported like any
        # other failure of this optional step.
        import spacy
        from spacy import displacy

        model = spacy.load("en_core_web_sm")
        # jupyter=False makes render() return the markup rather than display it.
        markup = displacy.render(model(sentence), style="ent", jupyter=False)
        with open(output_file, "w", encoding="utf-8") as handle:
            handle.write(markup)
        print("✓ Entity visualization saved to 'entity_visualization.html'")
    except Exception as err:
        print("Could not visualize entities:", err)


In [None]:

# Demonstration driver: runs every example in this notebook, in order.
print("🚀 Starting NLP Pipeline Demonstration")
# Both helpers below construct their own NLPPipeline instance internally.
analyze_sample_texts()
compare_libraries()
# A separate pipeline and result, kept at module level, feed the POS chart.
pipeline = NLPPipeline()
sample_text = "Natural language processing enables interaction between computers and human language."
results = pipeline.full_pipeline(sample_text)
create_pos_distribution_chart(results)
# Loads its own spaCy model and writes entity_visualization.html.
visualize_entities()
print("✅ NLP Pipeline Analysis Complete!")
