In [None]:
# Install required libraries in Google Colab
!pip install spacy textblob ipywidgets
!python -m spacy download en_core_web_sm

import spacy
from textblob import TextBlob
import ipywidgets as widgets
from IPython.display import display, HTML
import re

# Load the small English spaCy pipeline once at module level; analyze_article()
# below reuses this shared `nlp` object for every analysis.
nlp = spacy.load("en_core_web_sm")

# Placeholder article shown in the text box on start-up (replace with your
# copied article from an online news source).
# NOTE(review): by the validator's own `\w+` word count this placeholder is
# roughly 126 words, i.e. below the 150-word minimum enforced in the button
# handler, so clicking "Analyze Article" without pasting a longer text reports
# the length error.
article_text = """
London has launched a bold initiative to become a zero-waste city by 2030, announced Mayor Sadiq Khan on June 27, 2025. The plan focuses on reducing plastic use, expanding recycling programs, and promoting sustainable consumer habits. Khan highlighted the environmental impact of waste, noting that landfills contribute significantly to greenhouse gas emissions. Over 200 local businesses, including GreenTech Solutions, have committed £5 million to support the initiative. Environmental groups welcomed the move but called for stronger measures against single-use plastics. Critics argue the plan may strain small businesses due to high compliance costs. The initiative aligns with the UK’s broader climate goals, following recent EU regulations. Public response has been mixed, with some residents praising the effort and others concerned about practical implementation.
"""

# Function to perform NLP analysis
def analyze_article(article):
    """Run tokenization, stopword filtering, NER, POS tagging and sentiment.

    Args:
        article: Raw article text to analyze.

    Returns:
        A dict with the following keys (lists are truncated for display):
            "tokens": text of the first 20 spaCy tokens.
            "sentences": text of the first 3 sentences.
            "filtered_tokens": first 20 alphabetic tokens that are not
                stopwords.
            "entities": ``(text, label)`` for every named entity found.
            "pos_tags": ``(text, coarse POS tag)`` for the first 20 tokens.
            "sentiment": ``{"polarity": float, "subjectivity": float}`` as
                reported by TextBlob.
    """
    # A single spaCy pass provides tokens, sentences, entities and POS tags.
    doc = nlp(article)

    # Tokenization
    tokens = [token.text for token in doc]
    sentences = [sent.text for sent in doc.sents]

    # Stopword Removal
    # Read the stopword set from the loaded pipeline's language defaults.
    # Reaching through `spacy.lang.en.stop_words` only works when that
    # submodule happens to have been imported already.
    stopwords = nlp.Defaults.stop_words
    filtered_tokens = [
        token.text
        for token in doc
        if token.is_alpha and token.text.lower() not in stopwords
    ]

    # Named Entity Recognition (NER)
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Part-of-Speech (POS) Tagging
    pos_tags = [(token.text, token.pos_) for token in doc]

    # Sentiment Analysis
    blob = TextBlob(article)
    sentiment = {
        "polarity": blob.sentiment.polarity,  # Range: -1 (negative) to 1 (positive)
        "subjectivity": blob.sentiment.subjectivity  # Range: 0 (objective) to 1 (subjective)
    }

    return {
        "tokens": tokens[:20],  # Limit for display
        "sentences": sentences[:3],  # Limit for display
        "filtered_tokens": filtered_tokens[:20],  # Limit for display
        "entities": entities,
        "pos_tags": pos_tags[:20],  # Limit for display
        "sentiment": sentiment
    }

# GUI Components

# Multi-line text box, pre-filled with the placeholder article, where the user
# pastes the text to analyze.
article_input = widgets.Textarea(
    value=article_text,
    placeholder='Paste your newspaper article here (150–300 words, copied from an online source)',
    description='Article:',
    layout={'width': '600px', 'height': '200px'}
)

# Button that triggers the analysis; its click handler is attached further
# down with on_click().
analyze_button = widgets.Button(
    description='Analyze Article',
    button_style='primary',
    tooltip='Click to analyze the pasted article'
)

# Output widget that receives everything the click handler displays.
output_area = widgets.Output()

# Function to handle button click
def on_analyze_button_clicked(b):
    """Validate the pasted article, analyze it and render the results.

    Everything is rendered into ``output_area``, which is cleared first. If
    the article is outside the 150–300 word range (counted as runs of word
    characters) only an error message is shown and no analysis is run.

    Args:
        b: The button instance passed in by ipywidgets; not used.
    """
    # Function-scope import so this handler stays self-contained. The article
    # is pasted from arbitrary web pages, so any text taken from it must be
    # escaped before it is embedded in HTML, otherwise characters such as
    # '<' or '&' would corrupt (or inject) markup.
    from html import escape

    def show(markup):
        """Render one HTML fragment in the active output widget."""
        display(HTML(markup))

    with output_area:
        output_area.clear_output()
        article = article_input.value

        # Validate article length
        word_count = len(re.findall(r'\w+', article))
        if not (150 <= word_count <= 300):
            show("<b>Error:</b> Article must be 150–300 words. Current word count: {}".format(word_count))
            return

        # Perform analysis
        results = analyze_article(article)

        # Display results
        show("<h3>Analysis Results</h3>")

        show("<b>1. Tokenization (First 20 Tokens):</b>")
        show("<p>{}</p>".format(", ".join(escape(tok) for tok in results["tokens"])))

        show("<b>2. Sentences (First 3 Sentences):</b>")
        for i, sent in enumerate(results["sentences"], 1):
            show("<p>Sentence {}: {}</p>".format(i, escape(sent)))

        show("<b>3. Filtered Tokens (No Stopwords, First 20):</b>")
        show("<p>{}</p>".format(", ".join(escape(tok) for tok in results["filtered_tokens"])))

        show("<b>4. Named Entity Recognition (NER):</b>")
        for entity, label in results["entities"]:
            show("<p>Entity: {} ({})</p>".format(escape(entity), escape(label)))

        show("<b>5. Part-of-Speech (POS) Tagging (First 20):</b>")
        for token, pos in results["pos_tags"]:
            show("<p>Token: {} ({})</p>".format(escape(token), escape(pos)))

        show("<b>6. Sentiment Analysis:</b>")
        polarity = results["sentiment"]["polarity"]
        subjectivity = results["sentiment"]["subjectivity"]
        sentiment_label = "Positive" if polarity > 0 else "Negative" if polarity < 0 else "Neutral"
        show("<p>Polarity: {:.2f} ({})</p>".format(polarity, sentiment_label))
        show("<p>Subjectivity: {:.2f} ({})</p>".format(subjectivity, "Subjective" if subjectivity > 0.5 else "Objective"))

# Link button to function: every click on the button calls the handler, which
# re-reads the text box and re-renders the output area.
analyze_button.on_click(on_analyze_button_clicked)

# Display GUI: heading and instructions first, then the input box, the button
# and finally the (initially empty) output area, in that order.
display(HTML("<h2>NLP Analysis of Newspaper Article</h2>"))
display(HTML("<p>Paste an article (150–300 words) copied from an online news source into the text box below.</p>"))
display(article_input)
display(analyze_button)
display(output_area)

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m95.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kern

Textarea(value='\nLondon has launched a bold initiative to become a zero-waste city by 2030, announced Mayor S…

Button(button_style='primary', description='Analyze Article', style=ButtonStyle(), tooltip='Click to analyze t…

Output()

In [None]:
# NLP Article Analyzer with GUI for Google Colab
# This program analyzes newspaper articles using various NLP techniques

# Install required packages (run this cell first in Colab)
"""
!pip install nltk spacy textblob ipywidgets matplotlib seaborn wordcloud
!python -m spacy download en_core_web_sm
"""

import nltk
import spacy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from wordcloud import WordCloud
from collections import Counter
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import re

# Download required NLTK data.
# NOTE: the former nltk.download('pos_tag') call was removed. 'pos_tag' is the
# name of an NLTK function, not of a data package, and the download only
# produced "Error loading pos_tag: Package 'pos_tag' not found in index".
# The tagger model itself is 'averaged_perceptron_tagger' below.
nltk.download('punkt')
nltk.download('punkt_tab')  # Additional required resource
nltk.download('stopwords')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')

# Load spaCy model (shared by the NER and POS-tagging helpers below)
nlp = spacy.load('en_core_web_sm')

class NLPAnalyzer:
    """Collection of NLP helpers (NLTK, spaCy, TextBlob) with matplotlib plots."""

    def __init__(self):
        # English stopwords from NLTK, stored as a set for membership tests.
        self.stopwords = set(nltk.corpus.stopwords.words('english'))
        self.article_text = ""

    def tokenize_text(self, text):
        """Split *text* into sentences and word tokens.

        Returns a ``(sentences, words, words_clean)`` tuple; ``words_clean``
        contains the lowercased tokens that consist only of alphanumeric
        characters.
        """
        sentence_list = nltk.sent_tokenize(text)
        word_list = nltk.word_tokenize(text)

        # Keep purely alphanumeric tokens only, lowercased.
        normalized = [tok.lower() for tok in word_list if tok.isalnum()]

        return sentence_list, word_list, normalized

    def remove_stopwords(self, words):
        """Return the entries of *words* whose lowercase form is not a stopword."""
        blocked = self.stopwords
        return [candidate for candidate in words if candidate.lower() not in blocked]

    def named_entity_recognition(self, text):
        """Return one dict (text, label, description) per spaCy named entity."""
        return [
            {
                'text': span.text,
                'label': span.label_,
                'description': spacy.explain(span.label_)
            }
            for span in nlp(text).ents
        ]

    def pos_tagging(self, text):
        """Return one dict (word, pos, description) per non-punctuation, non-space token."""
        return [
            {
                'word': tok.text,
                'pos': tok.pos_,
                'description': spacy.explain(tok.pos_)
            }
            for tok in nlp(text)
            if not (tok.is_punct or tok.is_space)
        ]

    def sentiment_analysis(self, text):
        """Score *text* with TextBlob and attach a coarse sentiment label.

        The label is "Positive" above 0.1 polarity, "Negative" below -0.1 and
        "Neutral" otherwise.
        """
        scores = TextBlob(text).sentiment
        polarity = scores.polarity  # -1 to 1
        subjectivity = scores.subjectivity  # 0 to 1

        # Start from the fallback label and override it outside the dead band.
        label = "Neutral"
        if polarity > 0.1:
            label = "Positive"
        elif polarity < -0.1:
            label = "Negative"

        return {
            'polarity': polarity,
            'subjectivity': subjectivity,
            'sentiment': label
        }

    def generate_wordcloud(self, words):
        """Plot a word cloud of *words*; does nothing when the joined text is blank."""
        joined = ' '.join(words)
        if not joined.strip():
            return

        cloud = WordCloud(width=800, height=400,
                          background_color='white',
                          max_words=50).generate(joined)

        plt.figure(figsize=(10, 5))
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Article (After Stopword Removal)')
        plt.tight_layout()
        plt.show()

    def visualize_pos_distribution(self, pos_tags):
        """Plot POS tag frequencies as a bar chart and a pie chart side by side."""
        tag_frequency = Counter(entry['pos'] for entry in pos_tags)

        plt.figure(figsize=(12, 6))

        # Left panel: bar chart of tag frequencies.
        plt.subplot(1, 2, 1)
        plt.bar(tag_frequency.keys(), tag_frequency.values())
        plt.title('Part-of-Speech Distribution')
        plt.xlabel('POS Tags')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45)

        # Right panel: the same counts as percentage shares.
        plt.subplot(1, 2, 2)
        plt.pie(tag_frequency.values(), labels=tag_frequency.keys(), autopct='%1.1f%%')
        plt.title('POS Distribution (Pie Chart)')

        plt.tight_layout()
        plt.show()

    def visualize_entities(self, entities):
        """Plot a bar chart of entity-label counts; does nothing for an empty list."""
        if not entities:
            return

        label_frequency = Counter(item['label'] for item in entities)

        plt.figure(figsize=(10, 6))
        plt.bar(label_frequency.keys(), label_frequency.values())
        plt.title('Named Entity Types Distribution')
        plt.xlabel('Entity Types')
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# Create the GUI interface
def create_nlp_gui():
    """Build and display the article-analysis GUI.

    Creates an ``NLPAnalyzer``, a text area pre-filled with a sample article,
    an "Analyze Article" button and an output widget, wires the button to the
    analysis callback, prints the usage instructions and displays the widgets.
    """
    analyzer = NLPAnalyzer()

    # Sample newspaper article
    sample_article = """
    Climate Change Summit Concludes with Historic Agreement in Paris

    PARIS, France - World leaders concluded a groundbreaking climate summit yesterday with the signing of the Paris Climate Accord, marking a pivotal moment in global environmental policy. The agreement, signed by representatives from 195 countries, establishes ambitious targets for reducing carbon emissions and limiting global temperature rise to 1.5 degrees Celsius above pre-industrial levels.

    President Emmanuel Macron of France praised the accord as "a testament to international cooperation and our collective commitment to future generations." The European Union has pledged to reduce emissions by 55% by 2030, while the United States committed to achieving net-zero emissions by 2050.

    Environmental activists, including Greta Thunberg, expressed cautious optimism about the agreement but emphasized the need for immediate action. "Words must be followed by concrete measures," Thunberg stated during a press conference outside the Louvre Museum.

    The summit also addressed funding for developing nations, with developed countries promising $100 billion annually to support climate adaptation and mitigation efforts. Critics argue that this amount is insufficient given the scale of the climate crisis.

    Financial markets responded positively to the news, with renewable energy stocks surging 15% on the Paris Stock Exchange. Tesla and other electric vehicle manufacturers saw significant gains, reflecting investor confidence in the transition to clean energy.

    Scientists from the Intergovernmental Panel on Climate Change (IPCC) warned that despite the agreement, urgent action is needed to prevent catastrophic climate impacts. Dr. Sarah Johnson, a climatologist at Oxford University, noted that "we have a narrow window of opportunity to implement these commitments effectively."
    """

    # Create widgets
    text_area = widgets.Textarea(
        value=sample_article,
        placeholder='Paste your newspaper article here (150-300 words minimum)',
        description='Article:',
        layout=widgets.Layout(width='100%', height='200px')
    )

    analyze_button = widgets.Button(
        description='Analyze Article',
        button_style='success',
        layout=widgets.Layout(width='200px')
    )

    output_area = widgets.Output()

    def analyze_article(b):
        """Button callback: analyze the text area content and print a report.

        Clears the output widget, rejects texts shorter than 50
        whitespace-separated words, then runs tokenization, stopword removal,
        NER, POS tagging and sentiment analysis, printing and plotting each
        step. ``b`` is the clicked button and is not used.
        """
        with output_area:
            clear_output()

            article_text = text_area.value.strip()

            if len(article_text.split()) < 50:
                print("⚠️ Please enter an article with at least 50 words for meaningful analysis.")
                return

            print("🔍 ANALYZING ARTICLE...")
            print("=" * 80)

            # Store article text
            analyzer.article_text = article_text

            # 1. Tokenization
            print("\n📝 1. TOKENIZATION")
            print("-" * 40)
            sentences, words, words_clean = analyzer.tokenize_text(article_text)

            print(f"Number of sentences: {len(sentences)}")
            print(f"Total words: {len(words)}")
            print(f"Words (after cleaning): {len(words_clean)}")
            print("\nFirst 3 sentences:")
            for i, sent in enumerate(sentences[:3], 1):
                print(f"{i}. {sent}")

            print(f"\nFirst 20 words: {words[:20]}")

            # 2. Stopword Removal
            print("\n🚫 2. STOPWORD REMOVAL")
            print("-" * 40)
            filtered_words = analyzer.remove_stopwords(words_clean)
            print(f"Words before stopword removal: {len(words_clean)}")
            print(f"Words after stopword removal: {len(filtered_words)}")
            print(f"Stopwords removed: {len(words_clean) - len(filtered_words)}")
            print(f"Top 15 words after filtering: {filtered_words[:15]}")

            # Generate word cloud
            analyzer.generate_wordcloud(filtered_words)

            # 3. Named Entity Recognition
            print("\n👥 3. NAMED ENTITY RECOGNITION (NER)")
            print("-" * 40)
            entities = analyzer.named_entity_recognition(article_text)

            if entities:
                print(f"Found {len(entities)} named entities:")

                # Group entity texts by label (insertion order = order of
                # first appearance in the article).
                entity_groups = {}
                for ent in entities:
                    entity_groups.setdefault(ent['label'], []).append(ent['text'])

                for label, texts in entity_groups.items():
                    description = spacy.explain(label) or label
                    # De-duplicate while keeping first-seen order; a plain
                    # set() would print the entities in arbitrary order.
                    unique_texts = list(dict.fromkeys(texts))
                    print(f"  {label} ({description}): {', '.join(unique_texts)}")

                # Visualize entities
                analyzer.visualize_entities(entities)
            else:
                print("No named entities found.")

            # 4. Part-of-Speech Tagging
            print("\n🏷️ 4. PART-OF-SPEECH (POS) TAGGING")
            print("-" * 40)
            pos_tags = analyzer.pos_tagging(article_text)

            # Show sample POS tags
            print("Sample POS tags (first 15 words):")
            for tag in pos_tags[:15]:
                desc = tag['description'] or tag['pos']
                print(f"  '{tag['word']}' -> {tag['pos']} ({desc})")

            # Count POS frequencies
            pos_counts = Counter(tag['pos'] for tag in pos_tags)
            print("\nPOS Distribution:")
            for pos, count in pos_counts.most_common():
                desc = spacy.explain(pos) or pos
                print(f"  {pos} ({desc}): {count}")

            # Visualize POS distribution
            analyzer.visualize_pos_distribution(pos_tags)

            # 5. Sentiment Analysis
            print("\n😊 5. SENTIMENT ANALYSIS")
            print("-" * 40)
            sentiment = analyzer.sentiment_analysis(article_text)
            sentiment_label = sentiment['sentiment']

            print(f"Overall Sentiment: {sentiment_label}")
            print(f"Polarity Score: {sentiment['polarity']:.3f} (Range: -1 to +1)")
            print(f"Subjectivity Score: {sentiment['subjectivity']:.3f} (Range: 0 to 1)")

            # Interpret scores. The tone is derived from the analyzer's label
            # (which applies a ±0.1 neutral band) rather than from the raw
            # sign of the polarity, so this line can never contradict the
            # "Overall Sentiment" printed above.
            print("\nInterpretation:")
            tone = {"Positive": "positive", "Negative": "negative"}.get(sentiment_label, "neutral")
            print(f"  • The article has a {tone} tone")

            if sentiment['subjectivity'] > 0.5:
                print("  • The article is quite subjective/opinionated")
            else:
                print("  • The article is relatively objective/factual")

            # Sentiment visualization
            plt.figure(figsize=(10, 4))

            plt.subplot(1, 2, 1)
            # Bar colour follows the same label as the text interpretation.
            polarity_color = {"Positive": 'green', "Negative": 'red'}.get(sentiment_label, 'gray')
            plt.bar(['Polarity'], [sentiment['polarity']], color=polarity_color)
            plt.ylim(-1, 1)
            plt.title('Sentiment Polarity')
            plt.ylabel('Score')

            plt.subplot(1, 2, 2)
            plt.bar(['Subjectivity'], [sentiment['subjectivity']], color='blue')
            plt.ylim(0, 1)
            plt.title('Subjectivity Score')
            plt.ylabel('Score')

            plt.tight_layout()
            plt.show()

            # Summary
            print("\n📊 ANALYSIS SUMMARY")
            print("=" * 80)
            print(f"Article Length: {len(words)} words, {len(sentences)} sentences")
            print(f"Vocabulary Richness: {len(set(words_clean))} unique words")
            print(f"Named Entities: {len(entities)} found")
            print(f"Most Common POS: {pos_counts.most_common(1)[0][0] if pos_counts else 'N/A'}")
            print(f"Sentiment: {sentiment_label} (Polarity: {sentiment['polarity']:.3f})")

            print("\n✅ Analysis complete! The article demonstrates:")
            print(f"  • Tokenization: Text broken into {len(sentences)} sentences and {len(words)} words")
            print(f"  • Stopword filtering: {len(words_clean) - len(filtered_words)} common words removed")
            print(f"  • Entity recognition: {len(entities)} named entities identified")
            print(f"  • POS tagging: {len(pos_tags)} words tagged with grammatical roles")
            print(f"  • Sentiment analysis: {sentiment_label} tone detected")

    # Connect button to function
    analyze_button.on_click(analyze_article)

    # Display GUI
    print("🎯 NLP ARTICLE ANALYZER")
    print("=" * 50)
    print("This tool demonstrates core NLP concepts:")
    print("• Tokenization • Stopword Removal • Named Entity Recognition")
    print("• Part-of-Speech Tagging • Sentiment Analysis")
    print("\nInstructions:")
    print("1. Paste your newspaper article (150-300 words) in the text area below")
    print("2. Click 'Analyze Article' to see the NLP analysis")
    print("3. The analysis will show results for all NLP techniques with visualizations")
    print("\n" + "=" * 50)

    display(text_area)
    display(analyze_button)
    display(output_area)

# Run the GUI. The guard keeps the widgets from being built when this code is
# imported as a module; when it is executed directly (including as a notebook
# cell) the interface is created and displayed immediately.
if __name__ == "__main__":
    create_nlp_gui()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading pos_tag: Package 'pos_tag' not found in
[nltk_data]     index
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


🎯 NLP ARTICLE ANALYZER
This tool demonstrates core NLP concepts:
• Tokenization • Stopword Removal • Named Entity Recognition
• Part-of-Speech Tagging • Sentiment Analysis

Instructions:
1. Paste your newspaper article (150-300 words) in the text area below
2. Click 'Analyze Article' to see the NLP analysis
3. The analysis will show results for all NLP techniques with visualizations



Textarea(value='\n    Climate Change Summit Concludes with Historic Agreement in Paris\n    \n    PARIS, Franc…

Button(button_style='success', description='Analyze Article', layout=Layout(width='200px'), style=ButtonStyle(…

Output()