<a href="https://colab.research.google.com/github/stillrahim/jupyter-exploration/blob/main/L05_Bah_Ibrahim_ITAI2373.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Lab 05 — Part-of-Speech Tagging in the Real World
- **Name:** Ibrahim Bah
- **Course:** ITAI 2373
- **Notebook:** L05_Bah_Ibrahim_ITAI2373.ipynb



In [None]:
# ======= Setup =======
# Colab-friendly installs (runs quickly)
!pip install -q nltk spacy pandas matplotlib seaborn
!python -m spacy download en_core_web_sm

# Imports
import nltk
import spacy
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk import word_tokenize, pos_tag
from nltk.corpus import treebank, brown
from collections import Counter, defaultdict
from nltk.tag import map_tag

# NLTK data download
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names that
# NLTK 3.9+ looks up for word_tokenize / pos_tag; the older names are kept so
# the notebook also runs on earlier NLTK releases.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'universal_tagset',
    'brown',
    'treebank',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)


In [None]:
# Load the small English spaCy pipeline; `nlp` is reused by later cells.
nlp = spacy.load("en_core_web_sm")

# Short example sentences used for the first tagging demos
# (the second one uses "book" twice, once after "will" and once after "a").
sample_sentences = [
    "Apple is releasing a new product next week.",
    "I will book a flight tomorrow, then read a book in the evening.",
    "Can you help me reset my password? I'm locked out!",
    "The quick brown fox jumps over the lazy dog.",
]

print("Sample sentences:\n")
# One "- sentence" line per example.
print("\n".join("- " + sentence for sentence in sample_sentences))


In [None]:
# NLTK tagging: tokenize, then tag (pos_tag emits Penn Treebank tags by default)
print("\n--- NLTK tagging (Penn Treebank tags) ---")
for sentence in sample_sentences:
    ptb_tagged = pos_tag(word_tokenize(sentence))
    # sentence, its tags, then a blank separator line
    print(sentence, ptb_tagged, "", sep="\n")

# SpaCy tagging: show both the fine-grained tag_ and the coarse pos_ per token
print("\n--- SpaCy tagging (tag_ and pos_) ---")
for sentence in sample_sentences:
    spacy_tagged = [(tok.text, tok.tag_, tok.pos_) for tok in nlp(sentence)]
    print(sentence, spacy_tagged, "", sep="\n")


In [None]:
### Map PTB tags to Universal tagset for fairer comparison
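NLTK provides `map_tag` to map PTB tags to the Universal tagset. SpaCy also provides `pos_` (Universal-style coarse POS). Note that the two coarse tagsets are not identical: NLTK's Universal tagset uses `.`, `CONJ`, and `PRT`, while spaCy's `pos_` uses `PUNCT`, `CCONJ`, and `PART` and adds tags such as `AUX` and `PROPN`, so some differences in a side-by-side comparison come from labeling rather than from the taggers themselves.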


In [None]:
def nltk_to_universal(tagged):
    """Convert (word, PTB tag) pairs into (word, Universal tag) pairs.

    Args:
        tagged: iterable of (word, Penn Treebank tag) tuples, e.g. the
            output of ``pos_tag``.

    Returns:
        A list of (word, universal tag) tuples in the same order.
    """
    converted = []
    for word, ptb_tag in tagged:
        universal_tag = map_tag('en-ptb', 'universal', ptb_tag)
        converted.append((word, universal_tag))
    return converted

print("NLTK -> Universal mapping examples:")
for sentence in sample_sentences:
    # tokenize -> PTB-tag -> map to the Universal tagset, all in one pass
    ptb_tagged = pos_tag(word_tokenize(sentence))
    print(nltk_to_universal(ptb_tagged))


In [None]:
## Part 2 — Handling Messy, Real-World Text
We'll demonstrate pre-processing of messy text (social-media-like), then tag it.


In [None]:
# Social-media-style inputs: repeated punctuation, emoticons, a hashtag,
# abbreviations ("thx", "pls"), a missing apostrophe ("ill"), an em dash and an emoji.
messy_texts = [
    "OMG!!! I can't login :(( help pls!!! #frustrated",
    "thx 4 the quick reply. ill try again :)",
    "Order #1234 not delivered yet!!! wtf?",
    "Got it—thx. all good ✅"
]

def clean_for_tagging(s):
    """Return `s` with a space inserted before every '#' character.

    This is the only cleaning step performed; everything else in the text
    (emoticons, punctuation, casing) is left untouched.
    """
    pieces = s.split("#")
    return " #".join(pieces)

# Tag each messy text with both libraries and print the results side by side.
for raw_text in messy_texts:
    prepared = clean_for_tagging(raw_text)
    print("Original:", raw_text)
    nltk_tagged = pos_tag(word_tokenize(prepared))
    print("NLTK tokens/tags:", nltk_tagged)
    spacy_tagged = [(tok.text, tok.pos_, tok.tag_) for tok in nlp(prepared)]
    print("SpaCy tokens/tags:", spacy_tagged)
    print()


In [None]:
### Discussion / Observations
- Note where tokenizers differ (e.g., emoticons, hashtags, contractions).
- Does stopword removal or lowercasing change tagging correctness? (Try it and report.)


In [None]:
## Part 3 — Customer Service Case Study (toy dataset)
We will:
1. Create a small set of mock transcripts with labels (urgent/non-urgent),
2. POS-tag them,
3. Compute POS tag frequency features and visualize.


In [None]:
# Toy customer service transcripts
# Each record has an integer "id", the message "text", and a hand-assigned
# "urgent" flag (1 = urgent, 0 = non-urgent).
transcripts = [
    {"id": 1, "text": "Hi, my internet is down since last night. Please help ASAP!!!", "urgent": 1},
    {"id": 2, "text": "Hello, I'd like to change my billing address at my convenience.", "urgent": 0},
    {"id": 3, "text": "I can't access my account and I need it for work today.", "urgent": 1},
    {"id": 4, "text": "Where can I find your return policy? Thanks!", "urgent": 0},
    {"id": 5, "text": "Server is down, business critical. Please escalate.", "urgent": 1}
]

# One row per transcript; the bare `df` on the last line displays the frame.
df = pd.DataFrame(transcripts)
df


In [None]:
# Tag all transcripts using SpaCy and NLTK
def pos_counts_spacy(text):
    """Count spaCy coarse POS labels (``token.pos_``) over all tokens of `text`.

    Returns:
        collections.Counter mapping POS label -> number of tokens.
    """
    return Counter(token.pos_ for token in nlp(text))

def pos_counts_nltk(text):
    """Count Universal-tagset POS labels for `text` using NLTK.

    The text is tokenized with ``word_tokenize``, tagged with ``pos_tag``,
    and each resulting tag is mapped from 'en-ptb' to 'universal'.

    Returns:
        collections.Counter mapping universal tag -> number of tokens.
    """
    tally = Counter()
    for _word, ptb_tag in pos_tag(word_tokenize(text)):
        tally[map_tag('en-ptb', 'universal', ptb_tag)] += 1
    return tally

# Add one Counter per transcript from each tagger as new columns on `df`
# (later cells read these columns), then display the updated frame.
df['spacy_pos_counts'] = df['text'].apply(pos_counts_spacy)
df['nltk_pos_counts'] = df['text'].apply(pos_counts_nltk)
df


In [None]:
# Convert counters to a DataFrame of frequencies for plotting
def counters_to_df(counter_series, ids=None):
    """Turn a sequence of POS Counters into a wide integer count table.

    Args:
        counter_series: iterable of Counter/dict objects, one per document,
            mapping POS tag -> count.
        ids: optional iterable of row labels, one per counter (for example
            the transcript ids). Defaults to 1..n in input order.

    Returns:
        pandas.DataFrame indexed by 'id' with one column per tag seen in any
        counter; tags absent from a document are filled with 0. An empty
        input yields an empty frame whose index is still named 'id'.

    Raises:
        ValueError: if `ids` is given and its length differs from the number
            of counters.
    """
    counters = [dict(c) for c in counter_series]

    if ids is None:
        row_ids = list(range(1, len(counters) + 1))
    else:
        row_ids = list(ids)
        if len(row_ids) != len(counters):
            raise ValueError(
                f"got {len(row_ids)} ids for {len(counters)} counters"
            )

    index = pd.Index(row_ids, name='id')
    if not counters:
        # Nothing to tabulate; avoid a KeyError from indexing a column-less frame.
        return pd.DataFrame(index=index)

    # Passing the ids as the index (rather than as an 'id' column) keeps a
    # tag that happens to be called 'id' from colliding with the row label.
    return pd.DataFrame(counters, index=index).fillna(0).astype(int)

# Per-transcript count tables, one per tagger
spacy_counts_df = counters_to_df(df['spacy_pos_counts'])
nltk_counts_df = counters_to_df(df['nltk_pos_counts'])

# Show each table under its heading
for heading, table in (
    ("SpaCy POS counts (per transcript):", spacy_counts_df),
    ("NLTK (Universal) POS counts (per transcript):", nltk_counts_df),
):
    print(heading)
    display(table)


In [None]:
# Visualization: POS tag distribution (SpaCy)
# Sum counts over all transcripts and order tags from most to least frequent.
spacy_totals = spacy_counts_df.sum().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(10, 5))
spacy_totals.plot(kind='bar', ax=ax)
ax.set_title("Aggregate POS Frequency (SpaCy) — Transcripts")
ax.set_xlabel("POS tag")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


In [None]:
# Plot side-by-side comparison (SpaCy vs NLTK universal mapping) for totals
total_spacy = spacy_counts_df.sum()
total_nltk = nltk_counts_df.sum()

# NLTK's universal tagset and spaCy's pos_ spell three categories differently.
# Without renaming, each of them shows up as two separate bars (one per
# spelling), each with a spurious zero for the other library.
NLTK_TO_SPACY_LABELS = {'.': 'PUNCT', 'CONJ': 'CCONJ', 'PRT': 'PART'}
total_nltk_aligned = (
    total_nltk.rename(index=NLTK_TO_SPACY_LABELS)
    .groupby(level=0)  # merge rows should a rename produce a duplicate label
    .sum()
)

# NOTE(review): tags that only spaCy emits (e.g. AUX, PROPN) have no NLTK
# universal counterpart, so they will still show NLTK = 0 here.
cmp_df = (
    pd.DataFrame({'SpaCy': total_spacy, 'NLTK': total_nltk_aligned})
    .fillna(0)
    .astype(int)  # counts are whole numbers; fillna leaves them as floats
)
cmp_df.plot(kind='bar', figsize=(12,5))
plt.title("POS Frequency Comparison: SpaCy vs NLTK (Universal)")
plt.xlabel("POS tag")
plt.ylabel("Count")
plt.tight_layout()
plt.show()


In [None]:
# Quick feature: compute verb + AUX + ADV counts and compare urgent vs non-urgent
def verb_adv_score(counter):
    """Sum the counts stored under 'VERB', 'AUX' and 'ADV' in `counter`.

    Tags missing from `counter` contribute 0.
    """
    action_tags = ('VERB', 'AUX', 'ADV')
    return sum(counter.get(tag, 0) for tag in action_tags)

# Score every transcript from its spaCy POS counts
df['spacy_verb_adv_score'] = df['spacy_pos_counts'].apply(verb_adv_score)

# One bar per transcript, colored by the urgent flag
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=df, x='id', y='spacy_verb_adv_score', hue='urgent', ax=ax)
ax.set_title("Verb/Aux/Adv score per transcript (higher often indicates urgency)")
ax.set_xlabel("Transcript ID")
ax.set_ylabel("Verb/Aux/Adv count")
ax.legend(title='Urgent')
fig.tight_layout()
plt.show()
