## Practical 2 – Traditional Linguistic Methods

In [None]:

# Install or upgrade the third-party packages imported by the cells in this notebook.
%pip install -U spacy nltk jieba transformers
import sys
import subprocess
# Run spaCy's model downloader with the same interpreter that runs this kernel.
# check=False: a non-zero exit status from the download does not raise here.
subprocess.run([sys.executable, '-m', 'spacy', 'download', 'en_core_web_sm'], check=False)


### Level 1 – Core Tasks

In [None]:

import re

# Baseline sample: e-mail addresses and hashtags mixed with English and Chinese text.
base_text = (
    "Email: jane.doe@uni.edu NLP AI 机器学习 <nlp@northampton.ac.uk> "
    "Message us at pro-team@dept.co.uk and tag #NLP #RegEx."
)
# Extended sample: the baseline plus multi-dot e-mail domains and Chinese hashtags.
updated_text = "".join([
    base_text,
    " Connect via outreach@cs.school.ac.uk and extra@multi.research.lab.uk",
    " 加入话题 #人工智能 #机器学习",
])

pattern_email = r"\b[\w\.-]+@[\w\.-]+\.\w+\b"
pattern_hashtag = r"(?<!\w)#[\w]+"

# Apply each pattern to the baseline text first, then to the extended text.
emails_base, emails_updated = (
    re.findall(pattern_email, passage) for passage in (base_text, updated_text)
)
hashtags_base, hashtags_updated = (
    re.findall(pattern_hashtag, passage) for passage in (base_text, updated_text)
)

for caption, matches in (
    ("Base emails:", emails_base),
    ("Base hashtags:", hashtags_base),
    ("Updated emails:", emails_updated),
    ("Updated hashtags:", hashtags_updated),
):
    print(caption, matches)

print(
    "Counts → base emails:", len(emails_base),
    "base hashtags:", len(hashtags_base),
)
print(
    "Counts → updated emails:", len(emails_updated),
    "updated hashtags:", len(hashtags_updated),
)


*Emails remain detectable with multi-dot domains; the Unicode-friendly hashtag regex still captures Chinese tags, while ASCII-only classes would miss them.*

In [None]:

import nltk
import spacy
from nltk.tokenize import word_tokenize

# Fetch both Punkt resource names used by NLTK's tokenizer.
for punkt_resource in ('punkt_tab', 'punkt'):
    nltk.download(punkt_resource)

# Load the small English spaCy pipeline; if loading raises OSError,
# download the model once and load it again.
try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    from spacy.cli import download as download_spacy_model
    download_spacy_model('en_core_web_sm')
    nlp = spacy.load('en_core_web_sm')

# Test sentences for the tokeniser comparison: abbreviations, possessives,
# hyphens, quoted speech, a double dash, a version number and a clock time.
sentences = [
    "Dr. Olu's AI-based model outperforms others in 2025.",
    # The inner double quotes must be escaped: left bare they terminate the
    # string literal early and the whole cell fails with a SyntaxError.
    "He said, \"Let's deploy version 2.0--no delays this time,\" before leaving at 5:45 p.m."
]

# Print the spaCy and NLTK tokenisations of every sentence, one block per sentence.
divider = "-" * 60
for sentence in sentences:
    spacy_tokens = [tok.text for tok in nlp(sentence)]
    nltk_tokens = word_tokenize(sentence)
    for caption, shown in (
        ("Sentence:", sentence),
        ("spaCy :", spacy_tokens),
        ("NLTK  :", nltk_tokens),
    ):
        print(caption, shown)
    print(divider)


*spaCy keeps hyphenated compounds and contractions tighter than the default NLTK splitter, which over-fragments punctuation-heavy phrases.*

In [None]:

from nltk.stem import PorterStemmer, WordNetLemmatizer
# Download the WordNet corpora before lemmatising.
for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

porter, lemmatizer = PorterStemmer(), WordNetLemmatizer()
base_words = ["studies", "studying", "better", "ate", "flies"]

# One line per word: word → Porter stem → lemma (each word looked up as a verb).
for word in base_words:
    stem = porter.stem(word)
    lemma = lemmatizer.lemmatize(word, "v")
    print(word, "→", stem, "→", lemma)


*Lemmatisation preserves verbal semantics (`ate` → `eat`), whereas the stemmer collapses forms aggressively—useful for recall-heavy IR despite semantic loss.*

In [None]:

import jieba

# Segment a Chinese sentence with jieba and check the pieces rebuild the input exactly.
text_cn = "我喜欢学习人工智能"
segments = [piece for piece in jieba.cut(text_cn)]
print(segments)
assert text_cn == ''.join(segments)


*Chinese lacks whitespace token boundaries, so segmentation is mandatory before models expecting token streams can operate.*

### Level 2 – Extended Tasks

In [None]:

import re

def extract_handles_loose(text):
    """Return @handles that begin the string or directly follow whitespace.

    A handle is ``@`` followed by 1-15 ASCII letters, digits or underscores
    and must end at a word boundary. Handles glued to preceding punctuation
    (``.@user``) are not matched by this variant.

    Args:
        text: The string to search.

    Returns:
        list[str]: Matched handles, including the leading ``@``, in order.
    """
    handle_re = re.compile(r"(?:(?<=\s)|^)@[A-Za-z0-9_]{1,15}\b")
    return [hit.group(0) for hit in handle_re.finditer(text)]

def extract_handles(text):
    """Return @handles that are not immediately preceded by a word character.

    The negative lookbehind rejects the ``@`` inside e-mail addresses
    (``team@uni.edu``) while still accepting handles that follow punctuation
    (``.@user``). A handle is ``@`` plus 1-15 ASCII letters, digits or
    underscores ending at a word boundary.

    Args:
        text: The string to search.

    Returns:
        list[str]: Matched handles, including the leading ``@``, in order.
    """
    handle_re = re.compile(r"(?<!\w)@[A-Za-z0-9_]{1,15}\b")
    return [hit.group(0) for hit in handle_re.finditer(text)]

# Input mixing real handles with e-mail addresses that must not be reported.
sample = "Follow @nlp_lab and @Research_AI. Email team@uni.edu or admin@example.com."
# Input where one handle directly follows a full stop.
punct_sample = "Nice work.@edge_user! Ping @ok_team next."

improved_sample_handles = extract_handles(sample)
improved_punct_handles = extract_handles(punct_sample)

print("Loose sample:", extract_handles_loose(sample))
print("Improved sample:", improved_sample_handles)
print("Loose punctuation:", extract_handles_loose(punct_sample))
print("Improved punctuation:", improved_punct_handles)

# The improved extractor must skip e-mails and keep the punctuation-adjacent handle.
assert improved_sample_handles == ['@nlp_lab', '@Research_AI']
assert improved_punct_handles == ['@edge_user', '@ok_team']


*Negative lookbehind blocks email matches while still accepting punctuation-adjacent handles; accepting `.` via a lookbehind tweak catches `.@user` cases.*

In [None]:

# Reuse the spaCy pipeline if the name `nlp` is already bound; otherwise load it here.
try:
    nlp
except NameError:
    # `nlp` is not defined yet, so import spaCy and load the English model.
    import spacy
    nlp = spacy.load('en_core_web_sm')

import re

def simple_tokenize(text):
    """Split text into word, number and punctuation tokens with a single regex.

    Alternatives are tried in this order at each position:
      1. hyphenated compounds of letters (``co-authored``),
      2. words with an optional apostrophe suffix (``didn't``),
      3. runs of digits,
      4. any single remaining non-space character (punctuation or ``_``).

    Letters and digits are matched Unicode-aware, and ``_`` is emitted as its
    own token, so every non-whitespace character ends up in some token.
    ASCII-only classes would skip accented or non-Latin letters and
    underscores without any warning. ASCII input without underscores
    tokenises exactly as it would with ``[A-Za-z]`` / ``[0-9]``.

    Args:
        text: The string to tokenise.

    Returns:
        list[str]: Tokens in order of appearance; whitespace is discarded.
    """
    letter = r"[^\W\d_]"  # a word character that is neither a digit nor "_"
    compound = letter + r"+(?:-" + letter + r"+)+"
    word = letter + r"+(?:'" + letter + r"+)?"
    # Compound comes before word so that "co-authored" is not cut at the hyphen.
    pattern = "|".join((compound, word, r"\d+", r"[^\w\s]", "_"))
    return re.findall(pattern, text)

# Compare the regex tokenizer with spaCy on a sentence containing initials,
# a hyphenated compound and a contraction.
text = "Dr. A. I. Jones co-authored a study, didn't he?"
simple_tokens = simple_tokenize(text)
print("Simple:", simple_tokens)
spacy_view = [tok.text for tok in nlp(text)]
print("spaCy :", spacy_view)
# The hyphenated compound must come out as a single token.
assert "co-authored" in simple_tokens


*The augmented regex keeps hyphenated compounds intact; spaCy still provides richer linguistic features, but the custom rule curbs token inflation for analytical scripts.*

In [None]:

extended_words = ["studies", "studying", "better", "ate", "flies", "running", "mice"]
# The WordNet lemmatizer only applies the rules of the part of speech it is given.
# Looked up as a verb ("v"), the irregular noun plural "mice" comes back unchanged,
# so it is looked up as a noun ("n") to reduce to "mouse".
# Every other word keeps the verb reading.
noun_words = {"mice"}
for word in extended_words:
    pos = "n" if word in noun_words else "v"
    print(word, "→", porter.stem(word), "→", lemmatizer.lemmatize(word, pos))


*Lemma outputs (`running` → `run`, `mice` → `mouse`) stay interpretable; stems flatten irregular plurals, which might be tolerable in lightweight retrieval systems.*

In [None]:

# Toy dictionary of known words for the greedy longest-match segmenter.
LEX = {
    "我", "喜欢", "学习",
    "人工智能", "自然语言", "处理",
}

def max_match(text, lexicon):
    """Segment ``text`` left to right, always taking the longest lexicon entry.

    At each position the longest substring found in ``lexicon`` is emitted.
    If no substring starting there is in the lexicon, the single character at
    that position is emitted instead.

    Args:
        text: The unsegmented string.
        lexicon: Container of known words, queried with ``in``.

    Returns:
        list: The segments in order; joined together they reproduce ``text``.
    """
    pieces = []
    start = 0
    while start < len(text):
        # Shrink the candidate span from the right until it is a known word
        # or nothing is left.
        stop = len(text)
        while stop > start and text[start:stop] not in lexicon:
            stop -= 1
        if stop == start:
            # No entry begins here: emit one character and move on.
            pieces.append(text[start])
            start += 1
        else:
            pieces.append(text[start:stop])
            start = stop
    return pieces

# Segment the same sentence with the greedy matcher and with jieba.
sentence = "我喜欢学习人工智能"
greedy_before = max_match(sentence, LEX)
print("Greedy:", greedy_before)
print("Jieba :", [seg for seg in jieba.cut(sentence)])
# Add one longer compound to the shared lexicon (in place), then segment again.
LEX.add("自然语言处理")
print("Greedy with extended lexicon:", max_match(sentence, LEX))


*Greedy longest-match falls back to single characters without dictionary coverage; enriching the lexicon narrows the gap but remains brittle versus statistical segmenters.*

In [None]:

from transformers import GPT2Tokenizer

# Count how many GPT-2 tokens an English sentence and a Chinese sentence each produce.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
english = "I love natural language processing."
chinese = "我喜欢自然语言处理。"
for caption, passage in (("EN tokens:", english), ("ZH tokens:", chinese)):
    print(caption, len(tokenizer.tokenize(passage)))


*Subword tokenisers inflate non-Latin scripts, so pay-per-token pricing can disadvantage certain languages—fairness demands either normalisation or pricing adjustments.*

In [17]:

import re

# Markdown links: group 1 captures the label, group 2 the http(s) URL.
markdown_pattern = re.compile(r"\[([^\]]+)\]\((https?://[^\s)]+)\)")
# Hex colours in 3-digit or 6-digit form, ending at a word boundary.
hex_pattern = re.compile(r"#(?:[0-9A-Fa-f]{3}|[0-9A-Fa-f]{6})\b")

markdown_test = "See [NLP](https://example.org) and [Docs](https://docs.example.org)."
hex_test = "Palette: #09F, #a1b2c3, and #123456."

print([link.groups() for link in markdown_pattern.finditer(markdown_test)])
print([colour.group(0) for colour in hex_pattern.finditer(hex_test)])

# Dates: year 19xx/20xx, month 01-12, day 01-31, separated by "-", "/" or ".".
date_pattern = re.compile(r"\b(19|20)\d{2}[-/.](0[1-9]|1[0-2])[-/.](0[1-9]|[12]\d|3[01])\b")
# 24-hour times HH:MM with an optional :SS part.
time_pattern = re.compile(r"\b(?:[01]?\d|2[0-3]):[0-5]\d(?::[0-5]\d)?\b")

dates = ["2024-03-18", "1999/12/31", "2100-01-01"]
times = ["23:59", "07:30:15", "24:01"]

# Report, for every candidate, whether the whole string matches its pattern.
for validator, candidates in ((date_pattern, dates), (time_pattern, times)):
    for candidate in candidates:
        print(candidate, validator.fullmatch(candidate) is not None)


[('NLP', 'https://example.org'), ('Docs', 'https://docs.example.org')]
['#09F', '#a1b2c3', '#123456']
2024-03-18 True
1999/12/31 True
2100-01-01 False
23:59 True
07:30:15 True
24:01 False


*Markdown regex captures `[label](url)` syntax; the hex matcher supports short and full forms. The date regex restricts to 1900–2099, while the time regex covers 24-hour clocks with optional seconds.*

In [16]:

from typing import List, Set

# English lexicon for the greedy segmenter. It holds whole words alongside
# the shorter pieces they could be split into.
lexicon_en = {
    "bed", "bath", "bedbath", "and", "beyond",
    "the", "rapist", "therapist",
    "artificial", "intelligence", "art", "ificial",
}

def greedy_segment(text: str, lexicon: Set[str]) -> List[str]:
    """Segment ``text`` left to right with greedy longest-match lookup.

    At each position the longest substring present in ``lexicon`` is taken.
    When nothing starting there is in the lexicon, the single character at
    that position is emitted, so the segments always rebuild ``text``.

    Candidates longer than the longest lexicon entry can never match, so the
    search window is capped at that length rather than scanning every suffix.

    Args:
        text: The unsegmented string.
        lexicon: Set of known words.

    Returns:
        The segments in order of appearance.
    """
    # Loop-invariant upper bound on a match length; 0 for an empty lexicon.
    longest_entry = max((len(entry) for entry in lexicon), default=0)
    segments: List[str] = []
    index = 0
    while index < len(text):
        furthest = min(len(text), index + longest_entry)
        for end in range(furthest, index, -1):
            candidate = text[index:end]
            if candidate in lexicon:
                segments.append(candidate)
                index = end
                break
        else:
            # No lexicon entry starts here: fall back to one character.
            segments.append(text[index])
            index += 1
    return segments

# Each unsegmented input is paired with the lexicon used to segment it.
examples = dict.fromkeys(
    ("bedbathandbeyond", "therapist", "artificialintelligence"),
    lexicon_en,
)

for sample_text, dictionary in examples.items():
    print(sample_text, "→", greedy_segment(sample_text, dictionary))

# Copy of the lexicon with one extra whole-phrase entry; the original set is untouched.
patched_lexicon_en = lexicon_en | {"bedbathandbeyond"}

print("Patched lexicon examples:")
for sample_text in examples:
    print(sample_text, "→", greedy_segment(sample_text, patched_lexicon_en))


bedbathandbeyond → ['bedbath', 'and', 'beyond']
therapist → ['therapist']
artificialintelligence → ['artificial', 'intelligence']
Patched lexicon examples:
bedbathandbeyond → ['bedbathandbeyond']
therapist → ['therapist']
artificialintelligence → ['artificial', 'intelligence']


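*English greedy segmentation behaves like the Chinese variant: it succeeds when the lexicon holds the right entries — adding the multi-word entry `bedbathandbeyond` makes the patched run return the whole string as one token. Ambiguous strings would misfire (`therapist` → `the`, `rapist`) if the composite were missing; in the runs above the lexicon already contains `therapist`, so longest-match returns it intact.*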

In [18]:
# Base lexicon for the greedy matcher defined below.
LEX = {
    "我", "喜欢", "学习",
    "人工智能", "自然语言", "处理",
}

def max_match(s, lex):
    """Greedy longest-match segmentation of ``s`` against the lexicon ``lex``.

    From each position the longest substring contained in ``lex`` is taken.
    If none exists, the single character at that position is emitted.

    Args:
        s: The unsegmented string.
        lex: Container of known words, queried with ``in``.

    Returns:
        list: The segments in order of appearance.
    """
    tokens = []
    pos = 0
    length = len(s)
    while pos < length:
        # First (i.e. longest) end offset whose slice is a known word, if any.
        matched_end = next(
            (stop for stop in range(length, pos, -1) if s[pos:stop] in lex),
            None,
        )
        if matched_end is None:
            tokens.append(s[pos])
            pos += 1
        else:
            tokens.append(s[pos:matched_end])
            pos = matched_end
    return tokens


In [20]:
import jieba
# Segment one sentence with the greedy matcher and with jieba for a side-by-side view.
sent = "我喜欢学习人工智能"
greedy_tokens = max_match(sent, LEX)
print("Greedy:", greedy_tokens)
jieba_tokens = [tok for tok in jieba.cut(sent)]
print("Jieba :", jieba_tokens)

Greedy: ['我', '喜欢', '学习', '人工智能']
Jieba : ['我', '喜欢', '学习', '人工智能']
