# 02 — Text Preprocessing Pipeline

This notebook mirrors **notes/02-text-preprocessing.md** and builds a small pipeline.

In [None]:
# !pip install nltk scikit-learn
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import word_tokenize
# nltk.download("punkt"); nltk.download("stopwords"); nltk.download("wordnet")
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Four short sample sentences used as the toy input for the rest of the notebook.
corpus = [
    "Hey, buddy, I want to go to your house?",  # commas and a question mark
    "WIN big $$$ now!!! Limited-time offer.",  # upper-case word, symbols, repeated punctuation
    "Reminder: Your meeting is at 3:30pm.",  # colon and a clock time
    "I do not like this product at all.",  # contains the negation "not"
]

In [None]:
def normalize(text):
    """Lower-case ``text``, mask URLs and e-mail addresses, and tidy whitespace.

    Args:
        text: Input string.

    Returns:
        The lower-cased string with every URL replaced by ``<URL>``, every
        ``something@something`` run replaced by ``<EMAIL>``, whitespace runs
        collapsed to a single space, and leading/trailing whitespace removed.
    """
    # Substitutions run in this order on the already lower-cased text, so the
    # upper-case placeholders are not affected by the lower-casing step.
    substitutions = (
        (r"https?://\S+|www\.\S+", " <URL> "),
        (r"\S+@\S+", " <EMAIL> "),
    )
    masked = text.lower()
    for pattern, placeholder in substitutions:
        masked = re.sub(pattern, placeholder, masked)
    collapsed = re.sub(r"\s+", " ", masked)
    return collapsed.strip()

In [None]:
# English stop-word list, held as a set for membership tests.
stop = set(stopwords.words("english"))
# Tokens that preprocess_tokens keeps even when they appear in the stop-word set.
negators = {"not", "no", "never", "n't"}
# Shared stemmer and lemmatizer instances reused for every token.
porter = PorterStemmer()
lemma = WordNetLemmatizer()

In [None]:
def preprocess_tokens(text, remove_stopwords=True, stem=False, lemmatize=False):
    """Normalize ``text``, tokenize it, then optionally filter and reduce tokens.

    Args:
        text: Raw input string; it is passed through ``normalize`` first.
        remove_stopwords: When true, drop tokens that are in ``stop`` unless
            they are also in ``negators``.
        stem: When true, apply ``porter.stem`` to each kept token.
        lemmatize: When true, apply ``lemma.lemmatize`` to each kept token
            (after stemming when both flags are set).

    Returns:
        The kept tokens, in their original order, as a list.
    """

    def _is_kept(token):
        # Negators survive stop-word removal even if the stop list contains them.
        return not (remove_stopwords and token in stop and token not in negators)

    def _reduce(token):
        if stem:
            token = porter.stem(token)
        if lemmatize:
            token = lemma.lemmatize(token)
        return token

    tokens = word_tokenize(normalize(text))
    return [_reduce(token) for token in tokens if _is_kept(token)]

# Show each sample sentence next to its token list (default options).
for sentence in corpus:
    print(sentence, "->", preprocess_tokens(sentence))

In [None]:
# Re-join each document's tokens into a single space-separated string.
joined = [" ".join(tokens) for tokens in map(preprocess_tokens, corpus)]

# Count unigrams and bigrams over the joined documents.
# NOTE(review): the vectorizer is created with default tokenization settings, so
# it tokenizes `joined` again on its own terms — confirm that is intended.
cv = CountVectorizer(ngram_range=(1, 2))
X = cv.fit_transform(joined)
X.shape

In [None]:
# Same unigram + bigram range as the count-based cell, fitted on `joined` with TfidfVectorizer.
tf = TfidfVectorizer(ngram_range=(1, 2))
Xtf = tf.fit_transform(joined)
Xtf.shape