# Natural Language Processing (NLP) — One command per code cell with purpose
This notebook-style script lists commonly used NLP commands across popular libraries (NLTK, spaCy, TextBlob, scikit-learn, Gensim, Transformers, Sentence-Transformers). Each code cell contains one primary command with an inline comment describing its purpose. Heavy downloads and network I/O are commented.


# Installation (commented) — run in your environment if needed
# - Only install what you need

In [None]:
# !pip install nltk spacy textblob gensim scikit-learn transformers sentence-transformers spacy-lookups-data


# Optional model/corpus downloads (commented)
# - NLTK data: tokenizers, stopwords, WordNet, POS tagger, NE chunker, word list
# - spaCy model: small English model for POS/NER/lemma

In [None]:
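# import nltk; nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet'); nltk.download('averaged_perceptron_tagger'); nltk.download('maxent_ne_chunker'); nltk.download('words')  # comment/uncomment per need (NLTK >= 3.9 expects 'punkt_tab', 'averaged_perceptron_tagger_eng' and 'maxent_ne_chunker_tab' in place of the older resource names)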
# !python -m spacy download en_core_web_sm  # download spaCy English model (commented)


# Setup: imports and sample texts

In [None]:
import re  # regex utilities for simple text cleanup


In [None]:
from pprint import pprint  # pretty printing helper for small outputs


In [None]:
# Demo text with one sentence per line. The line breaks come from the literal
# itself, so no explicit "\n" escapes are used (they would add blank lines).
sample_text = """Natural Language Processing (NLP) enables computers to understand human language.
It includes tokenization, stopwords removal, stemming, lemmatization, POS tagging, and NER.
Apple is looking at buying U.K. startup for $1 billion.
SpaCy and NLTK are common Python libraries for NLP!"""  # small multi-line text for demos


# NLTK — imports

In [None]:
import nltk  # main NLTK package for classic NLP tasks


In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize  # tokenizers for words and sentences


In [None]:
from nltk.corpus import stopwords  # stop words list (requires nltk.download('stopwords'))


In [None]:
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer  # stemming and lemmatization tools


# Tokenization

In [None]:
sent_tokenize(sample_text)  # split text into sentences


In [None]:
word_tokenize(sample_text)  # split text into word tokens (punctuation as separate tokens)


# Stop words

In [None]:
stopwords.words('english')  # get built-in English stop words list


In [None]:
tokens = word_tokenize(sample_text)  # tokenize once to reuse in later cells


In [None]:
{w.lower() for w in tokens if w.isalpha()}  # unique alphabetic tokens lowercased (simple normalization)


In [None]:
english_stops = set(stopwords.words('english'))  # build the stop-word set once instead of once per token
{w for w in tokens if w.isalpha() and w.lower() not in english_stops}  # remove stop words (toy example)


# Stemming and Lemmatization

In [None]:
PorterStemmer().stem('running')  # reduce word to its stem/root via Porter stemmer


In [None]:
SnowballStemmer('english').stem('studies')  # stem using Snowball (aka Porter2) stemmer


In [None]:
WordNetLemmatizer().lemmatize('better', pos='a')  # lemmatize (needs WordNet; pos helps with correct lemma)


In [None]:
porter = PorterStemmer()  # create the stemmer once and reuse it for every token
[porter.stem(w) for w in tokens if w.isalpha()]  # stem a list of tokens (demo list comprehension)


In [None]:
lemmatizer = WordNetLemmatizer()  # create the lemmatizer once and reuse it for every token
[lemmatizer.lemmatize(w) for w in tokens if w.isalpha()]  # lemmatize a list of tokens


# N-grams and POS/NER

In [None]:
list(nltk.ngrams([w.lower() for w in tokens if w.isalpha()], 2))  # generate bigrams over alphabetic tokens


In [None]:
nltk.pos_tag(tokens)  # part-of-speech tagging for tokens (requires tagger data)


In [None]:
nltk.ne_chunk(nltk.pos_tag(word_tokenize("Apple is buying a startup in U.K.")))  # NLTK named entity chunking (tree output)


# Text normalization helpers (regex)

In [None]:
re.sub(r"[^\w\s]", "", sample_text.lower())  # lowercase and strip punctuation (very simple cleaning)


# TextBlob — quick sentiment, tokens, noun phrases

In [None]:
from textblob import TextBlob  # high-level text processing library


In [None]:
TextBlob(sample_text).sentiment  # polarity (-1..1) and subjectivity (0..1)


In [None]:
TextBlob("I absolutely love natural language processing!").sentences  # sentence objects from text


In [None]:
TextBlob("Cats are running and ate fishes").correct()  # spelling correction (toy) — may be slow on large text


In [None]:
TextBlob("New York City is great.").noun_phrases  # noun phrase extraction


In [None]:
# TextBlob("Bonjour le monde").translate(to='en')  # translation (uses web APIs; commented to avoid network)


# spaCy — models and processing
# Note: Accurate POS/NER requires a trained model (e.g., en_core_web_sm). We'll show a blank pipeline as safe default.

In [None]:
import spacy  # industrial-strength NLP library


In [None]:
nlp = spacy.blank('en')  # create a blank English pipeline (no trained components)


In [None]:
# nlp = spacy.load('en_core_web_sm')  # load small English model for POS/NER/lemma (uncomment after installing)


In [None]:
doc = nlp("Apple is looking at buying U.K. startup for $1 billion.")  # process text to a Doc object


In [None]:
[t.text for t in doc]  # token texts from spaCy Doc


In [None]:
doc[0].lemma_  # lemma of the first token (empty when no lemmatizer component is available)


In [None]:
[(ent.text, ent.label_) for ent in doc.ents]  # named entities (empty for blank model)


In [None]:
[(t.text, t.pos_) for t in doc]  # (text, POS tag) pair per token; with the blank pipeline each tag is expected to be an empty string (the list itself is not empty)


In [None]:
# Noun chunks are derived from the dependency parse; iterating doc.noun_chunks without one raises.
# Fall back to an empty list so this cell also runs with the blank pipeline.
[chunk.text for chunk in doc.noun_chunks] if doc.has_annotation("DEP") else []  # noun chunks (requires parser in loaded model)


# scikit-learn — Bag of Words (CountVectorizer) and TF-IDF (TfidfVectorizer)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # classic vectorizers for text


In [None]:
corpus = [
    "NLP enables computers to understand language",
    "Language models perform tokenization and lemmatization",
    "Named entity recognition is a common NLP task",
]  # tiny corpus of 3 documents


In [None]:
CountVectorizer(max_features=20).fit_transform(corpus).toarray()  # bag-of-words document-term matrix


In [None]:
TfidfVectorizer(ngram_range=(1,2), stop_words='english').fit_transform(corpus).toarray()  # TF-IDF with unigrams+bigrams


In [None]:
CountVectorizer(min_df=1, max_df=1.0, binary=True).fit(corpus).get_feature_names_out()  # learned vocabulary terms


In [None]:
tfidf = TfidfVectorizer().fit_transform(corpus)  # compute TF-IDF matrix (sparse)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity  # pairwise similarity for vectors


In [None]:
cosine_similarity(tfidf)  # document-to-document cosine similarity matrix using TF-IDF features


# Similarity search (scikit-learn) — NearestNeighbors with cosine distance

In [None]:
from sklearn.neighbors import NearestNeighbors  # k-NN search over vector spaces


In [None]:
nn = NearestNeighbors(metric='cosine', n_neighbors=2).fit(tfidf)  # fit nearest neighbors index on TF-IDF vectors


In [None]:
# The query must live in the same feature space as the indexed documents: fit on `corpus`
# (same default settings as `tfidf`) and only transform the query. Fitting a fresh vectorizer
# on the query alone yields a different vocabulary and a feature-count mismatch.
nn.kneighbors(TfidfVectorizer().fit(corpus).transform(["NLP and language processing"]))  # find nearest docs to a query


# Gensim — Word2Vec training and usage

In [None]:
from gensim.models import Word2Vec  # Word2Vec implementation (CBOW/Skip-gram)


In [None]:
# Each sentence is written as a plain string and split on whitespace into its token list.
toy_sentences = [
    sentence.split()
    for sentence in (
        "nlp enables computers understand language",
        "language models perform tokenization",
        "named entity recognition is nlp",
    )
]  # toy tokenized sentences


In [None]:
# Trains on the three toy sentences above; min_count=1 keeps every word despite the tiny corpus.
# sg=1 selects skip-gram (per the Gensim docs; any other value means CBOW).
# seed=42 with workers=1 aims at repeatable results.
# NOTE(review): Gensim's docs say full reproducibility across interpreter launches also needs a fixed PYTHONHASHSEED; confirm if exact vectors matter.
w2v = Word2Vec(toy_sentences, vector_size=50, window=2, min_count=1, workers=1, sg=1, seed=42)  # train a tiny Word2Vec model


In [None]:
w2v.wv.most_similar("nlp")  # query most similar words to a given token in the trained space


# GloVe vectors (pretrained) — loading with Gensim KeyedVectors (commented for safety)

In [None]:
# from gensim.models import KeyedVectors
# # Convert GloVe to word2vec format once, or use ready-made files
# # kv = KeyedVectors.load_word2vec_format('glove.6B.50d.word2vec.txt', binary=False)  # path required
# # kv.most_similar('king')  # example lookup (commented)


# Transformers — tokenizers and embeddings (commented to avoid downloads)

In [None]:
# from transformers import AutoTokenizer, AutoModel
# tok = AutoTokenizer.from_pretrained('distilbert-base-uncased')  # load tokenizer (downloads model files)


In [None]:
# mdl = AutoModel.from_pretrained('distilbert-base-uncased')  # load transformer model (downloads weights)


In [None]:
# enc = tok("NLP with BERT embeddings", return_tensors='pt')  # tokenize text to model inputs (PyTorch tensors)


In [None]:
# mdl(**enc).last_hidden_state.mean(dim=1)  # pooled embedding by averaging last hidden states (toy example)


# Transformers — feature extraction pipeline (commented)

In [None]:
# from transformers import pipeline
# fe = pipeline('feature-extraction', model='distilbert-base-uncased')  # build feature extraction pipeline


In [None]:
# fe("Semantic embeddings are useful for search")  # extract per-token embeddings (list of vectors)


# Sentence-Transformers — sentence embeddings and semantic similarity
# Note: Downloads a small model on first use; uncomment to run.

In [None]:
# from sentence_transformers import SentenceTransformer, util
# sbert = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')  # compact, fast sentence embedding model


In [None]:
# emb = sbert.encode(["NLP is great", "I love language processing", "Cats and dogs"], normalize_embeddings=True)  # encode texts


In [None]:
# util.cos_sim(emb, emb)  # cosine similarity matrix between sentences


In [None]:
# util.semantic_search(emb[0:1], emb, top_k=2)  # semantic search: top-2 sentences most similar to the first


# Named Entity Recognition (spaCy model) — accurate (commented; requires model)

In [None]:
# nlp_sm = spacy.load('en_core_web_sm')  # load small English model


In [None]:
# [(ent.text, ent.label_) for ent in nlp_sm("Apple is buying a U.K. startup for $1 billion").ents]  # extract entities


# End-to-end pattern: Preprocess → Vectorize → Similarity (tiny example)

In [None]:
def normalize(txt: str) -> str:
    """Return *txt* lowercased, without punctuation, and with whitespace collapsed.

    Characters that are neither word characters nor whitespace are removed,
    each run of whitespace becomes a single space, and leading/trailing
    whitespace is stripped.
    """
    lowered = txt.lower()
    without_punct = re.sub(r"[^\w\s]", "", lowered)  # drop punctuation/symbols
    single_spaced = re.sub(r"\s+", " ", without_punct)  # collapse whitespace runs
    return single_spaced.strip()


In [None]:
norm_corpus = [normalize(d) for d in corpus]  # apply simple normalization to corpus


In [None]:
tfidf2 = TfidfVectorizer().fit_transform(norm_corpus)  # build TF-IDF on normalized corpus


In [None]:
# The query has to be vectorized with the vocabulary learned from `norm_corpus`; a vectorizer
# fitted on the query alone yields a different number of features and cosine_similarity fails.
cosine_similarity(tfidf2, TfidfVectorizer().fit(norm_corpus).transform([normalize("NLP language models")]))  # similarity of query to corpus


# Notes
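1. Many commands require prior downloads (NLTK corpora, spaCy models, Transformers weights). The download and model-loading commands are commented out to avoid network I/O; run them once before executing the cells that depend on them (for example, the NLTK tokenizer, stop-word, and tagger cells).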
2. For high-quality POS/NER, prefer `spacy.load('en_core_web_sm')` (or larger) over a blank pipeline.
3. Use `NearestNeighbors(metric='cosine')` or FAISS/Annoy for scalable semantic search over embeddings.
4. Replace toy corpora and examples with your datasets as you study.
