In [None]:
# NLP use cases: chatbot, voice-to-voice bot, text corpus -> sentiment analysis, chatbot

# LLM -> NLP -> chatbot (RAG), sentiment analysis (LLM)

# Text Preprocessing with Hugging Face
---

## 📌 Introduction to Text Preprocessing
Text preprocessing is one of the **first steps in Natural Language Processing (NLP)**. It helps convert messy, raw text into clean and structured data that machines can understand.

Why do we need it?
- Removes noise
- Standardizes text
- Prepares data for tokenization, embeddings, and modeling

## 🔹 What is Tokenization?
Tokenization is the process of **breaking text into smaller units (tokens)** such as words, subwords, or sentences.

For example:
```
Text: "Hugging Face makes NLP easy."
Tokens: ["Hugging", "Face", "makes", "NLP", "easy", "."]
```

In [None]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint.
# `tokenizer` is a notebook-level global that later cells reuse.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Sample sentence; `text` is reused by the word-splitting and encoding cells.
text = "Hugging Face makes NLP easy."
# tokenize() returns the token strings only (the printed result contains no
# special tokens and no ids).
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

Tokens: ['hugging', 'face', 'makes', 'nl', '##p', 'easy', '.']


In [None]:
# unbelievable -> split into smaller parts |

# Dictionary ()

# un , ##believ , ##able

In [None]:
# Token types: word-based, subword, special ([CLS], [SEP], [PAD], [UNK], [MASK])

## 🔹 Breaking Text into Words or Sentences

In [None]:
# Word-level splitting: str.split() with no arguments splits on whitespace only,
# so punctuation stays attached to the neighbouring word (e.g. 'easy.').
words = text.split()
print("Words:", words)

Words: ['Hugging', 'Face', 'makes', 'NLP', 'easy.']


In [None]:
# Word-level splitting (same code as the previous cell): whitespace split only,
# punctuation is not separated from words.
words = text.split()
print("Words:", words)

# Sentence-level splitting with NLTK's sentence tokenizer.
import nltk
# Download the tokenizer data before sent_tokenize is used.
nltk.download('punkt')
nltk.download('punkt_tab') # Also downloaded; presumably required by newer NLTK releases — confirm
from nltk.tokenize import sent_tokenize

# sent_tokenize returns one string per detected sentence.
sentences = sent_tokenize("Hugging Face makes NLP easy. It provides transformers!")
print("Sentences:", sentences)

Words: ['Hugging', 'Face', 'makes', 'NLP', 'easy.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Sentences: ['Hugging Face makes NLP easy.', 'It provides transformers!']


In [None]:
# Encode the sample sentence and request character offsets: the result holds one
# (start, end) pair per token, indexing into `text`.
# The (0, 0) pairs at both ends of the printed list presumably belong to the
# special tokens the tokenizer adds around the sentence — confirm.
encoding = tokenizer(text, return_offsets_mapping=True)
print("Offsets:", encoding["offset_mapping"])

Offsets: [(0, 0), (0, 7), (8, 12), (13, 18), (19, 21), (21, 22), (23, 27), (27, 28), (0, 0)]


## 🔹 Lemmatization
**Lemmatization** reduces words to their **base or dictionary form (lemma)**.
- "running" → "run"
- "better" → "good" (note: in the SpaCy example below, the isolated word "better" is lemmatized to "well")

In [None]:
# text = ran, run, running -> run (reduces data variation)

In [None]:
import spacy

# Load SpaCy's small English pipeline; `nlp` is reused by the preprocessing
# functions defined later in this notebook.
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over a few inflected words and collect each token's lemma.
doc = nlp("running runners better cats")
lemmas = [token.lemma_ for token in doc]
print("Lemmas:", lemmas)

Lemmas: ['run', 'runner', 'well', 'cat']


In [None]:
# One line per token: surface form, lemma and coarse part-of-speech tag.
for tok in doc:
    fields = (tok.text, "->", tok.lemma_, "| POS:", tok.pos_)
    print(" ".join(fields))

running -> run | POS: VERB
runners -> runner | POS: NOUN
better -> well | POS: ADJ
cats -> cat | POS: NOUN


## 🔹 Stopwords
**Stopwords** are common words like *is, the, in, on, a* that do not add much meaning to the text.

Why remove them?
- Reduce noise
- Focus on important words

In [None]:
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus (`nltk` itself is imported in an earlier cell).
nltk.download('stopwords')

# Keep the English stopwords in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from nltk.corpus import stopwords
# Make sure the NLTK stopword corpus is present, then load the English list.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Keep only the tokens whose lowercase form is not a stopword.
words = ["this", "is", "an", "example", "sentence"]
filtered_words = list(filter(lambda word: word.lower() not in stop_words, words))
print("Without Stopwords:", filtered_words)

Without Stopwords: ['example', 'sentence']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Import the stop-word set explicitly. Reaching it through the attribute chain
# `spacy.lang.en.stop_words` only works when that submodule has already been
# imported as a side effect of something else (e.g. loading an English pipeline).
from spacy.lang.en.stop_words import STOP_WORDS

spacy_stopwords = STOP_WORDS
# STOP_WORDS is a set, so its iteration order is arbitrary; sort before slicing
# so the printed sample is the same on every run.
print("Sample SpaCy Stopwords:", sorted(spacy_stopwords)[:20])

Sample SpaCy Stopwords: ['should', 'not', 'front', 'fifty', 'of', 'still', 'becoming', 'namely', 'same', 'itself', 'against', 'each', 'per', 'either', 'he', 'off', 'say', 'does', 'towards', 'hundred']


## 🔹 Using Hugging Face Tokenizers with Preprocessing

In [None]:
# word -> embedding -> vector
#
#
# King -> embedding -> [1, 2, 3, 4];  King - Man [1, 1, 1, 1] + Woman ≈ Queen (classic word-vector analogy)



In [None]:
# Text -> Vector -> Model -> Output
# Text -> stopword removal, lemmatization, lowercasing, tokenization -> Vector -> Deep Learning -> Output (classification)

In [None]:
# Encode the sample sentence as PyTorch tensors ("pt"). Padding and truncation
# are enabled; with a single short sentence the printed attention mask is all
# ones, i.e. nothing was padded.
# `encoding` is read again by the decoding cell further down.
encoding = tokenizer(
    text,
    padding=True,
    truncation=True,
    return_tensors="pt"
)
print("Encoding:", encoding)

Encoding: {'input_ids': tensor([[  101, 17662,  2227,  3084, 17953,  2361,  3733,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
# Word -> embedding -> vector
# Sentence -> embedding -> vector

In [None]:
# Take the id sequence of the first (here: only) row of the batch and map each
# id back to its token string, special tokens included.
input_ids = encoding["input_ids"][0]
decoded_tokens = tokenizer.convert_ids_to_tokens(input_ids)
print("Decoded Tokens:", decoded_tokens)

Decoded Tokens: ['[CLS]', 'hugging', 'face', 'makes', 'nl', '##p', 'easy', '.', '[SEP]']


## 🔹 Putting It All Together
A simple pipeline:
1. **Tokenize text**
2. **Lemmatize tokens**
3. **Remove stopwords**

In [None]:
def preprocess(text):
    """Tokenize, lemmatize and stopword-filter a piece of text.

    Relies on the notebook-level globals ``tokenizer`` (Hugging Face
    tokenizer), ``nlp`` (SpaCy pipeline) and ``stop_words`` (set of NLTK
    English stopwords).

    Args:
        text: Raw input string.

    Returns:
        A list of lemma strings with stopwords removed. Punctuation tokens
        are not filtered out.
    """
    # Step 1: Tokenize
    tokens = tokenizer.tokenize(text)

    # The tokenizer emits WordPiece sub-words and marks continuation pieces
    # with a leading "##" (e.g. "nlp" -> "nl", "##p"). Glue those pieces back
    # onto the preceding token before handing the text to SpaCy; otherwise
    # SpaCy splits "##p" into "#", "#", "p" and the fragments leak into the
    # result.
    rejoined = " ".join(tokens).replace(" ##", "")

    # Step 2: Lemmatize with SpaCy
    doc = nlp(rejoined)
    lemmas = [token.lemma_ for token in doc]

    # Step 3: Remove Stopwords
    clean_text = [word for word in lemmas if word.lower() not in stop_words]
    return clean_text

# Run the preprocessing pipeline on a single example sentence.
sample = "Hugging Face is making NLP tasks easier and better."
print("Final Preprocessed Output:", preprocess(sample))

Final Preprocessed Output: ['hug', 'face', 'make', 'nl', '#', '#', 'p', 'task', 'easy', 'well', '.']


In [None]:
# Same pipeline on an input that contains two sentences.
sample2 = "NLP is fun. Hugging Face makes it simple!"
print("Preprocessed Output (Multi-Sentence):", preprocess(sample2))

Preprocessed Output (Multi-Sentence): ['nl', '#', '#', 'p', 'fun', '.', 'hug', 'face', 'make', 'simple', '!']


## 🔹 Use Case: Sentiment Analysis with IMDB Dataset
Now let’s apply preprocessing to a **real-world dataset**: IMDB Movie Reviews.

We will:
1. Load the dataset from Hugging Face `datasets`
2. Preprocess the text (tokenization, lemmatization, stopword removal)
3. Convert text into embeddings
4. Train a classification model

In [None]:
from datasets import load_dataset

# Load the IMDB dataset via Hugging Face `datasets`; printing the returned
# DatasetDict lists the available splits with their features and row counts.
dataset = load_dataset("imdb")
print(dataset)

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


In [None]:
# Show the first five records of the two labelled splits, each under its own heading.
split_headers = {
    "train": "First 5 rows of the 'train' split:",
    "test": "\nFirst 5 rows of the 'test' split:",
}
for split_name, header in split_headers.items():
    print(header)
    split = dataset[split_name]
    for row_idx in range(5):
        print(split[row_idx])

First 5 rows of the 'train' split:
{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex a

In [None]:
# vector of the text | Label (0,1)     -> Deep Learning Model -> Output

In [None]:
# Convert the labelled splits to pandas DataFrames for easier inspection.
train_df = dataset["train"].to_pandas()
test_df = dataset["test"].to_pandas()

# Preview the head of each frame under its own heading.
for heading, frame in (("Train DataFrame:", train_df), ("\nTest DataFrame:", test_df)):
    print(heading)
    display(frame.head())

Train DataFrame:


Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0



Test DataFrame:


Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


In [None]:
# Class balance check: number of training reviews per label value.
train_df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,12500
1,12500


### ✅ Preprocess Reviews in DataFrame

In [None]:
def preprocess_text(text):
    """Tokenize, lemmatize and stopword-filter a review, returning one string.

    Relies on the notebook-level globals ``tokenizer`` (Hugging Face
    tokenizer), ``nlp`` (SpaCy pipeline) and ``stop_words`` (set of NLTK
    English stopwords).

    Args:
        text: Raw review text.

    Returns:
        The remaining lemmas joined by single spaces. Punctuation tokens are
        not filtered out.
    """
    tokens = tokenizer.tokenize(text)

    # WordPiece continuation pieces carry a leading "##" (e.g. "nl", "##p").
    # Re-attach them to the preceding token before SpaCy sees the text;
    # otherwise SpaCy breaks "##p" into "#", "#", "p" and those fragments end
    # up in the cleaned text.
    rejoined = " ".join(tokens).replace(" ##", "")

    doc = nlp(rejoined)
    lemmas = [token.lemma_ for token in doc]
    clean_text = [word for word in lemmas if word.lower() not in stop_words]
    return " ".join(clean_text)

In [None]:
# Draw small fixed-seed samples and attach a preprocessed `clean_text` column to each.
train_df_sample = (
    train_df
    .sample(10, random_state=42)
    .assign(clean_text=lambda frame: frame["text"].apply(preprocess_text))
)
test_df_sample = (
    test_df
    .sample(2, random_state=42)
    .assign(clean_text=lambda frame: frame["text"].apply(preprocess_text))
)

train_df_sample.head()

Token indices sequence length is longer than the specified maximum sequence length for this model (806 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,text,label,clean_text
6868,"Dumb is as dumb does, in this thoroughly unint...",0,"dumb dumb , thoroughly un # # int # # ere # # ..."
24016,I dug out from my garage some old musicals and...,1,dig garage old musical another one favorite . ...
9668,After watching this movie I was honestly disap...,0,"watch movie honestly disappointed - actor , st..."
13640,This movie was nominated for best picture but ...,1,movie nominate good picture lose casablanca pa...
14018,Just like Al Gore shook us up with his painful...,1,like al gore shake painfully honest clever # #...



### ✅ Convert Clean Text into Embeddings

In [None]:
from transformers import AutoModel
import torch

# Load the pretrained "distilbert-base-uncased" encoder used to produce embeddings.
# NOTE(review): the notebook's `tokenizer` was loaded from "bert-base-uncased";
# presumably the two checkpoints share a vocabulary — confirm.
bert_model = AutoModel.from_pretrained("distilbert-base-uncased")

# Function to get embeddings
def get_embedding(text):
    """Embed text as the mean of the model's last-layer token vectors.

    Relies on the notebook-level globals ``tokenizer`` and ``bert_model``.

    Args:
        text: A string, or a list of strings to embed as one batch.

    Returns:
        numpy.ndarray: The pooled embedding(s) after ``squeeze()`` — a 1-D
        vector for a single text.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Remove token_type_ids as DistilBERT does not use them
    if 'token_type_ids' in inputs:
        del inputs['token_type_ids']
    with torch.no_grad():
        outputs = bert_model(**inputs)

    hidden = outputs.last_hidden_state
    # Weight every position by the attention mask so that padding positions
    # (present when a batch of unequal-length texts is padded) do not dilute
    # the average. For a single text the mask is all ones and this is a plain mean.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.squeeze().numpy()

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# clean text -> embedding (numerical) -> output

In [None]:
def get_embedding(text):
    """Embed text as the mean of the model's last-layer token vectors.

    Relies on the notebook-level globals ``tokenizer`` and ``bert_model``.
    Prints the embedding size as a side effect.

    Args:
        text: A string, or a list of strings to embed as one batch.

    Returns:
        numpy.ndarray: The pooled embedding(s) after ``squeeze()`` — a 1-D
        vector for a single text.
    """
    # Tokenize
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Remove token_type_ids if present (DistilBERT doesn’t use them)
    if "token_type_ids" in inputs:
        del inputs["token_type_ids"]

    # Get hidden states
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Use mean pooling, weighted by the attention mask so that padding
    # positions (present when a batch of unequal-length texts is padded) do
    # not dilute the average. For a single text the mask is all ones and this
    # is a plain mean over the tokens.
    hidden = outputs.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    embedding = pooled.squeeze().numpy()

    # Report the size of the last axis: len() would give the batch size
    # instead of the embedding size when several texts are passed.
    print("size of embedding", embedding.shape[-1])
    return embedding

# Smoke test: embed one short sentence and print the resulting vector.
print(get_embedding("The movie was great!"))


size of embedding 768
[ 2.10326865e-01 -3.50154787e-01  7.19090477e-02  1.38738841e-01
  4.27424610e-02 -3.64263326e-01  1.96732506e-01  6.92746162e-01
 -1.79024115e-01  2.81823073e-02  7.51506910e-02 -3.54656488e-01
  1.59635887e-01  5.81415534e-01 -1.52763247e-01  1.43049940e-01
  1.22551993e-01 -1.55438054e-02 -6.91748485e-02 -8.69848877e-02
 -3.48729402e-01 -3.54606360e-01 -7.28379264e-02  5.13721585e-01
  5.59003092e-02 -1.73978675e-02 -5.99146411e-02 -1.64111741e-02
  1.31383285e-01 -2.36258268e-01  2.70584613e-01 -2.06951231e-01
 -8.62380341e-02  9.00608748e-02 -3.92944783e-01 -5.89820631e-02
 -3.86410430e-02 -4.80187535e-02 -4.46627200e-01 -1.86050683e-01
 -2.65019029e-01 -3.68524551e-01  3.84991586e-01 -5.63418306e-02
 -1.73695773e-01 -2.93556541e-01 -7.38042444e-02  6.21422045e-02
  2.54609883e-01 -3.70542668e-02  2.37671018e-01  3.79177541e-01
 -2.59983838e-01 -2.50518750e-02  1.74866706e-01  2.35489726e-01
 -3.01625222e-01 -3.08787972e-01 -4.09045190e-01 -8.27087089e-02
  3

## 🎯 Key Takeaways
- **Tokenization** → Breaks text into words, subwords, or sentences
- **Lemmatization** → Reduces words to base form
- **Stopwords Removal** → Filters out common words
- Hugging Face + SpaCy + NLTK can be combined for a **powerful preprocessing pipeline**