<a href="https://colab.research.google.com/github/srish231/AI-roadmap/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import re
import string
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# ------------------- Download Required Resources -------------------
# word_tokenize relies on the Punkt sentence-tokenizer data. Newer NLTK
# releases load it from the 'punkt_tab' package rather than the pickled
# 'punkt' package, so both are fetched to keep this cell working on old and
# new NLTK versions alike.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# ------------------- Setup -------------------
# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words("english"))
# NOTE(review): not referenced anywhere in the visible part of this cell.
stemmer = PorterStemmer()
# spaCy pipeline used for lemmatization.
nlp = spacy.load("en_core_web_sm")

# ------------------- Sample Data -------------------
# Five short loan-support messages that exercise the cleaning rules:
# an emoji, a URL, an HTML tag, ID-like tokens, and irregular spacing.
data = dict(
    text=[
        "Hello! 😊 I am applying for a home loan.",
        "Visit https://loans.example.com to check eligibility!!!",
        "I need help with EMI payment issues. <br>",
        "My PAN is ABCDE1234F and Aadhaar is 1234-5678-9012",
        "Why was my loan rejected? Please explain    .",
    ],
)

# Single-column frame; one row per message.
df = pd.DataFrame(data=data)

# ------------------- Text Cleaning -------------------
# Translation table that deletes every ASCII punctuation character; built
# once instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize raw text into lowercase, ASCII-only, punctuation-free words.

    Steps, in order: lowercase, drop URLs, replace HTML tags with a space,
    drop non-ASCII characters (emojis, accented letters), delete ASCII
    punctuation, then collapse runs of whitespace and trim the ends.

    Args:
        text: The raw string to clean. Non-string values (for example
            ``None`` or a NaN from a column with missing entries) are
            treated as empty text.

    Returns:
        The cleaned string, or ``""`` if nothing is left or the input was
        not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # remove URLs
    # Tags often act as separators ("a<br>b"), so replace them with a space
    # instead of gluing the neighbouring words together; the extra spaces
    # are collapsed by the whitespace step below.
    text = re.sub(r'<.*?>', ' ', text)                 # remove HTML tags
    text = re.sub(r'[^\x00-\x7F]+', '', text)          # remove emojis / non-ASCII
    text = text.translate(_PUNCT_TABLE)                # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()           # remove extra spaces
    return text

# ------------------- Preprocessing with Logging -------------------
def preprocess_steps(text, *, verbose=True):
    """Clean, tokenize, stopword-filter and lemmatize one piece of text.

    Args:
        text: Raw input text for a single record.
        verbose: When true (the default, which keeps the original output),
            print the result of every stage as it is produced. Pass
            ``verbose=False`` to silence the trace, e.g. when applying the
            function over many rows.

    Returns:
        The list of lemma strings from the final stage, suitable for
        storing in a DataFrame cell.
    """
    def trace(message):
        # Emit a stage report only when tracing is enabled.
        if verbose:
            print(message)

    # Step 1: Cleaning
    cleaned = clean_text(text)
    trace(f"\nOriginal Text:\n{text}")
    trace(f"Cleaned Text:\n{cleaned}")

    # Step 2: Tokenization
    tokens = word_tokenize(cleaned)
    trace(f"Tokens:\n{tokens}")

    # Step 3: Stopword Removal
    filtered = [word for word in tokens if word not in stop_words]
    trace(f"After Stopword Removal:\n{filtered}")

    # Step 4: Lemmatization
    # NOTE(review): the joined string is tokenized again by the spaCy
    # pipeline, so the lemmas may not align one-to-one with `filtered`.
    doc = nlp(" ".join(filtered))
    lemmatized = [token.lemma_ for token in doc]
    trace(f"Lemmatized Tokens:\n{lemmatized}")

    return lemmatized  # only the final output is stored in the DataFrame

# ------------------- Apply to DataFrame -------------------
# Run the pipeline on every row of the 'text' column; each cell of the new
# column holds the value preprocess_steps returned for that row.
df['cleaned_tokens'] = df['text'].apply(preprocess_steps)

# ------------------- Final Output -------------------
# Banner and the two-column view, each ending with a newline.
print(
    "\n\n--- Final DataFrame ---",
    df[['text', 'cleaned_tokens']],
    sep="\n",
)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



Original Text:
Hello! 😊 I am applying for a home loan.
Cleaned Text:
hello i am applying for a home loan
Tokens:
['hello', 'i', 'am', 'applying', 'for', 'a', 'home', 'loan']
After Stopword Removal:
['hello', 'applying', 'home', 'loan']
Lemmatized Tokens:
['hello', 'apply', 'home', 'loan']

Original Text:
Visit https://loans.example.com to check eligibility!!!
Cleaned Text:
visit to check eligibility
Tokens:
['visit', 'to', 'check', 'eligibility']
After Stopword Removal:
['visit', 'check', 'eligibility']
Lemmatized Tokens:
['visit', 'check', 'eligibility']

Original Text:
I need help with EMI payment issues. <br>
Cleaned Text:
i need help with emi payment issues
Tokens:
['i', 'need', 'help', 'with', 'emi', 'payment', 'issues']
After Stopword Removal:
['need', 'help', 'emi', 'payment', 'issues']
Lemmatized Tokens:
['need', 'help', 'emi', 'payment', 'issue']

Original Text:
My PAN is ABCDE1234F and Aadhaar is 1234-5678-9012
Cleaned Text:
my pan is abcde1234f and aadhaar is 123456789012
T