<a href="https://colab.research.google.com/github/srish231/AI-roadmap/blob/main/Preprocessing%20data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import string
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download the NLTK resources used below.
# 'punkt' is the legacy tokenizer package; NLTK 3.8.2+ makes word_tokenize
# load 'punkt_tab' instead, so both are fetched to keep the notebook working
# across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load stopwords and spaCy model
stop_words = set(stopwords.words("english"))  # set gives O(1) membership tests
stemmer = PorterStemmer()  # not used by the active code visible here; only by a commented-out stemming step
nlp = spacy.load("en_core_web_sm")

# ---------------------- Sample DataFrame ----------------------
# Example messages exercising each cleaning step: an emoji, a URL,
# an HTML tag, hyphenated digits, and stray whitespace/punctuation.
sample_messages = [
    "Hello! 😊 I am applying for a home loan.",
    "Visit https://loans.example.com to check eligibility!!!",
    "I need help with EMI payment issues. <br>",
    "My PAN is ABCDE1234F and Aadhaar is 1234-5678-9012",
    "Why was my loan rejected? Please explain    .",
]
data = {'text': sample_messages}

# Single-column frame; one row per message.
df = pd.DataFrame(data)

# ---------------------- Cleaning Function ----------------------
# Patterns and the punctuation table are built once instead of on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text, *, remove_digits=False):
    """Normalize raw text into lowercase ASCII words separated by single spaces.

    Steps, in order: lowercase, drop URLs, drop HTML tags, drop non-ASCII
    characters (including emojis), delete punctuation, optionally delete
    digits, then collapse whitespace and strip the ends.

    Args:
        text: The raw text. ``None`` and float NaN (pandas' missing-value
            marker in object columns) are treated as empty text; any other
            non-string value is converted with ``str()``.
        remove_digits: When true, runs of digits are deleted as well.
            Defaults to False, which keeps digits.

    Returns:
        The cleaned string; ``''`` when nothing is left or the input is missing.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _NON_ASCII_RE.sub('', text)
    # Punctuation is deleted, not replaced by a space, so "1234-5678" becomes
    # "12345678" and "a,b" becomes "ab".
    text = text.translate(_PUNCT_TABLE)
    if remove_digits:
        text = _DIGITS_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

# ---------------------- Full Preprocessing Function ----------------------
def preprocess(text):
    """Turn raw text into a list of lemmas.

    The text is cleaned with ``clean_text``, split with NLTK's
    ``word_tokenize``, stripped of any token found in ``stop_words``, and the
    surviving tokens are re-joined and run through the spaCy pipeline ``nlp``
    to obtain each token's lemma.

    Args:
        text: The raw text to process.

    Returns:
        A list with the ``lemma_`` of every token spaCy produces.
    """
    kept_words = []
    for candidate in word_tokenize(clean_text(text)):
        if candidate in stop_words:
            continue
        kept_words.append(candidate)

    # spaCy re-tokenizes the joined string, so lemmas come from its own tokens.
    parsed = nlp(" ".join(kept_words))
    return [tok.lemma_ for tok in parsed]

# ---------------------- Apply to DataFrame ----------------------
# Run the full pipeline on each message; every cell holds a list of lemmas.
df['cleaned_tokens'] = df['text'].map(preprocess)

# ---------------------- Result ----------------------
# Show the raw text next to its processed tokens.
columns_to_show = ['text', 'cleaned_tokens']
print(df[columns_to_show])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                                text  \
0            Hello! 😊 I am applying for a home loan.   
1  Visit https://loans.example.com to check eligi...   
2          I need help with EMI payment issues. <br>   
3  My PAN is ABCDE1234F and Aadhaar is 1234-5678-...   
4      Why was my loan rejected? Please explain    .   

                             cleaned_tokens  
0                [hello, apply, home, loan]  
1               [visit, check, eligibility]  
2         [need, help, emi, payment, issue]  
3  [pan, abcde1234f, aadhaar, 123456789012]  
4           [loan, reject, please, explain]  
