# Text Preprocessing in Python

This notebook demonstrates text preprocessing in Python. It covers Python's built-in string methods, NLTK, and spaCy for common preprocessing tasks such as:

- Converting text to lowercase
- Removing punctuation
- Tokenization
- Lemmatization
- Stopword removal

In [None]:
# Install the third-party packages used below (NLTK and spaCy).
# `%pip` installs into the environment of the running kernel.
%pip install nltk spacy

In [None]:
# Sample sentence reused by every section below: it mixes upper/lower case
# and ends with repeated punctuation so each preprocessing step has a visible effect.
text: str = "This is an Example TEXT with Mixed CASE and Punctuations!!!"
print(text)

## Basic Preprocessing with Python Built-in Methods

In [None]:
import string

# Convert to lowercase
lowercase_text = text.lower()
print("Lowercase:", lowercase_text)

# Remove punctuation: build a translation table that deletes every character
# listed in string.punctuation. This works on the original `text`, so the
# letter case is preserved in the result.
punctuation_eraser = str.maketrans('', '', string.punctuation)
no_punctuation = text.translate(punctuation_eraser)
print("Without Punctuation:", no_punctuation)

# Split into words on runs of whitespace
words = no_punctuation.split()
print("Words:", words)

## Basic Preprocessing with NLTK

In [None]:
# NLTK plus the two helpers used in this cell
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the data packages requested by this notebook: the Punkt tokenizer
# resources ('punkt' and 'punkt_tab') and the stopword lists.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Tokenization: split the raw sentence into word and punctuation tokens
tokens = word_tokenize(text)
print("Tokens:", tokens)

# Remove stopwords: compare case-insensitively against the English list;
# a set gives constant-time membership checks.
stop_words = set(stopwords.words('english'))
filtered_tokens = list(
    filter(lambda candidate: candidate.lower() not in stop_words, tokens)
)
print("Without Stopwords:", filtered_tokens)

## Basic Preprocessing with spaCy

In [None]:
# Load spaCy
import spacy

model_name = "en_core_web_sm"

# Download the English model only if it is not already installed.
# Calling spacy.cli.download() unconditionally would re-invoke the installer
# every time this cell runs, even when the model is already present.
if not spacy.util.is_package(model_name):
    spacy.cli.download(model_name)
nlp = spacy.load(model_name)

# Process the text: running the pipeline yields a Doc of annotated tokens
doc = nlp(text)

# Lemmatization: collect the lemma (base form) string of every token
lemmatized = [token.lemma_ for token in doc]
print("Lemmatized:", lemmatized)

# Remove stopwords and punctuation using the per-token flags set by the pipeline
filtered = [token.text for token in doc if not token.is_stop and not token.is_punct]
print("Filtered:", filtered)