### Text Preprocessing Pipeline
**Objective**
To develop a text preprocessing pipeline that:
- Tokenizes the input text
- Removes stopwords
- Converts text to lowercase
- Applies either stemming or lemmatization
- Returns the cleaned text ready for analysis


In [None]:
import nltk

from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch each NLTK data package requested by this notebook, in order
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Define preprocessing pipeline
def preprocess_text(text, use_lemmatization=True):
    """Tokenize *text*, drop English stopwords, lowercase, and normalize.

    Args:
        text: Raw input string to clean.
        use_lemmatization: When true, every surviving token is passed
            through ``WordNetLemmatizer``; otherwise through
            ``PorterStemmer``.

    Returns:
        A list of lowercase tokens. Only stopwords are filtered out, so
        punctuation tokens emitted by the tokenizer remain in the result.
    """
    # Split the raw string into Treebank-style tokens
    raw_tokens = TreebankWordTokenizer().tokenize(text)

    # Lowercase once, then keep only tokens that are not English stopwords
    english_stopwords = set(stopwords.words('english'))
    lowered = (raw.lower() for raw in raw_tokens)
    kept = [tok for tok in lowered if tok not in english_stopwords]

    # Select the normalizer requested by the caller
    if use_lemmatization:
        normalize = WordNetLemmatizer().lemmatize
    else:
        normalize = PorterStemmer().stem

    return [normalize(tok) for tok in kept]

# Run the pipeline on a sample sentence in both normalization modes
text = "The quick brown foxes are jumping over the lazy dogs!"

lemmatized_tokens = preprocess_text(text, use_lemmatization=True)
print(f"With Lemmatization: {lemmatized_tokens}")

stemmed_tokens = preprocess_text(text, use_lemmatization=False)
print(f"With Stemming: {stemmed_tokens}")


With Lemmatization: ['quick', 'brown', 'fox', 'jumping', 'lazy', 'dog', '!']
With Stemming: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', '!']


[nltk_data] Downloading package punkt to /home/wanyua/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/wanyua/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/wanyua/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
