### Text Preprocessing Pipeline
**Objective**
To develop a text preprocessing pipeline that:
- Tokenizes the input text
- Removes stopwords
- Converts text to lowercase
- Applies either stemming or lemmatization
- Returns the cleaned text ready for analysis


In [None]:
# Print nltk.data.path, the list of locations NLTK is configured to look in
# for its data, so a missing-resource error can be checked against it.
import nltk
print(nltk.data.path)


In [None]:
import nltk
# NOTE(review): machine-specific absolute path; presumably where this user's
# NLTK data was downloaded — confirm, or make it configurable before sharing.
nltk.data.path.append('/home/wanyua/nltk_data')
# Tokenizer, stopword corpus and the two normalizers used by the pipeline.
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


In [None]:
# Download the NLTK resources the pipeline needs.
# 'punkt_tab' is the tokenizer data word_tokenize loads on NLTK >= 3.9 (it
# replaced the pickled 'punkt' models); 'punkt' is kept for older releases.
# 'stopwords' and 'wordnet' back the stopword filter and the lemmatizer.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Define preprocessing pipeline
def preprocess_text(text, use_lemmatization=True):
    """Tokenize, lowercase, stopword-filter and normalize a piece of text.

    Args:
        text: Raw input string to process.
        use_lemmatization: When True (the default) each remaining token is
            passed through ``WordNetLemmatizer.lemmatize`` with its default
            part of speech; when False each token is stemmed with
            ``PorterStemmer`` instead.

    Returns:
        A list of processed token strings in their original order. No
        punctuation filtering is applied, so any punctuation tokens emitted
        by the tokenizer are kept.
    """
    # Lowercase first so one pass covers both the case conversion step and
    # case-insensitive matching against the (lowercase) stopword list.
    tokens = [token.lower() for token in word_tokenize(text)]

    # Stopword removal; a set keeps each membership test O(1).
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming or lemmatization, applied to each token individually.
    if use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(token) for token in tokens]

    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in tokens]

In [None]:
# Run the pipeline on one sample sentence in both normalization modes and
# print each result.
text = "The quick brown foxes are jumping over the lazy dogs!"

lemmatized = preprocess_text(text, use_lemmatization=True)
print(f"With Lemmatization: {lemmatized}")

stemmed = preprocess_text(text, use_lemmatization=False)
print(f"With Stemming: {stemmed}")