In [1]:
import spacy
# Load the small English pipeline once; the preprocessing functions below
# all reuse this module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")
print("✓ spaCy loaded successfully!")

✓ spaCy loaded successfully!


In [None]:
# Sample texts used by the demo cells below: a social media post, a product
# review, and a news headline.
doc = [
    "OMG!!! Just tried the new coffee at @StarCafe ☕ It was AMAZING!!! 😍 #BestCoffee #MorningVibes",
    "The product quality was excellent, but the shipping took way too long. Customer service wasn't very helpful either.",
    "Scientists Discover New Treatment for Alzheimer's Disease in Breakthrough Study",
]

## Basic Text Preprocessing Function

This function performs basic text cleaning using spaCy:
- Converts text to lowercase
- Removes stop words (common words like "the", "is", "a")
- Removes punctuation
- Lemmatizes words (converts to base form)
- Keeps only alphabetic tokens

In [None]:
def preprocess_text(text):
    """
    Run ``text`` through the spaCy pipeline and collect its content lemmas.

    Args:
        text (str): Raw text to process

    Returns:
        list: Lowercased lemmas, in document order, of every token that is
            alphabetic and is neither a stop word nor punctuation
    """
    lemmas = []
    for tok in nlp(text):
        # Drop stop words, punctuation, and anything not purely alphabetic.
        if tok.is_stop or tok.is_punct or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_.lower())
    return lemmas

In [None]:
# Run the basic preprocessor over every sample text. The samples are not all
# social media posts, so each one gets a neutral numbered label instead of a
# hard-coded "Social Media Text:" heading.
for index, text in enumerate(doc, start=1):
    print(f"Text {index}:")
    print(f"Original: {text}")
    print(f"Cleaned: {preprocess_text(text)}")
    print()

## Advanced Text Preprocessing Function

This function offers more flexibility with configurable preprocessing options:

**Parameters:**
- `remove_stop`: Toggle stop word removal
- `use_lemma`: Choose between lemmatized or original tokens
- `lowercase`: Control case normalization
- `alpha_only`: Control whether to keep only alphabetic tokens (when enabled, numbers and other non-alphabetic tokens are dropped)

This allows for customized preprocessing based on your specific NLP task requirements.

In [None]:
def preprocess_text_advanced(text, remove_stop=True, use_lemma=True, lowercase=True, alpha_only=True):
    """
    Advanced preprocessing with configurable options.

    Punctuation and whitespace tokens are always dropped; the remaining
    filters and normalizations are controlled by the keyword arguments.

    Args:
        text (str): Raw text to process
        remove_stop (bool): Remove stop words if True
        use_lemma (bool): Use lemmas if True, original text if False
        lowercase (bool): Convert to lowercase if True
        alpha_only (bool): Keep only alphabetic tokens if True

    Returns:
        list: List of processed tokens, in document order
    """
    cleaned = []
    for token in nlp(text):
        # Punctuation and whitespace never count as words. Whitespace tokens
        # must be skipped explicitly: they are not punctuation, so with
        # alpha_only=False they would otherwise end up in the output.
        if token.is_punct or token.is_space:
            continue
        if remove_stop and token.is_stop:
            continue
        if alpha_only and not token.is_alpha:
            continue

        # Choose lemma or original text
        word = token.lemma_ if use_lemma else token.text

        # Apply lowercase
        if lowercase:
            word = word.lower()

        cleaned.append(word)

    return cleaned

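## Comparing Preprocessing Options

The cell below runs `preprocess_text_advanced` on each sample text and prints the result for the default settings and for each option switched off in turn.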

In [None]:
# For every sample text, show the raw input followed by the output of the
# advanced preprocessor under each option setting.
for text in doc:
    print(f"Original: {text}")
    # Baseline: every filter and normalization enabled.
    print(f"\nDefault (all filters): {preprocess_text_advanced(text)}")
    # Switch off one option at a time to show its effect.
    print(f"Keep stop words: {preprocess_text_advanced(text, remove_stop=False)}")
    print(f"Original tokens (no lemma): {preprocess_text_advanced(text, use_lemma=False)}")
    print(f"Keep numbers: {preprocess_text_advanced(text, alpha_only=False)}")