### Using NLP for Text Data Quality
**Objective**: Enhance text data quality using NLP techniques.

**Task**: Handling Noisy Text Data

**Steps**:
1. Data Set: Obtain a dataset with customer reviews containing noise (e.g., random characters).
2. Clean Data: Use regex patterns to clean the noise from text data.
3. Evaluate: Compare the text before and after cleaning for noise.

In [1]:
import pandas as pd
import re

# Step 1: Obtain a dataset with customer reviews containing noise (e.g., random characters).
# Generate sample noisy customer reviews
data = {
    'Review': [
        "This product is gr8! I luv it!! #happycustomer",
        "It was okay... but had some issues. 😠 1/5 stars.",
        "The service was terribl3. Never again! %$#@!",
        "Absolutely fantastic! Highly recommended. 👍💯",
        "Not bad, could be better. 🤔 ...",
        "This is the b3st thing I've ever bought! ✨",
        "The delivery was so slowwww. 🐌",
        "What a waste of m0ney! 😡",
        "Excellent quality and fast shipping. 😊🚀",
        "It's alright, nothing special. 🤷‍♀️"
    ]
}
df = pd.DataFrame(data)

# Introduce more noise (random characters)
def add_noise(text, noise_level=0.1):
    noisy_text = ""
    for char in text:
        if np.random.rand() < noise_level:
            noisy_text += np.random.choice(['#', '$', '%', '&', '*', '!', '?', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])
        noisy_text += char
    return noisy_text

np.random.seed(42)
df['Noisy_Review'] = df['Review'].apply(add_noise)
print("Original DataFrame with Noisy Reviews:")
print(df)

# Step 2: Clean Data: Use regex patterns to clean the noise from text data.
def clean_text(text):
    # Remove special characters (keeping spaces and alphanumeric)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['Cleaned_Review'] = df['Noisy_Review'].apply(clean_text)

# More targeted cleaning (optional, based on the type of noise)
def more_targeted_clean(text):
    # Remove repeating characters (e.g., slowwww -> slow)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    # Correct some common abbreviations (you might need a more extensive dictionary)
    text = re.sub(r'gr8', 'great', text, flags=re.IGNORECASE)
    text = re.sub(r'luv', 'love', text, flags=re.IGNORECASE)
    text = re.sub(r'b3st', 'best', text, flags=re.IGNORECASE)
    return text

df['Further_Cleaned_Review'] = df['Cleaned_Review'].apply(more_targeted_clean)

print("\nDataFrame after Cleaning:")
print(df)

# Step 3: Evaluate: Compare the text before and after cleaning for noise.
print("\nComparison of Original, Noisy, and Cleaned Reviews:")
for index, row in df.iterrows():
    print(f"Original: {row['Review']}")
    print(f"Noisy:    {row['Noisy_Review']}")
    print(f"Cleaned:  {row['Further_Cleaned_Review']}")
    print("-" * 30)

NameError: name 'np' is not defined