### Using NLP for Text Data Quality
**Objective**: Enhance text data quality using NLP techniques.

**Task**: Removing Stopwords

**Steps**:
1. Dataset: Use a dataset of textual product descriptions.
2. Stopword Removal: Utilize an NLP library (e.g., NLTK) to remove stopwords from the
descriptions.
3. Assess Impact: Examine the effectiveness by analyzing word frequency before and after
removal.

In [None]:
# write your code from here


In [2]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import string

# Download the NLTK resources this script relies on:
#   - 'punkt' / 'punkt_tab': tokenizer models used by word_tokenize.
#     Recent NLTK releases (3.9+) look up 'punkt_tab', older ones 'punkt',
#     so both are fetched to keep tokenization working on either.
#   - 'stopwords': the corpus behind stopwords.words('english').
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Define sample data safely
def load_data():
    """Build the sample product-description dataset.

    Returns:
        A DataFrame with ``product_id`` and ``description`` columns holding
        three hard-coded sample products. If construction fails, the error
        is printed and an empty DataFrame with the same two columns is
        returned instead.
    """
    try:
        product_ids = [1, 2, 3]
        descriptions = [
            "This is a high-quality wooden table perfect for dining rooms.",
            "A stylish and modern chair designed for comfort and elegance.",
            "Durable stainless steel water bottle with vacuum insulation.",
        ]
        frame = pd.DataFrame(
            {'product_id': product_ids, 'description': descriptions}
        )
    except Exception as e:
        print("Error loading data:", e)
        frame = pd.DataFrame(columns=['product_id', 'description'])
    return frame

# Preprocessing functions with exception handling
def preprocess_text(text):
    """Lower-case and tokenize *text*, keeping only alphabetic tokens.

    Args:
        text: The raw description string to tokenize.

    Returns:
        A list of lower-cased tokens for which ``isalpha()`` is true, so
        punctuation tokens are dropped. If anything raises (for example a
        non-string input), the error is printed and an empty list is
        returned.
    """
    try:
        alphabetic_tokens = []
        for token in word_tokenize(text.lower()):
            if token.isalpha():  # skip punctuation and other non-letter tokens
                alphabetic_tokens.append(token)
        return alphabetic_tokens
    except Exception as e:
        print("Tokenization error:", e)
        return []

def remove_stopwords(words, stop_words):
    """Return the entries of *words* that are not in *stop_words*.

    Args:
        words: Iterable of tokens to filter.
        stop_words: Container of tokens to drop (membership is tested
            with ``in``).

    Returns:
        A new list with the surviving tokens in their original order. If
        filtering raises, the error is printed and *words* is returned
        unchanged.
    """
    try:
        kept = []
        for token in words:
            if token in stop_words:
                continue
            kept.append(token)
        return kept
    except Exception as e:
        print("Stopword removal error:", e)
        return words

# Plotting function
def plot_frequencies(counter, title, top_n=10):
    """Draw a bar chart of the most frequent words in *counter*.

    Args:
        counter: A ``collections.Counter`` mapping words to their counts.
        title: Title shown above the chart.
        top_n: How many of the most common words to plot (default 10).

    Returns:
        None. The chart is displayed with ``plt.show()``. Plotting is
        best-effort: any error is printed rather than raised.
    """
    try:
        most_common = counter.most_common(top_n)
        if not most_common:
            # zip(*[]) yields nothing to unpack into (words, counts), which
            # would surface as a confusing "Plotting error"; report the
            # empty input explicitly instead.
            print(f"No words to plot for: {title}")
            return
        words, counts = zip(*most_common)
        plt.figure(figsize=(10, 4))
        plt.bar(words, counts, color='skyblue')
        plt.title(title)
        plt.xlabel('Words')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    except Exception as e:
        print("Plotting error:", e)

# Main execution with validation checks
def main():
    """Compare word frequencies before and after stopword removal.

    Loads the sample dataset, tokenizes every string description, counts
    the tokens with and without English stopwords, and plots both
    frequency distributions. Prints a message and returns early when the
    dataset is empty or has no ``description`` column.
    """
    df = load_data()
    dataset_usable = not df.empty and 'description' in df.columns
    if not dataset_usable:
        print("Invalid or empty dataset.")
        return

    stop_words = set(stopwords.words('english'))
    freq_before = Counter()
    freq_after = Counter()

    # Non-string cells (e.g. missing values) are skipped entirely.
    text_descriptions = (d for d in df['description'] if isinstance(d, str))
    for desc in text_descriptions:
        words = preprocess_text(desc)
        freq_before.update(words)
        freq_after.update(remove_stopwords(words, stop_words))

    # Frequency analysis
    plot_frequencies(freq_before, "Word Frequency Before Stopword Removal")
    plot_frequencies(freq_after, "Word Frequency After Stopword Removal")

# Entry-point guard: run the analysis only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'nltk'