### Using NLP for Text Data Quality
**Objective**: Enhance text data quality using NLP techniques.

**Task**: Removing Stopwords

**Steps**:
1. Data Set: Use a dataset of text product descriptions.
2. Stopword Removal: Utilize an NLP library (e.g., NLTK) to remove stopwords from the
descriptions.
3. Assess Impact: Evaluate the effect of stopword removal by comparing word frequencies before and
after removal.

In [None]:
# write your code from here


In [1]:
# Step 1: Import necessary libraries
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import string

# Step 2: Download NLTK resources
# NOTE: the nltk package itself must be installed first (e.g. `pip install nltk`);
# otherwise the import above fails with ModuleNotFoundError.
# 'punkt_tab' is the tokenizer data that recent NLTK releases load for
# word_tokenize; 'punkt' is kept for older releases. An NLTK version that does
# not know one of these names is expected to just report the failed download
# and continue -- TODO confirm on the oldest version you support.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Small in-memory sample of product descriptions; swap in your own CSV or
# dataset as long as it provides a 'description' column.
product_ids = [1, 2, 3]
descriptions = [
    "This is a high-quality wooden table perfect for dining rooms.",
    "A stylish and modern chair designed for comfort and elegance.",
    "Durable stainless steel water bottle with vacuum insulation.",
]
data = {'product_id': product_ids, 'description': descriptions}
df = pd.DataFrame(data)

# Step 3: Build the English stopword lookup used by the helpers below.
# The word list is converted to a set so membership tests are hash lookups.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def preprocess_text(text, tokenizer=None):
    """Lowercase and tokenize *text*, keeping only word-like tokens.

    Args:
        text: Raw description. Non-string values (e.g. ``None`` or the NaN
            that pandas uses for missing cells) yield an empty list instead
            of raising on ``.lower()``.
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to NLTK's ``word_tokenize``.

    Returns:
        A list of lowercase tokens. A token is kept when it is purely
        alphabetic or a hyphenated compound of alphabetic parts (such as
        ``"high-quality"``); punctuation-only and numeric tokens are dropped.
    """
    if not isinstance(text, str):
        return []
    if tokenizer is None:
        tokenizer = word_tokenize
    tokens = tokenizer(text.lower())
    # A bare ``isalpha()`` check would discard whole hyphenated words, not
    # just punctuation, so validate each hyphen-separated part instead.
    return [
        token for token in tokens
        if all(part.isalpha() for part in token.split('-'))
    ]

def remove_stopwords(words):
    """Return the tokens of *words* that are not in the module-level ``stop_words`` set.

    The relative order of the remaining tokens is preserved.
    """
    kept = []
    for token in words:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Step 4 & 5: Collect the tokens of every description, before and after
# stopword removal, and tally their frequencies as we go.
all_words_before, all_words_after = [], []
freq_before, freq_after = Counter(), Counter()

for desc in df['description']:
    words = preprocess_text(desc)
    filtered_words = remove_stopwords(words)

    all_words_before += words
    all_words_after += filtered_words

    freq_before.update(words)
    freq_after.update(filtered_words)

# Step 6: Plotting the most common words
def plot_frequencies(counter, title, top_n=10):
    """Draw a bar chart of the most frequent words in *counter*.

    Args:
        counter: A ``collections.Counter`` mapping words to their counts.
        title: Title shown above the chart.
        top_n: How many of the most common words to plot (default 10).

    Returns:
        None. The chart is displayed with ``plt.show()``. If *counter* has no
        entries, a short notice is printed and nothing is plotted.
    """
    most_common = counter.most_common(top_n)
    if not most_common:
        # zip(*[]) yields nothing, so the two-name unpacking below would
        # raise ValueError for an empty counter.
        print(f"No words to plot for: {title}")
        return
    words, counts = zip(*most_common)
    plt.figure(figsize=(10, 4))
    plt.bar(words, counts, color='skyblue')
    plt.title(title)
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)  # slanted labels stay readable when words are long
    plt.tight_layout()
    plt.show()

# Step 7: Render both charts, before first and then after, for comparison
for counter, title in (
    (freq_before, 'Word Frequency Before Stopword Removal'),
    (freq_after, 'Word Frequency After Stopword Removal'),
):
    plot_frequencies(counter, title)


ModuleNotFoundError: No module named 'nltk'