### Using NLP for Text Data Quality
**Objective**: Enhance text data quality using NLP techniques.

**Task**: Spelling Corrections

**Steps**:
1. Dataset: Import a dataset containing text reviews with spelling errors.
2. Apply Corrections: Use a spell-checker from an NLP library to correct spelling mistakes.
3. Verify Improvements: Review the corrections to ensure data quality improvement.

In [1]:
import re

import pandas as pd
# NOTE: the `spellchecker` module is distributed on PyPI as `pyspellchecker`
# (pip install pyspellchecker).
from spellchecker import SpellChecker

# Step 1: Import a dataset containing text reviews with spelling errors.
# A small in-memory sample of misspelled reviews is used as the dataset here.
data = dict(
    Review=[
        "The produckt was exellent and the delivry was fastt.",
        "I am very satasfied with th service and qality.",
        "The book was intersting but had a few typoess.",
        "Great experince! I wuld recomend it to evryone.",
        "The softwere is easy to us and very effecient.",
        "The restaurent had amzing food and a gret ambianse.",
        "I bought a new computr and it works perfictly.",
        "The cloting material is very comfotable and durrable.",
        "The teachr was very helpfull and knowledgable.",
        "Overall, it was a pleasent expeirence.",
    ],
)
df = pd.DataFrame(data)

# Show the raw reviews before any correction is applied.
print("Original DataFrame with Spelling Errors:", df, sep="\n")

# Step 2: Apply Corrections: Use a spell-checker from an NLP library to correct spelling mistakes.
# One checker instance is created up front and shared by every call below.
spell = SpellChecker()

# Splits one whitespace-free token into (leading punctuation, core, trailing
# punctuation); any of the three groups may be empty.
_TOKEN_PARTS = re.compile(r"(\W*)(.*?)(\W*)")


def _match_case(template, replacement):
    """Return *replacement* re-cased to follow the casing of *template*."""
    if len(template) > 1 and template.isupper():
        return replacement.upper()
    if template[:1].isupper():
        return replacement[:1].upper() + replacement[1:]
    return replacement


def correct_spellings(text, checker=None):
    """Return *text* with each word replaced by the checker's suggestion.

    The text is split on whitespace and re-joined with single spaces.
    Punctuation at the start or end of a token is set aside before the
    lookup and re-attached afterwards, so the checker only sees the word
    itself. A replaced word keeps the capitalisation pattern of the word
    it replaces (all-caps, or leading capital).

    Args:
        text: The review text. Non-string values (for example missing
            values in a DataFrame column) are returned unchanged.
        checker: Object exposing ``correction(word)`` that returns a
            suggested spelling or ``None``. Defaults to the module-level
            ``spell`` instance.

    Returns:
        The corrected text, or *text* itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    if checker is None:
        checker = spell

    corrected_tokens = []
    for token in text.split():
        leading, core, trailing = _TOKEN_PARTS.fullmatch(token).groups()
        if not core:
            # Token is punctuation only; there is nothing to look up.
            corrected_tokens.append(token)
            continue

        suggestion = checker.correction(core)
        if suggestion is None or suggestion == core:
            # Keep the original token if no (different) correction was found.
            corrected_tokens.append(token)
        else:
            corrected_tokens.append(
                leading + _match_case(core, suggestion) + trailing
            )
    return " ".join(corrected_tokens)

# Run the corrector over every review and keep the result in a new column.
df['Corrected_Review'] = df['Review'].apply(correct_spellings)

print("\nDataFrame after Spelling Corrections:", df, sep="\n")

# Step 3: Verify Improvements: Review the corrections to ensure data quality improvement.
# Print each original review directly above its corrected version.
print("\nComparison of Original and Corrected Reviews:")
divider = "-" * 30
for original_review, corrected_review in zip(df['Review'], df['Corrected_Review']):
    print(f"Original:  {original_review}")
    print(f"Corrected: {corrected_review}")
    print(divider)

# Further analysis: Identifying words that were corrected
def identify_corrected_words(original_text, corrected_text):
    """Map each changed token of *original_text* to its corrected form.

    Both texts are split on whitespace and compared position by position;
    pairing stops at the end of the shorter text. When the same original
    token is changed more than once, the last replacement is the one kept.

    Returns:
        A dict of ``{original_token: corrected_token}`` containing only
        the positions where the two tokens differ.
    """
    token_pairs = zip(original_text.split(), corrected_text.split())
    return {before: after for before, after in token_pairs if before != after}

def _row_corrections(row):
    """Return the token-level corrections for one DataFrame row."""
    return identify_corrected_words(row['Review'], row['Corrected_Review'])


# Build a per-review dict of the words that changed, then show it next to the original text.
df['Corrections'] = df.apply(_row_corrections, axis=1)
print("\nWords that were Corrected:", df[['Review', 'Corrections']], sep="\n")

ModuleNotFoundError: No module named 'spellchecker'