# DATA PREPARATION 

In [2]:
import pandas as pd

# Location of the pre-processed comments export produced by the data-preparation step.
csv_path = "../data_preparation/outputs/version_251130/processed.csv"

# Columns to read from the CSV; anything else in the file is not loaded.
columns = [
    # identifiers
    "commenter_id",
    "comment_id",
    "parent_id",
    "post_id",
    # raw text and its two cleaned variants
    "comment_content",
    "cleaned_content_LSA",
    "cleaned_content_LIWC",
    # per-comment counts and flags
    "num_emojis",
    "num_text_emojis",
    "num_caps_words",
    "num_unicode_chars",
    "contains_media",
    "contains_link",
    "num_tagged_people",
    "tagged_grok",
    "used_slang",
]

df = pd.read_csv(csv_path, usecols=columns)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,commenter_id,comment_id,parent_id,post_id,comment_content,cleaned_content_LSA,cleaned_content_LIWC,num_emojis,num_text_emojis,num_caps_words,num_unicode_chars,contains_media,contains_link,num_tagged_people,tagged_grok,used_slang
0,AdamParkhomenko,1,,1,https://t.co/rAkU7CWOVE,link,link,0,0,0,0,False,True,0,False,False
1,SusanSaoirse,2,1.0,1,"@AdamParkhomenko Thing is, a good number of ma...",thing good number maga use medicaid medicare t...,"thing is, a good number of maga use medicaid o...",0,0,0,0,False,False,0,False,False
2,RealStarTrump,3,1.0,1,@AdamParkhomenko You‚Äôre a lying bastard. https...,you lying bastard link,you are a lying bastard. link,0,0,0,1,False,True,0,False,False
3,catothewis13876,4,1.0,1,@AdamParkhomenko The false premise is that ill...,false premise illegals get less than american ...,the false premise is that illegals get less th...,0,0,0,0,False,True,0,False,False
4,masterson11776,5,1.0,1,@AdamParkhomenko Emergency rooms in So Cal are...,emergency rooms so cal full illegal brown people,emergency rooms in so cal are full of illegal ...,0,0,0,1,False,False,0,False,False


The loaded content is already-cleaned text. We will double-check whether any further cleaning is needed. Here is the checklist:
- Case normalization
- Remove digits and words with digits
- Remove punctuation
- Remove special characters
- Remove extra whitespace
- Handle contractions

In [6]:
import pandas as pd
import re

def validate_cleaning(df, columns_to_check, output_log_path="cleaning_validation_log.csv", digits_log_path="digits_words.txt"):
    """
    Validate that already-cleaned text columns meet the cleaning criteria.

    Every non-NaN cell of each column in ``columns_to_check`` is checked for:
    - Lowercase only (no character that lowercasing would change)
    - No digits
    - No punctuation/special chars (underscores included)
    - No leading/trailing whitespace and no runs of 2+ whitespace characters
    - No contractions (implied by the no-punctuation/apostrophe rule)

    Handling of findings:
    - NaN cells are skipped and never logged.
    - If a cell contains digits, every word containing a digit is collected and
      the unique words are written (sorted, one per line) to ``digits_log_path``.
      Digits alone are not a failure, and the punctuation/whitespace checks are
      skipped for that cell; only the case check still applies to it.
    - Any other problem is recorded as a failure, written to
      ``output_log_path`` as CSV and printed.

    Args:
        df: DataFrame holding the text columns. If it has a "comment_id"
            column, that id is included in the failure log; otherwise "N/A".
        columns_to_check: Iterable of column names to validate.
        output_log_path: CSV path for the failure log (written only on failure).
        digits_log_path: Text path for words containing digits (written only
            when at least one such word was found).

    Returns: True if no failure was recorded, False otherwise.

    Raises:
        KeyError: If a name in ``columns_to_check`` is not a column of ``df``.
    """

    failed_rows = []
    digits_words = []

    # Regex patterns
    pat_digits = re.compile(r'\d')
    # Anything that is neither a word character nor whitespace, plus "_".
    # In Python, \w also matches digits, "_" and non-ASCII letters, so the
    # underscore has to be listed explicitly to be reported as punctuation.
    pat_punct_special = re.compile(r'[^\w\s]|_')
    pat_double_space = re.compile(r'\s{2,}')
    # Pattern to find whole words containing at least one digit
    pat_word_with_digits = re.compile(r'\b\w*\d\w*\b')

    print(f"Starting validation on columns: {columns_to_check}...")

    # Resolve the ids once instead of building a Series per row via iterrows(),
    # which is slow on large frames and can silently change value dtypes.
    if "comment_id" in df.columns:
        comment_ids = df["comment_id"].tolist()
    else:
        comment_ids = ["N/A"] * len(df)

    for col in columns_to_check:
        for index, text, comment_id in zip(df.index, df[col], comment_ids):
            # Skip NaN as per user request (do not log)
            if pd.isna(text):
                continue

            text = str(text)
            reasons = []

            # 1. Case normalization check.
            # Compare against the lowercased text rather than using
            # str.islower(): islower() is False for strings without any cased
            # character (e.g. "2024"), which would be a false "uppercase" hit.
            if text != text.lower():
                reasons.append("Contains uppercase")

            # 2. Digits check
            if pat_digits.search(text):
                # Collect the offending words; digits are not a CSV-logged reason
                digits_words.extend(pat_word_with_digits.findall(text))
            else:
                # Only check other issues if no digits
                # 3. Punctuation / Special Characters / Contractions check
                if pat_punct_special.search(text):
                    reasons.append("Contains punctuation/special chars/apostrophes")

                # 4. Whitespace check
                if text != text.strip():
                    reasons.append("Leading/Trailing whitespace")
                if pat_double_space.search(text):
                    reasons.append("Contains double spaces")

            # If any non-digits reasons failed, log the row
            if reasons:
                failed_rows.append({
                    "row_index": index,
                    "comment_id": comment_id,
                    "column": col,
                    "original_text": text,
                    "reason": "; ".join(reasons)
                })

    # Log digits words to txt file (unique and sorted, so reruns produce the same file)
    if digits_words:
        with open(digits_log_path, "w", encoding="utf-8") as f:
            for word in sorted(set(digits_words)):
                f.write(word + "\n")
        print(f"üìù Words containing digits logged to: {digits_log_path}")

    # Generate Report for other failures
    if failed_rows:
        error_df = pd.DataFrame(failed_rows)
        error_df.to_csv(output_log_path, index=False)
        print("‚ùå Validation FAILED.")
        print(f"Found {len(failed_rows)} non-digits issues. Log saved to: {output_log_path}")
        print("Listing all failed cases (non-NaN, non-digits):")
        for failure in failed_rows:
            print(f"Row {failure['row_index']}: {failure['original_text']} - Reason: {failure['reason']}")
        return False

    print("‚úÖ Validation PASSED. All text is clean (ignoring digits and NaN).")
    return True

In [7]:
# Validate the LSA text column; failures are written to cleaning_errors.csv.
cols_to_validate = ["cleaned_content_LSA"]
is_valid = validate_cleaning(
    df,
    cols_to_validate,
    output_log_path="cleaning_errors.csv",
)

Starting validation on columns: ['cleaned_content_LSA']...
üìù Words containing digits logged to: digits_words.txt
‚ùå Validation FAILED.
Found 3 non-digits issues. Log saved to: cleaning_errors.csv
Listing all failed cases (non-NaN, non-digits):
Row 83208: intolerance demanding tolerance--gt very well put i like your wording - Reason: Contains punctuation/special chars/apostrophes
Row 100415: when you make rational conversation impossible you make irrational conversation inevitable-tag - Reason: Contains punctuation/special chars/apostrophes
Row 117680: spot on--facts facts my opinions my own i stand them - Reason: Contains punctuation/special chars/apostrophes


# Install necessary packages

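Run the following in a notebook cell (`%pip` installs into the environment of the running kernel):

```python
%pip install nltk
```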

Then, download the necessary NLTK resources:

```python
import nltk
nltk.download('words')
```

Besides `words`, you might also consider downloading `punkt` and `stopwords` if they are needed for further text processing:

```python
nltk.download('punkt')
nltk.download('stopwords')
```


In [None]:
from nltk.corpus import words
# Hold the NLTK "words" corpus as a set so membership tests are hash lookups.
english_vocab = set(
    words.words()
)

# Sanity check on a common word.
print("stupid" in english_vocab)  # True