In [None]:
import os
import re
from docx import Document
from textblob import TextBlob
import nltk


In [None]:
# Ensure the NLTK sentence-tokenizer data used by nltk.sent_tokenize is present.
# Older NLTK releases load the 'punkt' models, while recent releases (3.9+)
# look for 'punkt_tab' instead, so fetch both to work on either.
for _tokenizer_resource in ('punkt', 'punkt_tab'):
    nltk.download(_tokenizer_resource)

In [None]:
def read_docx(file_path):
    """Return the text of every body paragraph in a .docx file.

    The document at ``file_path`` is opened with ``Document`` and the ``text``
    of each entry in ``doc.paragraphs`` is joined with newline characters, so
    one output line corresponds to one paragraph (empty paragraphs become
    empty lines).
    """
    document = Document(file_path)
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)

In [None]:
def write_docx(text, output_path):
    """Save ``text`` as a new .docx file at ``output_path``.

    The text is split on newline characters and each piece is added as its
    own paragraph (an empty piece yields an empty paragraph) before the
    document is saved.
    """
    document = Document()
    add_paragraph = document.add_paragraph
    for line in text.split('\n'):
        add_paragraph(line)
    document.save(output_path)

In [None]:
def remove_metadata_and_watermarks(text):
    """Strip boilerplate AI disclaimers and bracketed timestamps from ``text``.

    The patterns are applied one after another, case-insensitively, and every
    match is deleted outright; whitespace around a deleted match is left as
    it was. Leading and trailing whitespace of the final result is trimmed.
    """
    regex_flags = re.IGNORECASE | re.MULTILINE
    matchers = tuple(
        re.compile(source, regex_flags)
        for source in (
            r"This content was generated by.*?\.",   # disclaimer up to the next period
            r"Generated by ChatGPT.*?\.",            # model mention up to the next period
            r"OpenAI.*?\.",                          # brand mention up to the next period
            r"AI-generated content",                 # generic label
            r"\[\d{4}-\d{2}-\d{2}.*?\]",             # bracketed span starting with YYYY-MM-DD
        )
    )

    result = text
    for matcher in matchers:
        result = matcher.sub('', result)
    return result.strip()

In [None]:
def humanize_text(text):
    """Spell-correct ``text`` and swap a few stiff phrases for contractions.

    The input is processed line by line so that the newline-separated
    paragraph structure survives (tokenizing the whole text at once and
    re-joining with spaces would merge every paragraph into one). Within a
    line, each sentence found by ``nltk.sent_tokenize`` is run through
    ``TextBlob(...).correct()``; sentences longer than five words also have
    "This is", "It is" and "Do not" replaced by "Here's", "It's" and "Don't".

    Args:
        text: Newline-separated paragraphs of plain text.

    Returns:
        The rewritten text, with sentences of a paragraph joined by single
        spaces and paragraphs joined by newlines. Lines without any sentence
        (empty or whitespace-only) come back as empty lines.
    """
    # Match whole words only: a plain substring replace would turn
    # "This island" into "Here'sland" and "Do nothing" into "Don'thing".
    contractions = (
        (re.compile(r"\bThis is\b"), "Here's"),
        (re.compile(r"\bIt is\b"), "It's"),
        (re.compile(r"\bDo not\b"), "Don't"),
    )

    def _humanize_sentence(sentence):
        """Correct spelling and, for longer sentences, apply the contractions."""
        corrected = TextBlob(sentence).correct()
        rewritten = str(corrected)
        # Very short sentences are left alone apart from spelling correction.
        if len(corrected.words) > 5:
            for pattern, replacement in contractions:
                rewritten = pattern.sub(replacement, rewritten)
        return rewritten

    humanized_paragraphs = []
    for paragraph in text.split('\n'):
        sentences = nltk.sent_tokenize(paragraph)
        humanized_paragraphs.append(
            ' '.join(_humanize_sentence(sentence) for sentence in sentences)
        )
    return '\n'.join(humanized_paragraphs)

In [None]:
def clean_and_humanize_doc(input_path, output_path):
    """Run the read -> strip -> humanize -> write pipeline on one document.

    Progress is reported with ``print`` before each stage. Any ``Exception``
    raised along the way is caught and reported as a printed error message
    instead of propagating, so the function always returns ``None``.
    """
    try:
        # Stage 1: pull the text out of the source .docx.
        print("Reading document...")
        raw_text = read_docx(input_path)

        # Stage 2: drop disclaimer / timestamp boilerplate.
        print("Removing metadata and watermarks...")
        stripped_text = remove_metadata_and_watermarks(raw_text)

        # Stage 3: spelling correction and contractions.
        print("Humanizing text...")
        rewritten_text = humanize_text(stripped_text)

        # Stage 4: persist the result.
        print("Writing final document...")
        write_docx(rewritten_text, output_path)

        print(f"Done! Cleaned document saved at: {output_path}")
    except Exception as e:
        # Best-effort: report the failure rather than raising.
        print(f"❌ Error: {e}")

In [None]:
# ========== RUNNING EXAMPLE ==========
# Runs the whole pipeline on a sample file when this code executes as __main__.
if __name__ == "__main__":
    # Replace "input.docx" with your actual input file.
    input_docx, output_docx = "input.docx", "cleaned_output.docx"
    clean_and_humanize_doc(input_path=input_docx, output_path=output_docx)