<a href="https://colab.research.google.com/github/ranwiththecode/high-fantasy-data-analysis/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import re
import pandas as pd
from google.colab import drive
from tqdm import tqdm

# Mount Google Drive
# The input/output CSV paths used further down live under this mount point.
drive.mount('/content/drive')

# Expanded emoji dictionary
# Maps an emoji string to a bracketed ASCII token. preprocess_text() replaces
# each key by exact substring match, in the insertion order shown here.
# NOTE(review): some keys (e.g. the facepalm entry) are multi-code-point
# sequences; a variant of the same emoji spelled with different code points
# will not match -- confirm against the raw data if coverage matters.
EMOJI_DICT = {
    "😊": "[smiling_face]",
    "😂": "[laughing_face]",
    "❤️": "[heart]",
    "😍": "[heart_eyes]",
    "😒": "[unamused]",
    "👍": "[thumbs_up]",
    "🙏": "[praying_hands]",
    "💕": "[two_hearts]",
    "😘": "[kissing_face]",
    "😎": "[cool_face]",
    "🤦‍♀️": "[facepalm]",
    "🥶": "[cold_face]",
    "⭐": "[star]",
    "📚": "[books]",
    "✨": "[sparkles]",
    "🎭": "[drama]",
    "🔥": "[fire]",
    "🤔": "[thinking]",
    "😢": "[crying]",
    "😡": "[angry]",
    "🤩": "[starstruck]",
    # Add more as needed
}

def preprocess_text(text):
    """Clean one raw review string.

    Steps, in order:
      1. Strip HTML comments and tags, replacing each with a space.
      2. Collapse every run of whitespace to a single space and trim the ends.
      3. Replace each emoji listed in the module-level ``EMOJI_DICT`` with its
         bracketed text description.

    Args:
        text: The raw review. Any non-string value (e.g. NaN from pandas)
            is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Basic HTML tag removal (without BeautifulSoup).
    # Only spans that look like real markup are removed: a comment, or '<'
    # followed by an optional '/' or '!' and then a letter. A bare '<' as in
    # "<3" or "a < b" is ordinary text and must not swallow everything up to
    # the next '>'.
    # Tags become a space rather than nothing so that "one<br />two" does not
    # fuse into "onetwo"; the whitespace pass below removes any surplus.
    clean_text = re.sub(
        r'<!--.*?-->|<[/!]?[A-Za-z][^>]*>',
        ' ',
        text,
        flags=re.DOTALL,
    )

    # Standardize whitespace
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    # Convert emojis to text descriptions
    for emoji, desc in EMOJI_DICT.items():
        clean_text = clean_text.replace(emoji, desc)

    return clean_text

def is_english(text, min_english_chars=0.7):
    """Heuristic English check based on the share of ASCII characters.

    Args:
        text: Candidate string. Non-string values are never English.
        min_english_chars: Minimum fraction (0-1) of characters that must be
            ASCII for the text to count as English.

    Returns:
        True when the ASCII fraction is at least ``min_english_chars``.
        An empty string has a fraction of 0.
    """
    if not isinstance(text, str):
        return False

    # Encoding with errors='ignore' drops every non-ASCII character, and each
    # remaining character is exactly one byte, so the byte count is the
    # number of ASCII characters.
    ascii_count = len(text.encode('ascii', 'ignore'))
    # Fall back to 1 for the empty string so the division is always defined.
    denominator = len(text) or 1

    ascii_ratio = ascii_count / denominator
    return ascii_ratio >= min_english_chars

def preprocess_reviews_file(input_path, output_path):
    """Clean a CSV of reviews and write the English-only rows to a new CSV.

    The input must contain a 'text' column. Two columns are added before
    saving: 'clean_text' (output of preprocess_text) and 'is_english'
    (output of is_english on the cleaned text). Only rows whose
    'is_english' value is truthy are written.

    Args:
        input_path: Path of the raw CSV to read.
        output_path: Path of the cleaned CSV to write (UTF-8, no index).

    Returns:
        True on success. False if the file cannot be read, the 'text'
        column is missing, or the output cannot be written; a message
        describing the failure is printed in each case.
    """
    # Load the raw reviews; report and bail out on any read failure.
    try:
        reviews = pd.read_csv(input_path)
    except Exception as read_error:
        print(f"Error reading file: {read_error}")
        return False

    # The rest of the pipeline is keyed on the 'text' column.
    if 'text' not in reviews.columns:
        print("Error: 'text' column not found in input file")
        return False

    # Clean every review, showing a progress bar.
    tqdm.pandas(desc="Preprocessing text")
    raw_text = reviews['text']
    reviews['clean_text'] = raw_text.progress_apply(preprocess_text)

    # Flag English rows on the cleaned text, then keep only those rows.
    tqdm.pandas(desc="Checking language")
    cleaned_text = reviews['clean_text']
    reviews['is_english'] = cleaned_text.progress_apply(is_english)
    english_mask = reviews['is_english']
    english_reviews = reviews[english_mask].copy()

    # Persist the filtered frame and summarise what was kept.
    try:
        english_reviews.to_csv(output_path, index=False, encoding='utf-8')
        print(f"\nSuccess! Cleaned data saved to: {output_path}")
        print(f"Original reviews: {len(reviews)} | English reviews: {len(english_reviews)}")
        return True
    except Exception as write_error:
        print(f"Error saving file: {write_error}")
        return False

# Example usage in Colab
# Runs the full pipeline for one book and prints a short sample of the result.
if __name__ == "__main__":
    # Set your file paths
    BOOK_TITLE = "magic_steeped_in_poison"  # Change this to match your file name
    INPUT_PATH = f'/content/drive/MyDrive/Goodreads_Data/{BOOK_TITLE}_raw.csv'
    OUTPUT_PATH = f'/content/drive/MyDrive/Goodreads_Data/{BOOK_TITLE}_clean.csv'

    # Run preprocessing
    print(f"Starting preprocessing for {BOOK_TITLE}...")
    success = preprocess_reviews_file(INPUT_PATH, OUTPUT_PATH)

    if success:
        # Show sample of cleaned data (re-read from disk to confirm the
        # saved file loads back).
        cleaned_df = pd.read_csv(OUTPUT_PATH)
        print("\nSample of cleaned reviews:")
        # Cap the column width: without a limit, to_string() pads every row
        # to the length of the longest review, producing lines thousands of
        # characters wide that are unreadable in the notebook output.
        print(
            cleaned_df[['text', 'clean_text']]
            .head()
            .to_string(index=False, max_colwidth=80)
        )

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Starting preprocessing for magic_steeped_in_poison...


Preprocessing text: 100%|██████████| 6810/6810 [00:00<00:00, 33150.01it/s]
Checking language: 100%|██████████| 6810/6810 [00:00<00:00, 36195.27it/s]



Success! Cleaned data saved to: /content/drive/MyDrive/Goodreads_Data/magic_steeped_in_poison_clean.csv
Original reviews: 6810 | English reviews: 6790

Sample of cleaned reviews:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    