<a href="https://colab.research.google.com/github/ranwiththecode/high-fantasy-data-analysis/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from tqdm import tqdm
import os
from google.colab import drive
import time

# Mount Google Drive
# CONFIG["input_file"] below points inside this mount point, so the mount
# has to happen before the analysis is run.
drive.mount('/content/drive')

# Initialize sentiment analyzer
# Fetch the VADER lexicon (quietly) before constructing the analyzer.
nltk.download('vader_lexicon', quiet=True)
# Module-level analyzer instance; analyze_sentiment_for_category uses it
# to score the collected keyword sentences.
sid = SentimentIntensityAnalyzer()

# Configuration (UPDATE THESE VALUES)
#   input_file             - CSV of reviews; the analysis requires 'text' and
#                            'rating' columns.
#   common_keywords        - generic phrases that refer to a protagonist.
#   book_specific_keywords - phrases specific to the book being analyzed.
#   timeout_seconds        - time budget for scanning the reviews of one
#                            analysis category.
# Keywords are lower-cased before matching, so their case does not matter.
CONFIG = {
    "input_file": "/content/drive/MyDrive/Goodreads_Data/tricksters_choice_clean.csv",
    "common_keywords": [
        "protagonist", "main character", "mc", "lead character", "main protagonist"
    ],
    "book_specific_keywords": [
        "female main character", "female protagonist", "FMC", "hero", "heroine", "Aly",
        # "Lioness" is the correct spelling; the earlier misspelling is kept
        # as well so reviews that contain it still match.
        "Daughter of the Lioness", "Daughter of the Lionness"
    ],
    "timeout_seconds": 300
}

def preprocess_text(text):
    """Return a lower-cased, punctuation-normalized copy of *text*.

    Curly apostrophes become straight ones and hyphens become spaces so
    that keyword matching sees a single canonical spelling. Anything that
    is not a string (for example a missing value) yields an empty string.
    """
    if isinstance(text, str):
        # One pass over the lower-cased text handles both substitutions.
        substitutions = str.maketrans({"’": "'", "-": " "})
        return text.lower().translate(substitutions)
    return ""

def expand_keyword_variants(keywords):
    """Generate lower-cased plural and possessive forms of each keyword.

    Each keyword is normalized the same way review text is (lower-cased,
    curly apostrophes straightened, hyphens turned into spaces) so that a
    keyword such as "Self-Insert" can still match the preprocessed text.

    Args:
        keywords: Iterable of keyword strings. Blank and non-string entries
            are skipped; a blank entry would otherwise become an empty
            regex alternative that matches everything.

    Returns:
        A list of unique variants in a deterministic order: for every
        keyword its base form, plural(s), possessive and, for keywords
        already ending in "s", the plural possessive.
    """
    # Endings whose regular English plural takes "es" (hero -> heroes).
    es_endings = ("x", "z", "ch", "sh", "o")

    expanded = []
    for kw in keywords:
        if not isinstance(kw, str):
            continue
        kw = kw.lower().replace("’", "'").replace("-", " ").strip()
        if not kw:
            continue

        expanded.append(kw)
        if not kw.endswith("s"):
            expanded.append(kw + "s")  # plural
            if kw.endswith(es_endings):
                expanded.append(kw + "es")  # plural taking "es"
        if not kw.endswith("'s"):
            expanded.append(kw + "'s")  # possessive
        if kw.endswith("s") and not kw.endswith("s'"):
            expanded.append(kw + "'")  # plural possessive

    # dict.fromkeys removes duplicates while keeping first-seen order,
    # unlike set(), whose string ordering varies between interpreter runs.
    return list(dict.fromkeys(expanded))

# Compiled keyword patterns, keyed by the tuple of keywords they were built
# from. The same few keyword lists are reused for every review, so building
# the variants and the regex once per list avoids repeating that work.
_KEYWORD_PATTERN_CACHE = {}


def _keyword_pattern(keywords):
    """Return a compiled regex matching any variant of *keywords*, or None.

    None is returned when there is nothing to search for (no keywords, or
    only empty variants); an empty alternation would match at every word
    boundary and therefore select every sentence.
    """
    cache_key = tuple(keywords)
    if cache_key not in _KEYWORD_PATTERN_CACHE:
        variants = [v for v in expand_keyword_variants(keywords) if v]
        if variants:
            compiled = re.compile(
                r'\b(?:' + '|'.join(map(re.escape, variants)) + r')\b'
            )
        else:
            compiled = None
        _KEYWORD_PATTERN_CACHE[cache_key] = compiled
    return _KEYWORD_PATTERN_CACHE[cache_key]


def extract_keyword_context(text, keywords):
    """Return the sentences of *text* that mention any keyword variant.

    Args:
        text: Raw review text. Non-string values are treated as empty.
        keywords: Sequence of keyword strings; plural and possessive
            variants are generated via expand_keyword_variants.

    Returns:
        A list of normalized (lower-cased) sentences containing at least
        one keyword variant as a whole word or phrase. The list is empty
        when the text is empty or there are no keywords to look for.
    """
    text = preprocess_text(text)
    if not text:
        return []

    pattern = _keyword_pattern(keywords)
    if pattern is None:
        return []

    # Split after sentence-ending punctuation so each sentence keeps its
    # own terminator.
    sentences = re.split(r'(?<=[.!?])\s+', text)

    return [s for s in sentences if pattern.search(s)]

def analyze_sentiment_for_category(df, keywords, rating_filter=None, desc=""):
    """Score the sentiment of keyword-bearing sentences in a review subset.

    Args:
        df: DataFrame with at least 'text' and 'rating' columns.
        keywords: Keywords whose containing sentences are collected.
        rating_filter: None to use every review, a missing value (e.g.
            pd.NA) to use only reviews without a rating, or a concrete
            rating to use only reviews with exactly that rating.
        desc: Human-readable label used in progress and log output.

    Returns:
        The VADER compound score of all matched sentences joined into one
        text, rounded to 4 decimals, or None when the subset is empty, no
        sentence matched, or scoring failed.
    """
    start_time = time.time()

    # Filter reviews
    if rating_filter is not None:
        subset = df[df['rating'].isna()] if pd.isna(rating_filter) else df[df['rating'] == rating_filter]
    else:
        subset = df

    if subset.empty:
        print(f"No reviews found for {desc}")
        return None

    # Extract keyword sentences with progress tracking. The context manager
    # closes the progress bar even when the loop stops early on timeout.
    keyword_sentences = []
    with tqdm(subset['text'], desc=f"Processing {desc}") as progress_bar:
        for text in progress_bar:
            # The time budget covers the whole scan; matches collected
            # before the timeout are still scored below.
            if time.time() - start_time > CONFIG['timeout_seconds']:
                print(f"\nTimeout reached for {desc}")
                break

            matches = extract_keyword_context(text, keywords)
            keyword_sentences.extend(matches)
            progress_bar.set_postfix({'matches': len(keyword_sentences)})

    # Debug output
    if not keyword_sentences:
        print("\nDebug: No matches found. Sample reviews:")
        for i, text in enumerate(subset['text'].sample(min(3, len(subset))), 1):
            # Review text may be missing (NaN) or otherwise non-string;
            # convert it so slicing and len() cannot raise.
            snippet = text if isinstance(text, str) else str(text)
            print(f"\nSample {i}:")
            print(snippet[:200] + ("..." if len(snippet) > 200 else ""))
        return None

    # Calculate combined sentiment
    # NOTE(review): the compound score is bounded, so scoring one long
    # concatenated string likely saturates near the extremes; averaging
    # per-sentence scores may be more informative. Confirm before changing,
    # since it would alter the reported values.
    combined_text = ' '.join(keyword_sentences)
    try:
        sentiment = sid.polarity_scores(combined_text)['compound']
        print(f"\nFound {len(keyword_sentences)} matches for {desc}")
        return round(sentiment, 4)
    except Exception as e:
        # Best effort: a scoring failure is reported and yields no result
        # rather than aborting the remaining analyses.
        print(f"\nSentiment calculation failed: {str(e)}")
        return None

def analyze_book_sentiment():
    """Load the configured review file, run each analysis, and save the scores.

    Reads the CSV named by CONFIG['input_file'], checks that it is non-empty
    and has 'text' and 'rating' columns, then computes one sentiment score
    per analysis category. The scores plus the book title (the input file
    name without extension) are written next to the input file as
    '<input>_sentiment_results.csv' and printed.

    Returns:
        A dict of the scores keyed by analysis name plus 'book_title', or
        None when the data could not be loaded or validated.
    """
    # Load and validate data
    try:
        source_path = CONFIG['input_file']
        print(f"\nLoading data from: {source_path}")
        df = pd.read_csv(source_path)

        if df.empty:
            print("Error: Empty DataFrame")
            return None

        # Report the first required column that is absent, if any.
        absent = [name for name in ('text', 'rating') if name not in df.columns]
        if absent:
            print(f"Error: Missing column '{absent[0]}'")
            return None

        # Treat textual null markers in the rating column as missing.
        df['rating'] = df['rating'].replace(['NULL', 'null', 'NA', ''], pd.NA)
        total_reviews = len(df)
        unrated_reviews = df['rating'].isna().sum()
        print(f"Loaded {total_reviews} reviews ({unrated_reviews} without ratings)")

    except Exception as e:
        print(f"\nFailed to load data: {str(e)}")
        return None

    # Each entry: result key, rating filter, label, and the CONFIG keyword
    # sections whose keywords are searched for in that analysis.
    every_section = ('common_keywords', 'book_specific_keywords')
    plan = (
        ('all_reviews', None, "All reviews", every_section),
        ('null_rated', pd.NA, "NULL-rated reviews", every_section),
        ('common_keywords', None, "Common keywords", ('common_keywords',)),
        ('book_keywords', None, "Book keywords", ('book_specific_keywords',)),
    )

    results = {}
    for key, rating_filter, desc, sections in plan:
        print(f"\n=== Analyzing {desc} ===")
        keywords = [kw for section in sections for kw in CONFIG[section]]

        score = analyze_sentiment_for_category(df, keywords, rating_filter, desc)
        results[key] = score
        shown = 'N/A' if score is None else score
        print(f"Result: {shown}")

    # Save and display results
    file_name = os.path.basename(source_path)
    results['book_title'] = os.path.splitext(file_name)[0]
    results_df = pd.DataFrame([results])

    path_without_ext = os.path.splitext(source_path)[0]
    output_path = path_without_ext + '_sentiment_results.csv'
    results_df.to_csv(output_path, index=False)

    print("\n=== FINAL RESULTS ===")
    print(results_df.transpose())
    print(f"\nResults saved to: {output_path}")

    return results

# Run the analysis
# Guarded so the analysis starts only when this code is executed directly,
# not when it is imported. final_results receives the dict returned by
# analyze_book_sentiment, or None if loading/validation failed.
if __name__ == "__main__":
    final_results = analyze_book_sentiment()