<a href="https://colab.research.google.com/github/pranjalmuchandi/ml-notebook-colab/blob/main/Aspect%E2%80%91Based_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Aspect‑Based Sentiment Analysis

# Importing libraries

In [11]:
import pandas as pd
import numpy as np
import re
import joblib
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords


# Download required NLTK data

In [13]:
# Fetch the NLTK stopword corpus (quiet=True suppresses the download log) and
# keep the English list as a set; advanced_clean_text tests each token for
# membership in `stop_words`.
nltk.download('stopwords', quiet=True)
stop_words = {*stopwords.words('english')}
print("Complete Movie ABSA System Starting...")

Complete Movie ABSA System Starting...


# Generating the synthetic dataset

In [23]:
def generate_movie_dataset(n_samples=3000, seed=42,
                           output_path='movie_reviews_perfect_aspects.csv'):
    """Generate a synthetic, template-based movie-review dataset with aspect labels.

    Each review is one of the hand-written templates below (cycled in order)
    followed by a randomly chosen suffix from `variations`. Every template
    carries a sentiment label in {-1, 0, 1} for each of the five aspects;
    label noise is then added per aspect.

    Parameters
    ----------
    n_samples : int, default 3000
        Number of reviews to generate. Zero yields an empty frame that still
        has all six columns.
    seed : int or None, default 42
        Seed for a private ``np.random.RandomState``. The default reproduces
        the stream previously obtained via ``np.random.seed(42)``, but the
        process-wide NumPy RNG is no longer reseeded as a side effect.
    output_path : str, path-like or None, default 'movie_reviews_perfect_aspects.csv'
        Where the CSV is written. Pass ``None` to skip writing.

    Returns
    -------
    pandas.DataFrame
        Columns ``review`` and ``sentiment_<aspect>`` for acting, story,
        direction, music and visuals.
    """
    aspects = ['acting', 'story', 'direction', 'music', 'visuals']

    # Private generator: same sequence as the legacy global RNG seeded with
    # `seed`, without clobbering random state used elsewhere in the notebook.
    rng = np.random.RandomState(seed)

    # Hand-written review templates covering a range of aspect combinations.
    templates = [
        # === POSITIVE DOMINANT ===
        {"text": "Phenomenal acting, brilliant story, masterful direction.", "acting": 1, "story": 1, "direction": 1, "music": 1, "visuals": 1},
        {"text": "Outstanding performances elevated a solid screenplay.", "acting": 1, "story": 1, "direction": 0, "music": 0, "visuals": 0},
        {"text": "Direction was genius, visuals breathtaking.", "acting": 0, "story": 0, "direction": 1, "music": 0, "visuals": 1},
        {"text": "Music score was hauntingly beautiful throughout.", "acting": 0, "story": 0, "direction": 0, "music": 1, "visuals": 0},

        # === NEGATIVE DOMINANT ===
        {"text": "Wooden acting ruined the entire experience.", "acting": -1, "story": 0, "direction": 0, "music": 0, "visuals": 0},
        {"text": "Predictable plot with terrible pacing.", "acting": 0, "story": -1, "direction": -1, "music": 0, "visuals": 0},
        {"text": "Direction was amateurish and confusing.", "acting": 0, "story": 0, "direction": -1, "music": 0, "visuals": -1},
        {"text": "Music was intrusive and mismatched.", "acting": 0, "story": 0, "direction": 0, "music": -1, "visuals": 0},

        # === MIXED SENTIMENTS (most realistic) ===
        {"text": "Great acting but story felt predictable.", "acting": 1, "story": -1, "direction": 0, "music": 0, "visuals": 0},
        {"text": "Stunning visuals saved a weak storyline.", "acting": 0, "story": -1, "direction": 0, "music": 0, "visuals": 1},
        {"text": "Solid direction despite poor acting.", "acting": -1, "story": 0, "direction": 1, "music": 0, "visuals": 0},
        {"text": "Brilliant music but direction fell flat.", "acting": 0, "story": 0, "direction": -1, "music": 1, "visuals": 0},

        # === GENRE-SPECIFIC ===
        {"text": "Action sequences brilliant, story paper-thin.", "acting": 0, "story": -1, "direction": 1, "music": 1, "visuals": 1},
        {"text": "Horror visuals terrifying, acting laughable.", "acting": -1, "story": 0, "direction": 0, "music": 1, "visuals": 1},
        {"text": "Romantic chemistry electric, plot cliché.", "acting": 1, "story": -1, "direction": 0, "music": 1, "visuals": 0},
        {"text": "Sci-fi concepts brilliant, execution poor.", "acting": 0, "story": 1, "direction": -1, "music": 0, "visuals": 1},

        # === CRITIC-STYLE REVIEWS ===
        {"text": "Tour de force performances in character-driven drama.", "acting": 1, "story": 1, "direction": 1, "music": 0, "visuals": 1},
        {"text": "Visually arresting but narratively hollow.", "acting": 0, "story": -1, "direction": 1, "music": 0, "visuals": 1},
        {"text": "Masterclass in filmmaking craftsmanship.", "acting": 1, "story": 1, "direction": 1, "music": 1, "visuals": 1},
        {"text": "Technical brilliance undermined by weak script.", "acting": 0, "story": -1, "direction": 1, "music": 1, "visuals": 1},

        # === MORE VARIATIONS ===
        {"text": "Lead performance career-best, supporting cast weak.", "acting": 1, "story": 0, "direction": 0, "music": 0, "visuals": 0},
        {"text": "Twist ending saved mediocre first half.", "acting": 0, "story": 1, "direction": 0, "music": 0, "visuals": 0},
        {"text": "Cinematography award-worthy, editing choppy.", "acting": 0, "story": 0, "direction": 0, "music": 0, "visuals": 1},
        {"text": "Score perfectly captured emotional beats.", "acting": 0, "story": 1, "direction": 1, "music": 1, "visuals": 0},
        {"text": "Pacing issues killed momentum despite good cast.", "acting": 1, "story": 0, "direction": -1, "music": 0, "visuals": 0},
        {"text": "Third act collapse after strong setup.", "acting": 0, "story": -1, "direction": 0, "music": 0, "visuals": 0},
        {"text": "Production design immersive and detailed.", "acting": 0, "story": 0, "direction": 1, "music": 0, "visuals": 1},
        {"text": "Dialogue felt unnatural despite good premise.", "acting": 0, "story": -1, "direction": 0, "music": 0, "visuals": 0},
        {"text": "Ensemble cast delivered uniformly strong work.", "acting": 1, "story": 1, "direction": 1, "music": 0, "visuals": 0},
        {"text": "Sound design exceptional for genre.", "acting": 0, "story": 0, "direction": 0, "music": 1, "visuals": 1},

        # === EDGE CASES ===
        {"text": "Perfectly adequate, nothing memorable.", "acting": 0, "story": 0, "direction": 0, "music": 0, "visuals": 0},
        {"text": "Technically proficient but emotionally cold.", "acting": 0, "story": -1, "direction": 1, "music": 0, "visuals": 1},
        {"text": "Ambitious swing that didn't quite land.", "acting": 1, "story": 0, "direction": 0, "music": 1, "visuals": 1},
        {"text": "Guilty pleasure with terrible writing.", "acting": 1, "story": -1, "direction": 0, "music": 1, "visuals": 0},
    ]

    # Suffixes appended to a template; the empty string leaves it unchanged.
    variations = [
        " Fantastic film!", " Disappointing.", " Worth watching.", " Masterpiece!", " Total waste.",
        " Surprisingly good.", " Highly recommended.", " Skip this one.", " Cult classic potential.",
        " Oscar-worthy.", " Box office bomb.", " Critics loved it.", " Audience favorite.",
        " Visually stunning.", " Emotionally resonant.", " Intellectually stimulating.", ""
    ]

    data = []
    for i in range(n_samples):
        template = templates[i % len(templates)]
        review = template['text'] + rng.choice(variations)

        row = {'review': review}
        for aspect in aspects:
            # Label noise: shift by -1 or +1 with probability 0.04 each, then
            # clip to [-1, 1]. A shift that would leave the range is a no-op
            # (e.g. +1 on a label that is already 1), so the effective flip
            # rate is 8% for neutral labels and 4% for the extreme ones.
            true_sent = template[aspect]
            noisy_sent = true_sent + rng.choice([-1, 0, 1], p=[0.04, 0.92, 0.04])
            row[f'sentiment_{aspect}'] = np.clip(noisy_sent, -1, 1)
        data.append(row)

    # Explicit columns keep the schema intact even when no rows were generated.
    columns = ['review'] + [f'sentiment_{aspect}' for aspect in aspects]
    df = pd.DataFrame(data, columns=columns)
    if output_path is not None:
        df.to_csv(output_path, index=False)

    print(f" ENHANCED Movie dataset created: {df.shape[0]} reviews")
    print(f" Template diversity: {len(templates)} unique patterns")
    print("\nLabel balance check:")
    for aspect in aspects:
        print(f"  {aspect}: {df[f'sentiment_{aspect}'].value_counts().sort_index().to_dict()}")

    return df


# Cleaning the dataset

In [24]:
def advanced_clean_text(text):
    """Normalise a raw review into a space-joined string of content tokens.

    The input is stringified and lower-cased, then three regex passes replace
    HTML-like tags, every character outside ``a-z``/``0-9``/whitespace, and
    whitespace runs with a single space. Tokens found in the module-level
    `stop_words` set, or shorter than three characters, are discarded.
    """
    cleaned = str(text).lower()
    # Order matters: tags first, then stray symbols, then collapse whitespace.
    for pattern in (r"<.*?>", r"[^a-z0-9\s]", r"\s+"):
        cleaned = re.sub(pattern, " ", cleaned)

    kept_tokens = []
    for token in cleaned.strip().split():
        if token in stop_words or len(token) <= 2:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Training and evaluating the aspect models

In [25]:
def train_aspect_models(df):
    """Fit a shared TF-IDF vectoriser and one logistic-regression model per aspect.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``review`` column and one ``sentiment_<aspect>`` column
        for each of acting, story, direction, music and visuals. Rows whose
        label for an aspect is missing are skipped for that aspect only.
        The frame is not modified.

    Returns
    -------
    (tfidf, aspect_models)
        The fitted ``TfidfVectorizer`` and a dict mapping aspect name to its
        fitted ``LogisticRegression``.

    Side effects
    ------------
    Prints progress and a weighted F1 score per aspect, and writes
    ``movie_absa_tfidf.pkl`` and ``movie_absa_models.pkl`` to the working
    directory.

    Notes
    -----
    The vectoriser is fit on every review before the train/test split, so the
    held-out rows influence the vocabulary and IDF weights. The printed F1
    scores are therefore optimistic estimates.
    """
    # Clean into a local Series so the caller's DataFrame gains no extra column.
    clean_reviews = df['review'].apply(advanced_clean_text)

    # Shared TF-IDF over uni- to tri-grams.
    tfidf = TfidfVectorizer(
        max_features=15000,
        ngram_range=(1, 3),
        min_df=3,
        sublinear_tf=True,
        lowercase=True
    )
    X = tfidf.fit_transform(clean_reviews.values)
    print(f" TF-IDF vocabulary size: {len(tfidf.vocabulary_)}")

    aspect_cols = ['acting', 'story', 'direction', 'music', 'visuals']
    aspect_models = {}

    print("\n Training aspect classifiers...")
    for aspect in aspect_cols:
        print(f"Training {aspect}...")

        col = f'sentiment_{aspect}'
        # Positions of rows that carry a label for this aspect.
        labelled = np.flatnonzero(df[col].notna().values)
        y = df[col].iloc[labelled].astype(int).values

        # Reuse the rows already vectorised above; TF-IDF is computed per
        # row with fixed IDF weights, so slicing matches re-transforming.
        X_aspect = X[labelled]
        X_train, X_test, y_train, y_test = train_test_split(
            X_aspect, y, test_size=0.2, random_state=42, stratify=y)

        clf = LogisticRegression(C=2.0, max_iter=2000, n_jobs=-1)
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)
        print(f"{aspect} F1-score: {classification_report(y_test, y_pred, output_dict=True)['weighted avg']['f1-score']:.3f}")

        aspect_models[aspect] = clf

    # Persist the vectoriser and the per-aspect models for later sessions.
    joblib.dump(tfidf, 'movie_absa_tfidf.pkl')
    joblib.dump(aspect_models, 'movie_absa_models.pkl')

    print("All models saved!")
    return tfidf, aspect_models

# Predicting aspect sentiment for new reviews

In [26]:
def predict_movie_aspects(review_text, tfidf, aspect_models):
    """Score one review against every aspect model.

    The review is cleaned with `advanced_clean_text`, vectorised with `tfidf`,
    and passed to each model in `aspect_models`.

    Returns a ``(results, clean)`` tuple: ``results`` maps each aspect name to
    a dict holding the display label under ``'sentiment'`` and the highest
    class probability, formatted as a percentage string, under
    ``'confidence'``; ``clean`` is the cleaned review text.
    """
    label_map = {-1: " Negative", 0: "➖ Neutral", 1: " Positive"}

    clean = advanced_clean_text(review_text)
    features = tfidf.transform([clean])

    def _summarise(model):
        # Predicted class first, then the probability of the most likely class.
        predicted = model.predict(features)[0]
        top_probability = model.predict_proba(features)[0].max()
        return {
            'sentiment': label_map[predicted],
            'confidence': f"{top_probability:.1%}",
        }

    results = {aspect: _summarise(model) for aspect, model in aspect_models.items()}
    return results, clean

# Main Execution

In [None]:
def main():
    """Run the end-to-end pipeline, then serve an interactive prediction loop.

    Steps:
      1. Load ``movie_reviews_perfect_aspects.csv`` if it exists, otherwise
         generate it.
      2. Load the pickled vectoriser and models if both files exist,
         otherwise train them from the dataset.
      3. Repeatedly read a review from stdin and print the per-aspect
         predictions until the user types ``quit``/``exit``/``q``, stdin is
         closed, or the prompt is interrupted.
    """
    print("="*60)
    print(" MOVIE REVIEW ASPECT SENTIMENT ANALYSIS")
    print("IEEE Topic: Aspect-Based Sentiment Analysis of Movie Reviews")
    print("="*60)

    # Step 1: Generate dataset (or reuse the cached CSV)
    if not os.path.exists('movie_reviews_perfect_aspects.csv'):
        df = generate_movie_dataset()
    else:
        df = pd.read_csv('movie_reviews_perfect_aspects.csv')
        print(f" Loaded existing dataset: {df.shape[0]} reviews")

    # Step 2: Train models (or reuse the cached ones)
    if os.path.exists('movie_absa_models.pkl') and os.path.exists('movie_absa_tfidf.pkl'):
        print(" Loading existing models...")
        # NOTE: joblib.load unpickles the file and can execute arbitrary code;
        # only load .pkl files that this notebook produced itself.
        tfidf = joblib.load('movie_absa_tfidf.pkl')
        aspect_models = joblib.load('movie_absa_models.pkl')
    else:
        tfidf, aspect_models = train_aspect_models(df)

    # Step 3: Interactive prediction loop
    print("\n READY FOR PREDICTIONS!")
    print("Type 'quit' to exit")
    print("-" * 40)

    while True:
        try:
            review = input("\n Enter your movie review: ").strip()
        except (EOFError, KeyboardInterrupt):
            # stdin was closed (e.g. non-interactive run) or the prompt was
            # interrupted: leave the loop instead of ending in a traceback.
            print()
            break

        if review.lower() in ['quit', 'exit', 'q']:
            break

        if not review:
            print("Please enter a review!")
            continue

        results, cleaned = predict_movie_aspects(review, tfidf, aspect_models)

        print("\n ASPECT ANALYSIS RESULTS:")
        print("=" * 40)
        for aspect, res in results.items():
            print(f" {aspect.capitalize():<10} | {res['sentiment']:<12} | Confidence: {res['confidence']}")
        print(f"\n Cleaned review: {cleaned[:100]}...")
        print("-" * 40)

# Entry-point guard: start the pipeline only when this code is executed as the
# main module, not when it is imported.
if __name__ == "__main__":
    main()

 MOVIE REVIEW ASPECT SENTIMENT ANALYSIS
IEEE Topic: Aspect-Based Sentiment Analysis of Movie Reviews
 Loaded existing dataset: 2000 reviews
 Loading existing models...

 READY FOR PREDICTIONS!
Type 'quit' to exit
----------------------------------------

 ASPECT ANALYSIS RESULTS:
 Acting     |  Positive    | Confidence: 96.0%
 Story      |  Negative    | Confidence: 96.9%
 Direction  | ➖ Neutral    | Confidence: 91.1%
 Music      | ➖ Neutral    | Confidence: 93.6%
 Visuals    | ➖ Neutral    | Confidence: 88.7%

 Cleaned review: acting phenomenal story felt predictable boring...
----------------------------------------
