Customer Emotion Analysis System

In [None]:
import os
import pandas as pd
import numpy as np
import nltk
import gensim.downloader as api
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity


# 1. CONFIGURATION & SETUP


# Ensure required packages are installed:
# pip install gensim nltk textblob scikit-learn

# Download required NLTK data
nltk.download("vader_lexicon", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)

# File paths (adjust as needed)
INPUT_CSV = "/content/drive/MyDrive/dataset.csv"
OUTPUT_CSV = "/content/drive/MyDrive/updated_dataset_auto.csv"

# Number of clusters for word clustering
NUM_CLUSTERS = 8

# Predefined emotion categories for Word2Vec matching
EMOTION_CATEGORIES = {
    "joy": ["happy", "joyful", "excited", "cheerful", "delighted"],
    "sadness": ["sad", "depressed", "unhappy", "miserable"],
    "anger": ["angry", "furious", "frustrated", "irritated"],
    "fear": ["scared", "afraid", "terrified", "nervous"],
    "surprise": ["surprised", "amazed", "shocked"],
    "trust": ["trusting", "faithful", "reliable", "hopeful"],
    "disgust": ["disgusted", "nauseated", "repulsed"],
    "anticipation": ["eager", "hopeful", "expectant"],
}


# 2. LOAD DATA

df = pd.read_csv(INPUT_CSV)
df.columns = df.columns.str.strip()  # Clean column names

# Convert rating to numeric and drop rows with missing review or rating
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")
df.dropna(subset=["review", "rating"], inplace=True)


# 3. EXTRACT SENTIMENT WORDS & CLUSTERING


# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

def extract_sentiment_words(text: str) -> list:
    """Return words with significant sentiment (|compound| > 0.3)."""
    words = text.split()
    return [w for w in words if abs(sia.polarity_scores(w)["compound"]) > 0.3]

# Collect all sentiment words from reviews
all_words = []
df["review"].astype(str).apply(lambda x: all_words.extend(extract_sentiment_words(x)))
word_freq = Counter(all_words)
emotion_words = list(word_freq.keys())

# TF-IDF representation
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(emotion_words)

# KMeans clustering of emotion words
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42, n_init=10)
kmeans.fit(X)

# Build a mapping from cluster id to list of words
cluster_to_words = {}
for i, word in enumerate(emotion_words):
    cluster = kmeans.labels_[i]
    cluster_to_words.setdefault(cluster, []).append(word)

print("\n--- Generated Clusters ---")
for cluster, words in cluster_to_words.items():
    print(f"Cluster {cluster}: {words[:10]}{'...' if len(words)>10 else ''}")


# 4. ASSIGN MEANINGFUL EMOTION NAMES USING WORD2VEC


print("\nLoading Word2Vec model...")
word_vectors = api.load("word2vec-google-news-300")  # Pre-trained model

# Compute mean vector for each predefined emotion
emotion_vectors = {}
for emotion, synonyms in EMOTION_CATEGORIES.items():
    valid_syns = [word_vectors[w] for w in synonyms if w in word_vectors]
    if valid_syns:
        emotion_vectors[emotion] = np.mean(valid_syns, axis=0)

def get_cluster_emotion_label(words: list) -> str:
    """Assigns a meaningful emotion label to a cluster using cosine similarity."""
    valid_vecs = [word_vectors[w] for w in words if w in word_vectors]
    if not valid_vecs:
        return "neutral"
    cluster_mean = np.mean(valid_vecs, axis=0)
    best_emotion, best_score = "neutral", -1
    for emotion, em_vec in emotion_vectors.items():
        score = cosine_similarity([cluster_mean], [em_vec])[0][0]
        if score > best_score:
            best_score = score
            best_emotion = emotion
    return best_emotion

# Map each cluster to a meaningful emotion label
cluster_labels = {cluster: get_cluster_emotion_label(words) for cluster, words in cluster_to_words.items()}

print("\n--- Cluster Labels ---")
print(cluster_labels)


# 5. ASSIGN EMOTIONS TO REVIEWS (PRIMARY & SECONDARY)


def assign_emotions_to_text(text: str) -> tuple:
    """
    Assign primary and secondary emotions for a given text.
    Uses the cluster mapping and also considers overall sentiment
    to avoid mismatches (e.g., avoid negative labels if overall review is positive).
    """
    words = text.split()
    overall_sentiment = TextBlob(text).sentiment.polarity  # -1 to 1
    primary_emotion, secondary_emotion = "neutral", "neutral"
    primary_intensity, secondary_intensity = 0.0, 0.0

    for w in words:
        # Get the cluster for this word via TF-IDF representation
        w_vec = vectorizer.transform([w])
        cluster_id = kmeans.predict(w_vec)[0]
        emotion_label = cluster_labels.get(cluster_id, "neutral")

        # Filter based on overall sentiment: if review is positive, skip negative labels
        if overall_sentiment > 0.2 and emotion_label in ["sadness", "anger", "fear", "disgust"]:
            continue
        if overall_sentiment < -0.2 and emotion_label in ["joy", "trust", "anticipation", "surprise"]:
            continue

        if primary_emotion == "neutral" and emotion_label != "neutral":
            primary_emotion = emotion_label
            primary_intensity = round(np.random.uniform(0.6, 1.0), 2)
        elif secondary_emotion == "neutral" and emotion_label != "neutral" and emotion_label != primary_emotion:
            secondary_emotion = emotion_label
            secondary_intensity = round(np.random.uniform(0.3, 0.7), 2)

    # Fallback: if no emotion was assigned from clusters, use overall sentiment
    if primary_emotion == "neutral":
        if overall_sentiment > 0.2:
            primary_emotion = "joy"
            primary_intensity = round(overall_sentiment, 2)
        elif overall_sentiment < -0.2:
            primary_emotion = "sadness"
            primary_intensity = round(abs(overall_sentiment), 2)
    return primary_emotion, primary_intensity, secondary_emotion, secondary_intensity

# Assign topic from category column (if applicable)
df["topic"] = df["category"]

# Apply emotion assignment to each review
df["primary_emotion"], df["primary_intensity"], df["secondary_emotion"], df["secondary_intensity"] = zip(
    *df["review"].astype(str).apply(assign_emotions_to_text)
)


# 6. SENTIMENT & ADORESCORE CALCULATION


def compute_adorescore(rating, sentiment, intensity) -> float:
    """Compute a weighted score using rating, sentiment polarity, and intensity."""
    if pd.isna(rating) or pd.isna(sentiment) or pd.isna(intensity):
        return np.nan
    return round((rating / 5) * 0.4 + (sentiment * 0.3) + (intensity * 0.3), 2)

# Overall sentiment using TextBlob
df["sentiment"] = df["review"].apply(lambda x: TextBlob(x).sentiment.polarity if x.strip() else 0.0)

# Ensure intensities are numeric
df["primary_intensity"] = pd.to_numeric(df["primary_intensity"], errors="coerce")

# Compute AdoreScore for each review
df["adorescore"] = df.apply(
    lambda row: compute_adorescore(row["rating"], row["sentiment"], row["primary_intensity"]), axis=1
)
df["adorescore"].fillna(0, inplace=True)


# 7. SAVE FINAL OUTPUT

df.to_csv(OUTPUT_CSV, index=False)
print(f"\nAll done! Updated dataset saved to: {OUTPUT_CSV}")



--- Generated Clusters ---
Cluster 0: ['pretty', 'easy', 'clean,', 'easily', 'like', 'help', 'lose', 'fit', 'yummy', 'happy']...
Cluster 6: ['good', 'good,', 'good.', 'good!', 'Good', 'GOOD', 'good:', '"Good', '"good']
Cluster 4: ['better', 'better,', 'better!', 'better.', '"better', 'BETTER.', 'Better']
Cluster 3: ['great,', 'great', 'great.', 'Great', 'great!', 'Great!!!', 'GREAT!!', 'GREAT', 'GREAT.', 'Great!']
Cluster 2: ['bonus,', 'bonus', 'bonus?']
Cluster 7: ['love', 'love,', 'Love', 'LOVE', 'love.', 'Love,']
Cluster 5: ['friends,', 'friends.', 'friends', 'friends!', 'Friends']
Cluster 1: ['disappointing.', 'disappointing', 'disappointing!', 'disappointing!!!', 'disappointing,']

Loading Word2Vec model...

--- Cluster Labels ---
{0: 'sadness', 6: 'sadness', 4: 'trust', 3: 'joy', 2: 'joy', 7: 'joy', 5: 'joy', 1: 'sadness'}


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["adorescore"].fillna(0, inplace=True)



All done! Updated dataset saved to: /content/drive/MyDrive/updated_dataset_auto.csv


In [None]:
import nltk
nltk.download('stopwords')
import os
import re
import pickle
import numpy as np
import pandas as pd
import nltk
import gensim.downloader as api
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")


# 1. CONFIGURATION & SETUP


nltk.download("vader_lexicon", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)

# File paths
DATASET_CSV = "/content/drive/MyDrive/updated_dataset_auto.csv"
ADORESCORE_MODEL_PICKLE = "/content/drive/MyDrive/adorescore_model.pkl"
TOPIC_MODEL_PICKLE = "/content/drive/MyDrive/topic_classifier.pkl"

# Number of clusters for emotion word clustering
NUM_CLUSTERS = 8

# Predefined emotion categories for Word2Vec mapping
EMOTION_CATEGORIES = {
    "joy": ["happy", "joyful", "excited", "cheerful", "delighted"],
    "sadness": ["sad", "depressed", "unhappy", "miserable"],
    "anger": ["angry", "furious", "frustrated", "irritated"],
    "fear": ["scared", "afraid", "terrified", "nervous"],
    "surprise": ["surprised", "amazed", "shocked"],
    "trust": ["trusting", "faithful", "reliable", "hopeful"],
    "disgust": ["disgusted", "nauseated", "repulsed"],
    "anticipation": ["eager", "hopeful", "expectant"],
}

# Initialize stopwords, lemmatizer, and VADER sentiment analyzer
STOPWORDS = set(stopwords.words("english"))
LEMMATIZER = nltk.WordNetLemmatizer()
SIA = SentimentIntensityAnalyzer()


# 2. HELPER FUNCTIONS


def clean_text(text: str) -> str:
    """Clean text: lowercase, remove URLs/punctuation, tokenize, remove stopwords, and lemmatize."""
    try:
        text = str(text).lower()
        text = re.sub(r"http\S+", "", text)
        text = re.sub(r"[^\w\s]", "", text)
        tokens = word_tokenize(text)
        cleaned = [LEMMATIZER.lemmatize(token) for token in tokens if token not in STOPWORDS]
        return " ".join(cleaned)
    except Exception as e:
        print(f"Text cleaning error: {e}")
        return ""


# Emotion Assignment Pipeline


def extract_sentiment_words(text: str) -> list:
    """Extract words with significant sentiment (|compound| > 0.3)."""
    words = text.split()
    return [w for w in words if abs(SIA.polarity_scores(w)["compound"]) > 0.3]

def build_emotion_clusters(df: pd.DataFrame, review_col: str = "review") -> tuple:
    """
    Extract sentiment words from reviews, vectorize using TF-IDF,
    cluster them with KMeans, and return the vectorizer, KMeans model, and a cluster mapping.
    """
    all_words = []
    df[review_col].astype(str).apply(lambda x: all_words.extend(extract_sentiment_words(x)))
    word_freq = Counter(all_words)
    emotion_words = list(word_freq.keys())

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(emotion_words)

    kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=42, n_init=10)
    kmeans.fit(X)

    cluster_map = {}
    for i, word in enumerate(emotion_words):
        cluster = kmeans.labels_[i]
        cluster_map.setdefault(cluster, []).append(word)
    return vectorizer, kmeans, cluster_map

def load_word2vec_model():
    """Load the pre-trained Word2Vec model (Google News 300)."""
    print("Loading Word2Vec model...")
    return api.load("word2vec-google-news-300")

def build_emotion_vectors(wv_model, emotion_categories: dict) -> dict:
    """For each predefined emotion, compute the mean vector using valid synonyms."""
    emotion_vecs = {}
    for emotion, synonyms in emotion_categories.items():
        valid_vecs = [wv_model[w] for w in synonyms if w in wv_model]
        if valid_vecs:
            emotion_vecs[emotion] = np.mean(valid_vecs, axis=0)
    return emotion_vecs

def get_cluster_emotion_label(cluster_words: list, wv_model, emotion_vecs: dict) -> str:
    """
    Compute the mean embedding for a cluster and return the emotion
    with the highest cosine similarity from the predefined emotion vectors.
    """
    valid_vecs = [wv_model[w] for w in cluster_words if w in wv_model]
    if not valid_vecs:
        return "neutral"
    cluster_mean = np.mean(valid_vecs, axis=0)
    best_emotion, best_score = "neutral", -1
    for emotion, em_vec in emotion_vecs.items():
        score = cosine_similarity([cluster_mean], [em_vec])[0][0]
        if score > best_score:
            best_score = score
            best_emotion = emotion
    return best_emotion

def build_cluster_labels(cluster_map: dict, wv_model, emotion_vecs: dict) -> dict:
    """
    For each cluster in the cluster map, assign a meaningful emotion label
    using the Word2Vec-based similarity approach.
    """
    cluster_labels = {}
    for cluster, words in cluster_map.items():
        label = get_cluster_emotion_label(words, wv_model, emotion_vecs)
        cluster_labels[cluster] = label
    return cluster_labels


# Topic Classifier Functions

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

def train_topic_classifier(input_csv: str = DATASET_CSV, model_pickle: str = TOPIC_MODEL_PICKLE) -> dict:
    """
    Train a topic classifier using the cleaned review text to predict the review's category.
    Uses TF-IDF vectorization and Logistic Regression.
    """
    df = pd.read_csv(input_csv)
    if "review" not in df.columns or "category" not in df.columns:
        raise ValueError("CSV must contain 'review' and 'category' columns.")
    df["cleaned_review"] = df["review"].apply(clean_text)
    df.dropna(subset=["category"], inplace=True)
    X = df["cleaned_review"]
    y = df["category"]

    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    vectorizer_topic = TfidfVectorizer(max_features=5000)
    X_vec = vectorizer_topic.fit_transform(X)

    clf = LogisticRegression(max_iter=1000, random_state=42)
    clf.fit(X_vec, y_encoded)

    topic_model = {"vectorizer": vectorizer_topic, "classifier": clf, "label_encoder": le}
    with open(model_pickle, "wb") as f:
        pickle.dump(topic_model, f)
    print(f"Topic classifier saved to {model_pickle}")
    return topic_model


# Adorescore Model Functions


def train_adorescore_model(input_csv: str = DATASET_CSV, model_pickle: str = ADORESCORE_MODEL_PICKLE) -> tuple:
    """
    Train an XGBoost regression model (with hyperparameter tuning) using SentenceTransformer embeddings
    to predict the adorescore from cleaned reviews.
    """
    df = pd.read_csv(input_csv)
    if "review" not in df.columns or "adorescore" not in df.columns:
        raise ValueError("CSV must contain 'review' and 'adorescore' columns.")
    df["cleaned_review"] = df["review"].apply(clean_text)
    reviews = df["cleaned_review"].tolist()
    y = df["adorescore"].values.astype(float)

    embed_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    X_text = embed_model.encode(reviews, show_progress_bar=True)

    X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=42)

    param_grid = {
        "n_estimators": [100, 200],
        "learning_rate": [0.01, 0.05, 0.1],
        "max_depth": [3, 5]
    }
    reg = xgb.XGBRegressor(random_state=42)
    grid = GridSearchCV(reg, param_grid, cv=3, scoring="neg_mean_absolute_error", n_jobs=-1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    print("Best parameters for adorescore model:", grid.best_params_)

    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Adorescore Model Performance: MAE={mae:.2f}, R²={r2:.2f}")

    model_data = {"embedding_model": embed_model, "regressor": best_model}
    with open(model_pickle, "wb") as f:
        pickle.dump(model_data, f)
    print(f"Adorescore model saved to {model_pickle}")
    return best_model, embed_model

def main():
    # Load dataset
    df_updated = pd.read_csv(DATASET_CSV)
    print(f"Loaded dataset with {len(df_updated)} records.")

    # Build emotion clustering pipeline
    vectorizer, kmeans, cluster_map = build_emotion_clusters(df_updated, review_col="review")

    # Load Word2Vec model and build emotion vectors
    wv_model = load_word2vec_model()
    emotion_vecs = build_emotion_vectors(wv_model, EMOTION_CATEGORIES)

    # Build cluster labels
    cluster_labels = build_cluster_labels(cluster_map, wv_model, emotion_vecs)
    print("Final Cluster Emotion Labels:", cluster_labels)

    # Train adorescore model
    train_adorescore_model(input_csv=DATASET_CSV)

    # Train topic classifier
    train_topic_classifier(input_csv=DATASET_CSV)

if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loaded dataset with 1494 records.
Loading Word2Vec model...
Final Cluster Emotion Labels: {0: 'sadness', 6: 'sadness', 4: 'trust', 3: 'joy', 2: 'joy', 7: 'joy', 5: 'joy', 1: 'sadness'}


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

Best parameters for adorescore model: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200}
Adorescore Model Performance: MAE=0.08, R²=0.23
Adorescore model saved to /content/drive/MyDrive/adorescore_model.pkl
Topic classifier saved to /content/drive/MyDrive/topic_classifier.pkl


In [6]:
import pickle
import re
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from sentence_transformers import SentenceTransformer

# Download necessary NLTK resources
nltk.download("punkt", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("vader_lexicon", quiet=True)

# File paths for the pre-trained models
ADORESCORE_MODEL_PICKLE = "/content/drive/MyDrive/adorescore_model.pkl"
TOPIC_MODEL_PICKLE = "/content/drive/MyDrive/topic_classifier.pkl"

# Initialize NLTK objects
SIA = SentimentIntensityAnalyzer()

def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    tokens = word_tokenize(text)
    return " ".join(tokens)

def predict_adorescore(review_text: str, regressor, embed_model) -> float:
    cleaned = clean_text(review_text)
    X_emb = embed_model.encode([cleaned])
    score = regressor.predict(X_emb)[0]
    return score

def predict_topic(review_text: str, topic_model: dict) -> str:
    cleaned = clean_text(review_text)
    vec = topic_model["vectorizer"].transform([cleaned])
    pred = topic_model["classifier"].predict(vec)[0]
    topic_label = topic_model["label_encoder"].inverse_transform([pred])[0]
    return topic_label

def load_models():
    # Load adorescore model (regressor + embedding model)
    with open(ADORESCORE_MODEL_PICKLE, "rb") as f:
        adorescore_data = pickle.load(f)
    regressor = adorescore_data["regressor"]
    embed_model = adorescore_data["embedding_model"]

    # Load topic classifier model
    with open(TOPIC_MODEL_PICKLE, "rb") as f:
        topic_model = pickle.load(f)

    return regressor, embed_model, topic_model

def main():
    regressor, embed_model, topic_model = load_models()
    review_input = input("Enter your review: ")

    # Generate predictions
    adorescore = predict_adorescore(review_input, regressor, embed_model)
    topic = predict_topic(review_input, topic_model)
    overall_sentiment = TextBlob(review_input).sentiment.polarity

    # Display results
    print("\n--- Prediction Results ---")
    print(f"Predicted AdoreScore: {adorescore:.2f}")
    print(f"Overall Sentiment: {overall_sentiment:.2f}")
    print(f"Predicted Topic/Category: {topic}")

    # Optionally, save results to CSV
    pred_df = pd.DataFrame([{
        "review": review_input,
        "predicted_adorescore": adorescore,
        "overall_sentiment": overall_sentiment,
        "predicted_topic": topic
    }])
    pred_df.to_csv("/content/drive/MyDrive/final_predictions.csv", index=False)
    print("Prediction saved to final_predictions.csv")

if __name__ == "__main__":
    main()


Enter your review: The pizza was absolutely delicious! The crust was perfectly crispy, the cheese was gooey, and the toppings were fresh. I’ll definitely order again!

--- Prediction Results ---
Predicted AdoreScore: 0.70
Overall Sentiment: 0.57
Predicted Topic/Category: Grocery & Gourmet Food
Prediction saved to final_predictions.csv
