## Use Case 1 — Customer Sentiment Analysis (Reviews + Tweets)

In [1]:
import pandas as pd
from collections import Counter
import re

# Load dataset
# The path is written as a raw string: a Windows path in a normal literal
# contains backslash sequences ("\S", "\T", "\C", "\e") that are invalid
# escapes and raise a SyntaxWarning on recent Python versions. The resulting
# path value is unchanged.
df = pd.read_excel(r"C:\Semester 3\Text, Web & Sen Analytics\Capston_Project-Priya_Saini(24MBMB34)\ecom_analytics_data.xlsx")



# Preview of the first 20 rows (in a notebook this is only rendered when it is
# the last expression of the cell).
df.head(20)

# 1. Create sentiment from rating
def rating_to_sentiment(r):
    """Map a numeric rating to a coarse sentiment label.

    Args:
        r: Numeric rating value.

    Returns:
        "negative" for ratings of 2 or less, "neutral" for ratings above 2 up
        to and including 3 (and for missing/NaN ratings), "positive" otherwise.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing rating would fall through and be labelled "positive".
    if r != r:
        return "neutral"
    if r <= 2:
        return "negative"
    # Range test instead of `== 3` so fractional ratings such as 2.5 are not
    # silently treated as positive.
    if r <= 3:
        return "neutral"
    return "positive"

# Label every row with the sentiment implied by its rating.
df['rating_sentiment'] = df['review_rating'].map(rating_to_sentiment)

# 2. Simple lexicon-based text sentiment (tiny lexicon)
positive_terms = {
    "great", "excellent", "fast delivery", "fast", "highly recommend",
    "satisfied", "love", "worth",
}
negative_terms = {
    "poor", "late", "damaged", "not as described", "disappointed", "worst",
    "bad", "return", "late delivery", "packing",
}


def _count_lexicon_hits(terms, text):
    """Count how many entries of *terms* occur in *text* as whole words/phrases."""
    # Lookarounds require a non-word character (or string edge) on both sides,
    # so "worth" does not fire on "worthless" and "love" does not fire on
    # "gloves". Inflected forms ("returned", "loved") are not matched either.
    return sum(
        1
        for term in terms
        if re.search(r"(?<!\w)" + re.escape(term) + r"(?!\w)", text)
    )


def text_sentiment(text):
    """Classify free text as "positive", "negative" or "neutral".

    The text is stringified and lower-cased, then the number of distinct
    positive and negative lexicon entries present is compared. Each entry is
    counted at most once; overlapping entries (e.g. "fast" and
    "fast delivery") each count.

    Args:
        text: Any value; it is converted with ``str()``, so missing values
            (None/NaN) become "none"/"nan" and score as neutral.

    Returns:
        "positive" if more positive than negative entries match, "negative"
        for the reverse, and "neutral" on a tie (including no matches).
    """
    t = str(text).lower()
    pos = _count_lexicon_hits(positive_terms, t)
    neg = _count_lexicon_hits(negative_terms, t)
    if pos > neg:
        return "positive"
    if neg > pos:
        return "negative"
    return "neutral"

# Run the lexicon classifier over each free-text column and store the label
# in its companion "<column>_sentiment" column.
for text_col, sentiment_col in (
    ('review_text', 'review_text_sentiment'),
    ('tweet_text', 'tweet_text_sentiment'),
):
    df[sentiment_col] = df[text_col].apply(text_sentiment)

# 3. Quick metrics
# Share of rows per label (normalize=True yields proportions, not counts).
rating_share = df['rating_sentiment'].value_counts(normalize=True)
review_text_share = df['review_text_sentiment'].value_counts(normalize=True)
print("Rating sentiment distribution:\n", rating_share)
print("\nReview text sentiment distribution:\n", review_text_share)

# 4. Top negative words in negative reviews (simple tokenization)
# Concatenate every review the lexicon classifier marked negative into one
# lower-cased string, then count alphabetic tokens of three or more letters.
negative_review_mask = df['review_text_sentiment'] == "negative"
negative_review_texts = df.loc[negative_review_mask, 'review_text'].astype(str)
neg_reviews = " ".join(negative_review_texts.tolist()).lower()
word_pattern = r"\b[a-z]{3,}\b"
tokens = re.findall(word_pattern, neg_reviews)
token_counts = Counter(tokens)
top_neg = token_counts.most_common(20)
# Only the first ten of the twenty retained tokens are printed.
print("\nTop tokens in negative reviews:", top_neg[:10])

# 5. Flag at-risk users (negative review OR negative tweet OR rating 1-2)
# A row is at risk when any of the three sentiment columns reads "negative".
sentiment_columns = ['rating_sentiment', 'review_text_sentiment', 'tweet_text_sentiment']
df['is_at_risk'] = df[sentiment_columns].eq("negative").any(axis=1)

# Count at-risk rows per user and list the most frequently flagged users first.
at_risk_rows = df[df['is_at_risk']]
rows_per_user = at_risk_rows.groupby('user_id').size()
at_risk_users = rows_per_user.sort_values(ascending=False)
print("\nTop at-risk users:\n", at_risk_users.head())


  df = pd.read_excel("C:\Semester 3\Text, Web & Sen Analytics\Capston_Project-Priya_Saini(24MBMB34)\ecom_analytics_data.xlsx")


Rating sentiment distribution:
 rating_sentiment
positive    0.70
neutral     0.18
negative    0.12
Name: proportion, dtype: float64

Review text sentiment distribution:
 review_text_sentiment
positive    0.61
neutral     0.27
negative    0.12
Name: proportion, dtype: float64

Top tokens in negative reviews: [('will', 6), ('return', 6), ('bad', 5), ('experience', 5), ('contacted', 4), ('support', 4), ('very', 3), ('poor', 3), ('quality', 3), ('not', 3)]

Top at-risk users:
 user_id
U212    3
U217    2
U211    2
U238    2
U234    2
dtype: int64
