<a href="https://colab.research.google.com/github/sonupatel24/Pharmacy-review-sentiment-analysis/blob/main/Text_Classification_(LSTM%2CTransformers).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import warnings
from transformers import pipeline
from tqdm import tqdm
import numpy as np

warnings.filterwarnings('ignore')

In [2]:
# STEP 1: Load your Excel file
# ==============================
def load_data(file_path):
    """/content/PharmEasy_dataset.xlsx"""
    try:
        df = pd.read_excel(file_path)
        df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]
        print(f"✅ Loaded {len(df)} reviews successfully")
        return df
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
        return None
    except Exception as e:
        print(f"❌ Error loading file: {e}")
        return None

In [3]:
# STEP 2: Enhanced Category Keywords
# ==============================
category_keywords = {
    "Customer Care and Support": [
        "support", "customer care", "customer support", "support team",
        "helpline", "service team", "representative", "staff", "response"
    ],
    "Delivery": [
        "delivery", "deliver", "delay", "shipping", "courier",
        "delivered", "late", "on time", "fast delivery", "slow delivery",
        "tracking", "package", "parcel"
    ],
    "Application Experience": [
        "app", "interface", "crash", "bug", "service", "website",
        "navigation", "ui", "ux", "user experience", "loading",
        "slow", "fast", "easy to use", "difficult", "glitch"
    ],
    "Price": [
        "price", "cost", "expensive", "discount", "costly", "pricing",
        "overpriced", "fees", "billing", "charge", "cheap", "affordable",
        "value", "money", "payment", "mrp", "rate"
    ],
    "Refund Policy": [
        "refund", "return", "refund policy", "replacement", "money back",
        "return policy", "refunded", "cancellation", "cancel", "exchange",
        "reimbursement", "claim"
    ],
    "Stock Availability": [
        "stock", "out of stock", "availability", "not available",
        "no stock", "unavailable", "in stock", "shortage", "supply"
    ],
    "Product Quality": [
        "quality", "genuine", "fake", "authentic", "expired", "expiry",
        "fresh", "packaging", "condition", "damaged", "broken", "sealed"
    ],
    "Overall Experience": [
        "experience", "satisfied", "happy", "disappointed", "recommend",
        "terrible", "excellent", "good", "bad", "worst", "best", "amazing"
    ]
}


In [4]:
# STEP 3: Enhanced Text Cleaning
# ==============================
def clean_text(text):
    """Clean and normalize text"""
    if pd.isna(text):
        return ""

    text = str(text).lower()
    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    # Remove special characters but keep spaces and basic punctuation
    text = re.sub(r"[^a-z0-9\s.,!?]", " ", text)
    # Remove extra whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [5]:
# STEP 4: Load Sentiment Model with Error Handling
# ==============================
def load_sentiment_model():
    """Load sentiment analysis model with error handling"""
    try:
        sentiment_analyzer = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
            device=-1  # Use CPU (-1) or GPU (0)
        )
        print("✅ Sentiment model loaded successfully")
        return sentiment_analyzer
    except Exception as e:
        print(f"❌ Error loading sentiment model: {e}")
        return None

In [6]:
# STEP 5: Enhanced Aspect Detection with Context
# ==============================
def analyze_review_strict(text, sentiment_analyzer):
    """
    Analyze review for aspects and sentiment with strict keyword matching
    """
    result = {}

    for category, keywords in category_keywords.items():
        found = []
        matched_sentences = []

        for kw in keywords:
            # Regex for whole word matching (case insensitive)
            pattern = r'\b' + re.escape(kw) + r'\b'
            if re.search(pattern, text, re.IGNORECASE):
                found.append(kw)

                # Extract sentence containing the keyword for better context
                sentences = text.split('.')
                for sent in sentences:
                    if re.search(pattern, sent, re.IGNORECASE):
                        matched_sentences.append(sent.strip())

        if found:
            # Use matched sentences or keywords for sentiment analysis
            text_for_sentiment = " ".join(matched_sentences[:3]) if matched_sentences else " ".join(found)

            try:
                sentiment = sentiment_analyzer(text_for_sentiment[:512])[0]  # Limit to 512 tokens
                label = sentiment["label"]
                confidence = sentiment["score"]
                vector = 1 if label == "POSITIVE" else -1

                result[category] = {
                    "keywords": ", ".join(set(found)),  # Remove duplicates
                    "vector": vector,
                    "confidence": round(confidence, 2),
                    "sentiment": label
                }
            except Exception as e:
                result[category] = {
                    "keywords": ", ".join(set(found)),
                    "vector": "",
                    "confidence": "",
                    "sentiment": "ERROR"
                }
        else:
            result[category] = {
                "keywords": "",
                "vector": "",
                "confidence": "",
                "sentiment": ""
            }

    return result


In [7]:
# STEP 6: Process All Reviews with Progress Bar
# ==============================
def process_reviews(df, sentiment_analyzer):
    """Process all reviews with progress tracking"""
    records = []

    print("\n📊 Processing reviews...")
    for review in tqdm(df["clean_text"], desc="Analyzing"):
        res = analyze_review_strict(review, sentiment_analyzer)
        records.append(res)

    return records

In [8]:
# STEP 7: Create Final DataFrame
# ==============================
def create_final_dataframe(df, records):
    """Create final dataframe with all columns"""
    final_rows = []

    for i, r in enumerate(records):
        row = {"Reviews": df.loc[i, "reviews"]}

        for cat in category_keywords.keys():
            row[f"{cat}_Keywords"] = r[cat]["keywords"]
            row[f"{cat}_Sentiment"] = r[cat]["sentiment"]
            row[f"{cat}_Vector"] = r[cat]["vector"]
            row[f"{cat}_Confidence"] = r[cat]["confidence"]

        final_rows.append(row)

    final_df = pd.DataFrame(final_rows)
    return final_df

In [9]:
# STEP 8: Generate Summary Statistics
# ==============================
def generate_summary(final_df):
    """Generate summary statistics"""
    print("\n" + "="*50)
    print("📈 SENTIMENT ANALYSIS SUMMARY")
    print("="*50)

    for cat in category_keywords.keys():
        vector_col = f"{cat}_Vector"
        if vector_col in final_df.columns:
            vectors = final_df[vector_col].replace("", np.nan).dropna()
            if len(vectors) > 0:
                positive = (vectors == 1).sum()
                negative = (vectors == -1).sum()
                total = len(vectors)

                print(f"\n{cat}:")
                print(f"  ✅ Positive: {positive} ({positive/total*100:.1f}%)")
                print(f"  ❌ Negative: {negative} ({negative/total*100:.1f}%)")
                print(f"  📊 Total mentions: {total}")


In [10]:

# MAIN EXECUTION
# ==============================
def main():
    """Main execution function"""

    # Load data
    df = load_data("/content/PharmEasy_dataset.xlsx")
    if df is None:
        return

    # Check if 'reviews' column exists
    if 'reviews' not in df.columns:
        print(f"❌ 'reviews' column not found. Available columns: {df.columns.tolist()}")
        return

    # Clean text
    print("\n🧹 Cleaning text...")
    df["clean_text"] = df["reviews"].apply(clean_text)

    # Load sentiment model
    sentiment_analyzer = load_sentiment_model()
    if sentiment_analyzer is None:
        return

    # Process reviews
    records = process_reviews(df, sentiment_analyzer)

    # Create final dataframe
    print("\n📋 Creating final dataframe...")
    final_df = create_final_dataframe(df, records)

    # Export results
    output_file = "aspect_sentiment_analysis_results.xlsx"
    final_df.to_excel(output_file, index=False)
    print(f"\n✅ Results exported to: {output_file}")

    # Generate summary
    generate_summary(final_df)

    return final_df, sentiment_analyzer

# ==============================
# TEST FUNCTION
# ==============================
def test_single_review(sentiment_analyzer):
    """Test with a single review"""
    test_review = """I recently purchased a pharmaceutical product from this platform,
    and unfortunately, my experience was disappointing. While the listed price for the product was ₹80,
    the final billing showed an inflated price of ₹100. On top of that, there were additional platform fees,
    which made the overall cost even higher than expected."""

    cleaned_review = clean_text(test_review)
    result = analyze_review_strict(cleaned_review, sentiment_analyzer)

    print("\n" + "="*50)
    print("🧪 TEST REVIEW ANALYSIS")
    print("="*50)
    print(f"\nOriginal Review: {test_review}\n")

    for category, data in result.items():
        if data["keywords"]:
            print(f"\n{category}:")
            print(f"  Keywords: {data['keywords']}")
            print(f"  Sentiment: {data['sentiment']}")
            print(f"  Vector: {data['vector']}")
            print(f"  Confidence: {data['confidence']}")

# ==============================
# RUN THE ANALYSIS
# ==============================
if __name__ == "__main__":
    final_df, sentiment_analyzer = main()

    # Test with single review
    if sentiment_analyzer is not None:
        test_single_review(sentiment_analyzer)

✅ Loaded 1580 reviews successfully

🧹 Cleaning text...


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


✅ Sentiment model loaded successfully

📊 Processing reviews...


Analyzing: 100%|██████████| 1580/1580 [06:20<00:00,  4.15it/s]



📋 Creating final dataframe...

✅ Results exported to: aspect_sentiment_analysis_results.xlsx

📈 SENTIMENT ANALYSIS SUMMARY

Customer Care and Support:
  ✅ Positive: 91 (18.9%)
  ❌ Negative: 391 (81.1%)
  📊 Total mentions: 482

Delivery:
  ✅ Positive: 111 (12.9%)
  ❌ Negative: 749 (87.1%)
  📊 Total mentions: 860

Application Experience:
  ✅ Positive: 153 (14.3%)
  ❌ Negative: 920 (85.7%)
  📊 Total mentions: 1073

Price:
  ✅ Positive: 70 (13.1%)
  ❌ Negative: 463 (86.9%)
  📊 Total mentions: 533

Refund Policy:
  ✅ Positive: 36 (8.7%)
  ❌ Negative: 376 (91.3%)
  📊 Total mentions: 412

Stock Availability:
  ✅ Positive: 10 (3.7%)
  ❌ Negative: 257 (96.3%)
  📊 Total mentions: 267

Product Quality:
  ✅ Positive: 19 (13.5%)
  ❌ Negative: 122 (86.5%)
  📊 Total mentions: 141

Overall Experience:
  ✅ Positive: 160 (19.1%)
  ❌ Negative: 677 (80.9%)
  📊 Total mentions: 837

🧪 TEST REVIEW ANALYSIS

Original Review: I recently purchased a pharmaceutical product from this platform,
    and unfortunat

In [11]:
!pip install streamlit
!pip install pyngrok

Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.50.0
Collecting pyngrok
  Downloading pyngrok-7.4.1-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.4.1-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.4.1


In [44]:
# save_and_run.py - Run this to create app.py and launch Streamlit

import os

app_code = '''# app.py
import streamlit as st
from transformers import pipeline
import pandas as pd
import re

st.set_page_config(page_title="Review Analyzer", page_icon="💊", layout="wide")

# Category keywords
CATEGORIES = {
    "Customer Support": ["support", "customer care", "help"],
    "Delivery": ["delivery", "deliver", "delay", "shipping"],
    "App Experience": ["app", "interface", "crash", "bug"],
    "Price": ["price", "cost", "expensive", "discount"],
    "Refund": ["refund", "return", "cancel"],
    "Stock": ["stock", "available", "out of stock"],
    "Quality": ["quality", "genuine", "fake", "expired"],
}

@st.cache_resource
def load_model():
    return pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9\\s]", " ", text)
    return re.sub(r"\\s+", " ", text).strip()

def analyze(text, model):
    text = clean_text(text)
    result = {}

    for cat, keywords in CATEGORIES.items():
        found = [kw for kw in keywords if re.search(r'\\b' + kw + r'\\b', text)]

        if found:
            sentiment = model(" ".join(found))[0]
            result[cat] = {
                "sentiment": sentiment["label"],
                "confidence": f"{sentiment['score']:.1%}",
                "keywords": ", ".join(found)
            }

    return result

# Main UI
st.title("💊 Review Sentiment Analyzer")

model = load_model()

tab1, tab2 = st.tabs(["Single Review", "Batch Analysis"])

# Single Review Tab
with tab1:
    review = st.text_area("Enter review:", height=100)

    if st.button("Analyze", type="primary"):
        if review:
            result = analyze(review, model)

            if result:
                st.subheader("Results:")
                for cat, data in result.items():
                    with st.expander(f"{cat} - {data['sentiment']}"):
                        st.write(f"**Confidence:** {data['confidence']}")
                        st.write(f"**Keywords:** {data['keywords']}")
            else:
                st.warning("No aspects detected!")
        else:
            st.warning("Please enter a review")

# Batch Analysis Tab
with tab2:
    file = st.file_uploader("Upload Excel (must have 'Reviews' column)", type=['xlsx'])

    if file and st.button("Start Analysis"):
        df = pd.read_excel(file)
        df.columns = [c.strip().lower() for c in df.columns]

        if 'reviews' not in df.columns:
            st.error("'Reviews' column not found!")
        else:
            progress = st.progress(0)
            results = []

            for i, review in enumerate(df['reviews']):
                progress.progress((i + 1) / len(df))
                res = analyze(review, model)

                row = {"Review": review}
                for cat in CATEGORIES.keys():
                    if cat in res:
                        row[f"{cat}_Sentiment"] = res[cat]["sentiment"]
                        row[f"{cat}_Keywords"] = res[cat]["keywords"]
                    else:
                        row[f"{cat}_Sentiment"] = ""
                        row[f"{cat}_Keywords"] = ""
                results.append(row)

            results_df = pd.DataFrame(results)

            st.success(f"✅ Analyzed {len(df)} reviews!")

            # Summary
            st.subheader("Summary")
            for cat in CATEGORIES.keys():
                col = f"{cat}_Sentiment"
                if col in results_df.columns:
                    pos = (results_df[col] == "POSITIVE").sum()
                    neg = (results_df[col] == "NEGATIVE").sum()
                    if pos + neg > 0:
                        st.write(f"**{cat}:** ✅ {pos} Positive | ❌ {neg} Negative")

            # Download
            st.download_button(
                "📥 Download Results",
                results_df.to_csv(index=False),
                "results.csv",
                "text/csv"
            )

            st.dataframe(results_df.head(10))
'''

# Save to app.py
with open('app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)


In [45]:
!streamlit run app.py &>/content/logs.txt &

In [46]:
from pyngrok import ngrok
ngrok.set_auth_token("2tqLnUwPAU9gjGXqxNYummeJsa5_Hvkmbru17fhPW3pDpP4r")

In [47]:
from pyngrok import ngrok
public_url = ngrok.connect(8501)
print(f"streamlit App is live at: {public_url}")

streamlit App is live at: NgrokTunnel: "https://f9e4bda9fbe4.ngrok-free.app" -> "http://localhost:8501"


In [None]:
!pkill -f ngrok