In [1]:
import pandas as pd

In [None]:
# df=pd.read_csv('combined_comments.csv')
# df

In [3]:
df = pd.read_csv("comment_data_with_sentimentam.csv")
df

Unnamed: 0,Document ID,Comment,Sentiment
0,CDC-2025-0454-0002,The new restrictions on COVID vaccines only ma...,Positive
1,CDC-2025-0454-0003,Please follow the science and not politics. Th...,Positive
2,CDC-2025-0454-0004,"Each year on July 28, we come together to reco...",Positive
3,CDC-2025-0454-0005,The ACIP must uphold evidence-based immunizati...,Positive
4,CDC-2025-0454-0006,Vaccines are proven science. The hysteria that...,Positive
...,...,...,...
28751,ED-2025-OPE-0016-7220,"Secretary of Education Linda McMahon,\n\nDear ...",Negative
28752,ED-2025-OPE-0016-DRAFT-7210,Hello.\n\nAs a public servant who works with A...,Negative
28753,ED-2025-OPE-0016-DRAFT-7211,Thank you for the opportunity to present. I am...,Positive
28754,ED-2025-OPE-0016-DRAFT-7212,"To Whom it may concern,\n\nStudent loans had b...",Negative


In [4]:
import pandas as pd
import requests
import time
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np
import tiktoken
import os
import warnings
warnings.filterwarnings('ignore')

# --- Setup Environment Variable Automatically ---
os.environ["GROQ_API_KEY"] = "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"

# Download NLTK data if needed
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# --- Load Dataset ---
try:
    df = pd.read_csv("comment_data_by_docket_tfidf_top5.csv")
    df['Comment'] = df['Comment'].fillna('').astype(str)
    print(f"✅ Dataset loaded: {len(df)} rows")
    print(f"📊 Sentiments: {df['Sentiment'].value_counts().to_dict()}")
except FileNotFoundError:
    print("❌ Error: comment_data_by_docket_tfidf_top5.csv not found")
    exit()

# --- API Config ---
API_URL = "https://api.groq.com/openai/v1/chat/completions"
API_KEY = os.getenv("GROQ_API_KEY")
if not API_KEY:
    raise ValueError("❌ Please set your GROQ_API_KEY environment variable")

HEADERS = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json"
}

# --- Tokenizer ---
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")


class HybridSummarizer:
    def __init__(self):
        self.api_calls_made = 0

    def count_tokens(self, text):
        return len(enc.encode(text))

    def create_token_batches(self, comments, max_tokens=2000):
        """Split comments into batches based on token count"""
        batches, current_batch, current_tokens = [], [], 0
        for comment in comments:
            tokens = self.count_tokens(comment)
            if current_tokens + tokens > max_tokens and current_batch:
                batches.append(current_batch)
                current_batch, current_tokens = [], 0
            current_batch.append(comment)
            current_tokens += tokens
        if current_batch:
            batches.append(current_batch)
        return batches

    def extract_key_sentences(self, comments, top_k=40):
        all_sentences = []
        for comment in comments:
            try:
                sentences = sent_tokenize(str(comment))
                meaningful = [s.strip() for s in sentences if len(s.split()) >= 8]
                all_sentences.extend(meaningful)
            except Exception:
                continue

        if len(all_sentences) <= top_k:
            return all_sentences

        try:
            vectorizer = TfidfVectorizer(stop_words='english', max_features=1000, ngram_range=(1, 2), min_df=2)
            tfidf_matrix = vectorizer.fit_transform(all_sentences)
            scores = np.array(tfidf_matrix.sum(axis=1)).flatten()
            top_idx = scores.argsort()[-top_k:][::-1]
            return [all_sentences[i] for i in top_idx]
        except Exception as e:
            print(f"  ⚠️ TF-IDF failed: {e}")
            return all_sentences[:top_k]

    def create_extractive_summary(self, comments, top_k=40):
        key_sentences = self.extract_key_sentences(comments, top_k=top_k)
        if not key_sentences:
            return "No meaningful content."
        try:
            if len(key_sentences) > 10:
                vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
                sentence_vectors = vectorizer.fit_transform(key_sentences)
                n_clusters = min(5, len(key_sentences) // 5)
                kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
                clusters = kmeans.fit_predict(sentence_vectors)

                clustered = {}
                for i, cid in enumerate(clusters):
                    clustered.setdefault(cid, []).append(key_sentences[i])

                parts = []
                for cid, sentences in clustered.items():
                    parts.append(" ".join(sentences[:3]))
                return " ".join(parts)
            else:
                return " ".join(key_sentences)
        except Exception as e:
            print(f"  ⚠️ Clustering failed: {e}")
            return " ".join(key_sentences[:20])

    def polish_with_api(self, text, sentiment_name, total_comments, max_chunk_tokens=2000, max_retries=5):
        """Polish summary with Groq API using token-based chunks"""
        if len(text) < 100:
            return text

        polished_parts = []
        sentences = sent_tokenize(text)
        current_chunk, current_tokens = [], 0

        for sentence in sentences:
            tokens = self.count_tokens(sentence)
            if current_tokens + tokens > max_chunk_tokens and current_chunk:
                polished_parts.append(self._api_request(" ".join(current_chunk), sentiment_name, total_comments, max_retries))
                current_chunk, current_tokens = [], 0
            current_chunk.append(sentence)
            current_tokens += tokens

        if current_chunk:
            polished_parts.append(self._api_request(" ".join(current_chunk), sentiment_name, total_comments, max_retries))

        return "\n".join(polished_parts)

    def _api_request(self, text_chunk, sentiment_name, total_comments, max_retries):
        retries = 0
        while retries < max_retries:
            try:
                prompt = f"""
Here is a part of the summary for {total_comments} {sentiment_name.lower()} sentiment comments:

{text_chunk}

Please polish this part into a structured summary with clear professional language.
"""
                payload = {
                    "model": "llama-3.3-70b-versatile",
                    "messages": [
                        {"role": "system", "content": "You are a professional summarizer."},
                        {"role": "user", "content": prompt}
                    ],
                    "temperature": 0.2,
                    "max_tokens": 800
                }

                response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=60)
                response.raise_for_status()
                result = response.json()
                self.api_calls_made += 1
                return result['choices'][0]['message']['content'].strip()
            except requests.exceptions.HTTPError as e:
                if response.status_code == 429:  # rate limit
                    wait_time = 5 + retries * 2
                    print(f"  ⚠️ API limit reached. Waiting {wait_time}s before retry...")
                    time.sleep(wait_time)
                    retries += 1
                else:
                    print(f"  ⚠️ API polish failed: {e}")
                    return text_chunk
            except Exception as e:
                print(f"  ⚠️ API polish failed: {e}")
                return text_chunk
        return text_chunk

    def summarize_sentiment(self, group, sentiment_name):
        comments = group['Comment'].tolist()
        total_chars = sum(len(c) for c in comments)
        total_tokens = sum(self.count_tokens(c) for c in comments)
        print(f"🎯 {sentiment_name}: {len(comments)} comments, {total_chars} chars, {total_tokens} tokens")

        # Step 1: extractive summary ~10% of total characters
        top_chars_count = max(1, total_chars // 10)
        extractive_summary = self.create_extractive_summary(comments, top_k=top_chars_count)
        print(f"  🔹 Extractive summary length: {len(extractive_summary)} chars (~10%)")

        # Step 2: batching for hierarchical extractive summary
        batches = self.create_token_batches(comments, max_tokens=2000)
        print(f"  🔹 Split into {len(batches)} token-based batches")

        batch_summaries = []
        for i, batch in enumerate(batches, 1):
            batch_summary = self.create_extractive_summary(batch, top_k=40)
            batch_summaries.append(batch_summary)
            print(f"    ✅ Batch {i} summary: {len(batch_summary)} chars")

        merged_summary = " ".join(batch_summaries)
        print(f"  🔸 Merged length: {len(merged_summary)} chars")

        # Step 3: polish with API using token chunks
        final_summary = self.polish_with_api(merged_summary, sentiment_name, len(comments))
        print(f"  🎨 Final summary: {len(final_summary)} chars")

        return {
            "Sentiment": sentiment_name,
            "Total_Comments": len(comments),
            "Total_Characters": total_chars,
            "Total_Tokens": total_tokens,
            "Extractive_10pct": extractive_summary,
            "Final_Summary": final_summary
        }


# --- Main ---
def main():
    summarizer = HybridSummarizer()
    results = []

    print(f"\n🚀 Starting hierarchical summarization...")
    for sentiment, group in tqdm(df.groupby('Sentiment'), desc="Processing Sentiments"):
        summary_data = summarizer.summarize_sentiment(group, sentiment)
        results.append(summary_data)

    out_df = pd.DataFrame(results)
    out_df.to_csv("hierarchical_sentiment_summaries.csv", index=False)

    print("\n✅ Summarization complete!")
    print(f"📞 API calls: {summarizer.api_calls_made}")
    print("📁 Output saved: hierarchical_sentiment_summaries.csv")
    return out_df


if __name__ == "__main__":
    results = main()
 

✅ Dataset loaded: 1436 rows
📊 Sentiments: {'Neutral': 497, 'Negative': 478, 'Positive': 461}

🚀 Starting hierarchical summarization...


Processing Sentiments:   0%|          | 0/3 [00:00<?, ?it/s]

🎯 Negative: 478 comments, 1734608 chars, 314543 tokens
  🔹 Extractive summary length: 2768 chars (~10%)
  🔹 Split into 191 token-based batches
    ✅ Batch 1 summary: 3334 chars
    ✅ Batch 2 summary: 3194 chars
    ✅ Batch 3 summary: 3116 chars
    ✅ Batch 4 summary: 3275 chars
    ✅ Batch 5 summary: 3280 chars
    ✅ Batch 6 summary: 3111 chars
    ✅ Batch 7 summary: 3455 chars
    ✅ Batch 8 summary: 3349 chars
    ✅ Batch 9 summary: 2862 chars
    ✅ Batch 10 summary: 2571 chars
    ✅ Batch 11 summary: 3716 chars
    ✅ Batch 12 summary: 3481 chars
    ✅ Batch 13 summary: 2860 chars
    ✅ Batch 14 summary: 2707 chars
    ✅ Batch 15 summary: 3130 chars
    ✅ Batch 16 summary: 2846 chars
    ✅ Batch 17 summary: 2994 chars
    ✅ Batch 18 summary: 3056 chars
    ✅ Batch 19 summary: 3051 chars
    ✅ Batch 20 summary: 2798 chars
    ✅ Batch 21 summary: 3192 chars
    ✅ Batch 22 summary: 2825 chars
    ✅ Batch 23 summary: 3272 chars
    ✅ Batch 24 summary: 3012 chars
    ✅ Batch 25 summary: 30

Processing Sentiments:  33%|███▎      | 1/3 [41:45<1:23:31, 2505.89s/it]

  🎨 Final summary: 573943 chars
🎯 Neutral: 497 comments, 275782 chars, 53386 tokens
  🔹 Extractive summary length: 2387 chars (~10%)
  🔹 Split into 29 token-based batches
    ✅ Batch 1 summary: 3261 chars
    ✅ Batch 2 summary: 3038 chars
    ✅ Batch 3 summary: 2925 chars
    ✅ Batch 4 summary: 4094 chars
    ✅ Batch 5 summary: 3472 chars
    ✅ Batch 6 summary: 3532 chars
    ✅ Batch 7 summary: 2860 chars
    ✅ Batch 8 summary: 2872 chars
    ✅ Batch 9 summary: 3405 chars
    ✅ Batch 10 summary: 2491 chars
    ✅ Batch 11 summary: 2266 chars
    ✅ Batch 12 summary: 2594 chars
    ✅ Batch 13 summary: 3137 chars
    ✅ Batch 14 summary: 1760 chars
    ✅ Batch 15 summary: 1787 chars
    ✅ Batch 16 summary: 3040 chars
    ✅ Batch 17 summary: 2547 chars
    ✅ Batch 18 summary: 2047 chars
    ✅ Batch 19 summary: 2392 chars
    ✅ Batch 20 summary: 3193 chars
    ✅ Batch 21 summary: 2480 chars
    ✅ Batch 22 summary: 1945 chars
    ✅ Batch 23 summary: 2481 chars
    ✅ Batch 24 summary: 2444 char

Processing Sentiments:  67%|██████▋   | 2/3 [47:18<20:27, 1227.58s/it]  

  🎨 Final summary: 73693 chars
🎯 Positive: 461 comments, 1956181 chars, 370309 tokens
  🔹 Extractive summary length: 3078 chars (~10%)
  🔹 Split into 215 token-based batches
    ✅ Batch 1 summary: 3724 chars
    ✅ Batch 2 summary: 3821 chars
    ✅ Batch 3 summary: 3811 chars
    ✅ Batch 4 summary: 3824 chars
    ✅ Batch 5 summary: 3816 chars
    ✅ Batch 6 summary: 3843 chars
    ✅ Batch 7 summary: 3725 chars
    ✅ Batch 8 summary: 3659 chars
    ✅ Batch 9 summary: 4002 chars
    ✅ Batch 10 summary: 3944 chars
    ✅ Batch 11 summary: 3709 chars
    ✅ Batch 12 summary: 3780 chars
    ✅ Batch 13 summary: 3816 chars
    ✅ Batch 14 summary: 3703 chars
    ✅ Batch 15 summary: 3824 chars
    ✅ Batch 16 summary: 3824 chars
    ✅ Batch 17 summary: 3834 chars
    ✅ Batch 18 summary: 3824 chars
    ✅ Batch 19 summary: 3740 chars
    ✅ Batch 20 summary: 3818 chars
    ✅ Batch 21 summary: 3784 chars
    ✅ Batch 22 summary: 3824 chars
    ✅ Batch 23 summary: 3514 chars
    ✅ Batch 24 summary: 3824 c

Processing Sentiments: 100%|██████████| 3/3 [1:49:20<00:00, 2186.69s/it]

  🎨 Final summary: 784671 chars

✅ Summarization complete!
📞 API calls: 4
📁 Output saved: hierarchical_sentiment_summaries.csv





In [5]:
# Calculate total characters for each sentiment
df['Comment_Length'] = df['Comment'].astype(str).apply(len)

# Group by sentiment and sum the characters
char_summary = df.groupby('Sentiment')['Comment_Length'].sum().reset_index()

char_summary.rename(columns={'Comment_Length': 'Total_Characters'}, inplace=True)

print(char_summary)


  Sentiment  Total_Characters
0  Negative           1734608
1   Neutral            275782
2  Positive           1956181


In [6]:
dfs= pd.read_csv("hierarchical_sentiment_summaries.csv")
dfs

Unnamed: 0,Sentiment,Total_Comments,Total_Characters,Total_Tokens,Extractive_10pct,Final_Summary
0,Negative,478,1734608,314543,ICEB-2025-0001: A Counterproductive Policy Har...,Unnecessary and Overly Broad Solution to Perce...
1,Neutral,497,275782,53386,I appreciate the Department’s goal of clarity ...,in Computer Science to enroll in a second mast...
2,Positive,461,1956181,370309,As an infectious diseases health care professi...,It is critical that the Centers for Disease Co...
