This is our clustering agent, which summarizes similar posts to reduce data noise. It groups similar posts into clusters based on the similarity of their tags, location, time of day, and sentiment.

In [None]:
!pip install google-cloud-firestore

In [None]:
#cluster agent
from google.cloud import firestore
from datetime import datetime, timedelta
import pytz
import requests
import json
import uuid

# --- Config ---
# Firestore client shared by the fetch and save steps below. It is constructed
# when this cell/module is executed, not lazily.
DB = firestore.Client()
# Collection the raw posts are read from.
SOURCE_COLLECTION_NAME = "instagram_events_trial1"
# Collection the generated clusters are written to (its previous contents are
# cleared by save_clusters_to_firestore before new clusters are inserted).
CLUSTER_COLLECTION_NAME = "cl7"

# NOTE(review): "api" looks like a placeholder — a real key has to be supplied
# before the Gemini call can succeed. Avoid committing a real key here.
GEMINI_API_KEY = "api"
# The key is interpolated into the query string, so this URL must be treated as
# a secret too (do not print or log it).
GEMINI_API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent?key={GEMINI_API_KEY}"

# --- 1. Fetch from Firestore ---
def fetch_data_from_firestore(end_time_utc: "datetime | None" = None,
                              window: timedelta = timedelta(hours=1)) -> list:
    """Fetch source documents whose ``timestamp`` lies inside a time window.

    The window is ``[end_time_utc - window, end_time_utc]`` (both inclusive).

    Args:
        end_time_utc: End of the window. Defaults to the fixed reference
            instant 2025-07-20 09:00:00 UTC that this notebook was written
            against. A naive datetime is interpreted as UTC; an aware one is
            converted to UTC.
        window: Length of the look-back window. Defaults to one hour.

    Returns:
        A list of dicts, one per matching document, each containing the
        document's fields plus a ``doc_id`` key holding the Firestore document
        id. An empty list is returned when nothing matches or when any error
        occurs (the error is printed, not raised).
    """
    try:
        if end_time_utc is None:
            # Built with the constructor instead of fromisoformat('...Z'):
            # the 'Z' suffix is only accepted by fromisoformat() from
            # Python 3.11 on, and on older versions the resulting ValueError
            # was swallowed below, making this function always return [].
            end_time_utc = datetime(2025, 7, 20, 9, 0, 0, tzinfo=pytz.utc)
        elif end_time_utc.tzinfo is None:
            end_time_utc = end_time_utc.replace(tzinfo=pytz.utc)
        else:
            end_time_utc = end_time_utc.astimezone(pytz.utc)

        start_time_utc = end_time_utc - window
        ist = pytz.timezone('Asia/Kolkata')

        print(f"Query Range (UTC): {start_time_utc.isoformat()} → {end_time_utc.isoformat()}")
        print(f"IST Range: {start_time_utc.astimezone(ist)} → {end_time_utc.astimezone(ist)}")

        # NOTE(review): the bounds are ISO-8601 *strings*, so the range filter
        # is a string comparison. This presumably relies on the stored
        # `timestamp` field being a string in the same format — confirm
        # against the writer of SOURCE_COLLECTION_NAME.
        lower_bound = start_time_utc.isoformat(timespec="seconds")
        upper_bound = end_time_utc.isoformat(timespec="seconds")

        docs = (
            DB.collection(SOURCE_COLLECTION_NAME)
            .where("timestamp", ">=", lower_bound)
            .where("timestamp", "<=", upper_bound)
            .stream()
        )

        # Carry the Firestore document id along so clusters can refer back to it.
        data = [doc.to_dict() | {"doc_id": doc.id} for doc in docs]
        print(f"Retrieved {len(data)} documents.")
        return data

    except Exception as e:
        # Best-effort boundary: the caller treats [] as "nothing to do".
        print(f"Error fetching from Firestore: {e}")
        return []

# --- 2. Process for Clustering ---
def _clean_text(value, default: str = 'unknown') -> str:
    """Return *value* stripped if it is a string, otherwise *default*."""
    return value.strip() if isinstance(value, str) else default


def _clean_tags(raw_tags) -> list:
    """Return the string tags in *raw_tags*, stripped and lower-cased."""
    if isinstance(raw_tags, str):
        # A lone string is one tag, not a sequence of characters.
        raw_tags = [raw_tags]
    elif not isinstance(raw_tags, (list, tuple)):
        # Missing field, explicit None, or an unexpected type.
        return []
    return [tag.strip().lower() for tag in raw_tags if isinstance(tag, str)]


def _timestamp_and_date(timestamp) -> tuple:
    """Return ``(json_safe_timestamp, 'YYYY-MM-DD' or 'unknown')``."""
    if isinstance(timestamp, datetime):
        # e.g. when the field is stored as a native timestamp rather than a
        # string; convert so the later json.dumps() of the result cannot fail.
        return timestamp.isoformat(), timestamp.date().isoformat()
    if isinstance(timestamp, str):
        text = timestamp.strip()
        if text.endswith(('Z', 'z')):
            # datetime.fromisoformat() only understands 'Z' from Python 3.11.
            text = text[:-1] + '+00:00'
        try:
            return timestamp, datetime.fromisoformat(text).date().isoformat()
        except ValueError:
            pass
    return timestamp, "unknown"


def process_raw_data(raw_data: list) -> list:
    """Reduce raw documents to the fields used for clustering.

    Args:
        raw_data: Iterable of dicts as produced by the fetch step. Each dict
            may contain ``id``/``doc_id``, ``location``, ``timestamp``,
            ``time_of_day`` and ``tags``; every field is optional.

    Returns:
        A list with one dict per input document holding ``doc_id``,
        ``location``, ``timestamp``, ``date``, ``time_of_day`` and ``tags``.
        Missing, ``None`` or non-string ``location``/``time_of_day`` values
        become ``"unknown"``; an unparseable timestamp yields the date
        ``"unknown"``; missing tags yield ``[]``. A ``datetime`` timestamp is
        converted to its ISO-8601 string so the result stays JSON-serialisable.
    """
    processed_docs = []
    for doc in raw_data:
        timestamp, date_only = _timestamp_and_date(doc.get('timestamp'))
        processed_docs.append({
            # Prefer an explicit 'id' field, fall back to the Firestore id.
            "doc_id": doc.get('id') or doc.get('doc_id'),
            "location": _clean_text(doc.get('location')),
            "timestamp": timestamp,
            "date": date_only,
            "time_of_day": _clean_text(doc.get('time_of_day')),
            "tags": _clean_tags(doc.get('tags')),
        })

    print(f"Processed {len(processed_docs)} documents.")
    return processed_docs

# --- 3. Clustering Prompt ---
def get_clustering_prompt(data_json: str) -> str:
    """Build the Gemini prompt that asks for clusters of the given posts.

    Args:
        data_json: JSON text of the processed posts; it is embedded verbatim
            in the "Input:" section of the prompt.

    Returns:
        The full prompt. It instructs the model to answer with a JSON array
        of cluster objects carrying the keys ``tags_used_for_clustering``,
        ``document_ids``, ``avg_time_of_day``, ``location``, ``date``,
        ``confidence_score`` and ``overall_sentiment``.
    """
    # The tag-overlap score is defined against the cluster's own
    # `tags_used_for_clustering`: the posts carry no separate query-tag list,
    # so any other reference set would leave the score undefined.
    return f"""
You are an intelligent clustering assistant. Your task is to analyze a list of event documents
and group them into logical clusters based on their 'location', 'time_of_day', and tags.

Clustering Rules:
1. Group by Location Proximity
2. Time of Day Similarity
3. Event Inference from Tags

Additional Instructions:
- From the 'time_of_day' fields, generate an average range (e.g., "10AM–1PM") and label it `avg_time_of_day`.
- Each post contains a 'date' field derived from its `timestamp`. Use this to include a `date` field in each cluster.
- Use the most common location among the posts in each cluster and label it as `location`.
- Analyze the tone of the tags and overall message implied in the posts to assign an `overall_sentiment` to each cluster. This can be one of: "positive", "neutral", or "negative".

Also calculate a confidence score for each cluster:
1. Tag overlap score (max 50):
   - For each post in the cluster, count how many of its tags appear in the cluster's `tags_used_for_clustering`.
   - Compute average overlap across all posts in the cluster.
   - Scale: (average_overlap / len(tags_used_for_clustering)) * 50

2. Post count score (max 50):
   - 0 posts → 0
   - 1–4 posts → 15
   - 5–24 posts → 30
   - 25–99 posts → 45
   - 100+ posts → 50

Final confidence = tag_overlap_score + post_count_score (max 100)

Input:
{data_json}

Output Format (JSON only):
[
  {{
    "tags_used_for_clustering": ["tag1", "tag2"],
    "document_ids": ["id1", "id2"],
    "avg_time_of_day": "10AM–1PM",
    "location": "New Delhi",
    "date": "2025-07-25",
    "confidence_score": 87.5,
    "overall_sentiment": "positive"
  }}
]
Only give JSON. No explanation.
"""

# --- 4. Gemini API Call ---
def call_gemini_api_for_clustering(processed_data: list, timeout: float = 60.0) -> list:
    """Ask Gemini to cluster the processed posts and return the parsed clusters.

    Args:
        processed_data: Posts as produced by the processing step. Must be
            JSON-serialisable; it is embedded in the prompt.
        timeout: Seconds to wait for the HTTP call before giving up. Without
            a timeout ``requests`` would wait indefinitely on a stalled
            connection.

    Returns:
        A list of cluster dicts parsed from the model's JSON answer. An empty
        list is returned when there is no input, when the request fails, or
        when the response is empty or not a JSON array. Errors are printed,
        never raised. Array entries that are not JSON objects are dropped,
        because later steps access clusters with ``.get``.
    """
    if not processed_data:
        print("No data to cluster.")
        return []

    try:
        # Built inside the try so an unserialisable input is reported like
        # any other failure instead of escaping as a TypeError.
        prompt_text = get_clustering_prompt(json.dumps(processed_data, indent=2))
        payload = {
            "contents": [{"role": "user", "parts": [{"text": prompt_text}]}],
            "generationConfig": {"responseMimeType": "application/json"}
        }

        print("Calling Gemini API...")
        response = requests.post(
            GEMINI_API_URL,
            headers={"Content-Type": "application/json"},
            data=json.dumps(payload),
            timeout=timeout,
        )
        response.raise_for_status()
        result = response.json()

        candidates = result.get("candidates") or []
        if not candidates:
            # e.g. the prompt was blocked; previously surfaced as a bare
            # "list index out of range".
            print("Gemini API returned no candidates.")
            return []

        content_parts = (candidates[0].get("content") or {}).get("parts") or []
        if not content_parts:
            print("Gemini API returned empty parts.")
            return []

        clusters = json.loads(content_parts[0]["text"])
        print("Clustering completed.")
        if not isinstance(clusters, list):
            return []
        return [cluster for cluster in clusters if isinstance(cluster, dict)]

    except requests.RequestException as e:
        # Connection problems, timeouts and non-2xx statuses.
        print(f"Error during Gemini API call: {e}")
        return []
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
        # Unserialisable input or a response that is not shaped as expected.
        print(f"Error during Gemini API call: {e}")
        return []

# --- 5. Assign UUIDs ---
def assign_uuids_to_clusters(clusters_list: list) -> dict:
    """Key every cluster by a freshly generated random UUID string.

    Returns a dict that maps ``str(uuid.uuid4())`` to the corresponding
    cluster object, in the same order as ``clusters_list``.
    """
    keyed_clusters = {}
    for cluster in clusters_list:
        new_id = str(uuid.uuid4())
        keyed_clusters[new_id] = cluster
    return keyed_clusters

# --- 6. Save to Firestore ---
def save_clusters_to_firestore(clustered_data: dict) -> None:
    """Replace the contents of the cluster collection with ``clustered_data``.

    Args:
        clustered_data: Mapping of cluster id to cluster dict. Only the keys
            ``tags_used_for_clustering``, ``document_ids``,
            ``avg_time_of_day``, ``location``, ``date``, ``confidence_score``
            and ``overall_sentiment`` are stored; missing keys get defaults.

    The documents to write are built and validated *before* anything is
    deleted. Entries whose value is not a dict are skipped, and if no valid
    entry remains the existing collection is left untouched. This prevents an
    empty or malformed payload from wiping previously stored clusters.
    Firestore errors are printed, not raised.
    """
    documents = {}
    for cluster_id, cluster_info in (clustered_data or {}).items():
        if not isinstance(cluster_info, dict):
            print(f"Skipping cluster '{cluster_id}': expected a dict, got {type(cluster_info).__name__}.")
            continue
        documents[cluster_id] = {
            "tags_used_for_clustering": cluster_info.get("tags_used_for_clustering", []),
            "document_ids": cluster_info.get("document_ids", []),
            "avg_time_of_day": cluster_info.get("avg_time_of_day", "unknown"),
            "location": cluster_info.get("location", "unknown"),
            "date": cluster_info.get("date", "unknown"),
            "confidence_score": cluster_info.get("confidence_score", 0.0),
            "overall_sentiment": cluster_info.get("overall_sentiment", "unknown"),
        }

    if not documents:
        print("No valid clusters to save; existing cluster data left untouched.")
        return

    try:
        cluster_ref = DB.collection(CLUSTER_COLLECTION_NAME)

        # Clear existing clusters so the collection reflects only this run.
        # NOTE: delete-then-insert is not atomic; a failure part-way through
        # leaves the collection partially populated.
        print("Clearing previous cluster data...")
        for doc in cluster_ref.stream():
            doc.reference.delete()

        # Insert new clusters
        for cluster_id, document in documents.items():
            cluster_ref.document(cluster_id).set(document)

        print(f"Inserted {len(documents)} clusters into Firestore '{CLUSTER_COLLECTION_NAME}'.")

    except Exception as e:
        # Best-effort boundary: persistence problems must not crash the run.
        print(f"Failed to save clusters to Firestore: {e}")

# --- Main ---
if __name__ == "__main__":
    # Pipeline: fetch -> normalise -> cluster with Gemini -> persist -> report.
    raw_documents = fetch_data_from_firestore()

    if not raw_documents:
        print("No documents found for the time range.")
    else:
        processed_data = process_raw_data(raw_documents)
        raw_clustered_list = call_gemini_api_for_clustering(processed_data)

        if not raw_clustered_list:
            print("No clusters returned.")
        else:
            clustered_results_with_uuids = assign_uuids_to_clusters(raw_clustered_list)
            save_clusters_to_firestore(clustered_results_with_uuids)

            # (label, cluster key, value shown when the key is absent)
            summary_fields = (
                ("Tags", "tags_used_for_clustering", None),
                ("Docs", "document_ids", None),
                ("Avg Time", "avg_time_of_day", "N/A"),
                ("Location", "location", "N/A"),
                ("Date", "date", "N/A"),
                ("Confidence", "confidence_score", None),
                ("Sentiment", "overall_sentiment", None),
            )

            print("\n--- Final Cluster Summary ---")
            for cluster_id, cluster_info in clustered_results_with_uuids.items():
                print(f"Cluster ID: {cluster_id}")
                for label, field, fallback in summary_fields:
                    print(f"  {label}: {cluster_info.get(field, fallback)}")
                print("-" * 40)
