<a href="https://colab.research.google.com/github/segnig/Amharic-E-commerce-Data-Extractor/blob/task-6/notebooks/task_six.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from datetime import datetime, timedelta
from collections import defaultdict
from transformers import pipeline
import numpy as np

# Build the token-classification (NER) pipeline that analyze_vendor_posts calls
# once per post text.
# NOTE(review): `model` and `tokenizer` are not defined in this cell — presumably
# they are created in an earlier notebook cell; confirm before running standalone.
# aggregation_strategy="simple" is requested so results come back as grouped
# entities; the helpers below read each entity's "entity_group" and "word" keys.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Example vendor_posts dictionary structure
# {
#   "vendor1": [{"text": "...", "views": 1200, "timestamp": datetime}, ...],
#   "vendor2": [...]
# }
def extract_price(entities):
    """Collect every numeric price found among NER entities.

    Args:
        entities: Iterable of entity dicts, each providing at least the
            "entity_group" and "word" keys.

    Returns:
        A list of floats, one per entity whose group contains "PRICE" and
        whose text parses as a number once the currency word "ብር" and
        thousands-separator commas are removed. Entities whose text is not
        numeric are skipped.
    """
    prices = []
    for ent in entities:
        if "PRICE" not in ent["entity_group"]:
            continue
        # Drop the currency word and thousands separators so that
        # "1,200 ብር" parses as 1200.0 instead of being discarded.
        price_text = ent["word"].replace("ብር", "").replace(",", "").strip()
        try:
            prices.append(float(price_text))
        except ValueError:
            # Price entities without a parseable number are ignored on
            # purpose; any other exception type is allowed to propagate.
            continue
    return prices

def extract_product(entities):
    """Return the text of the first product entity, or "Unknown" if there is none.

    Args:
        entities: Iterable of entity dicts, each providing at least the
            "entity_group" and "word" keys.

    Returns:
        The "word" of the first entity whose group contains "PRODUCT";
        the string "Unknown" when no entity matches.
    """
    product_words = (
        entity["word"]
        for entity in entities
        if "PRODUCT" in entity["entity_group"]
    )
    # The generator is lazy, so scanning stops at the first match.
    return next(product_words, "Unknown")

def analyze_vendor_posts(posts):
    """Compute engagement, activity and pricing metrics for one vendor.

    Args:
        posts: Sequence of post dicts. Each must have a "text" key and may
            have "views" (a number) and "timestamp" keys. Timestamps must
            support ordering and subtraction yielding an object with a
            ``days`` attribute (e.g. ``datetime``). A ``None`` value for
            "text" or "views" is treated as empty text / zero views.

    Returns:
        A dict with the keys "Avg Views/Post", "Posts/Week",
        "Avg Price (ETB)", "Lending Score", "Top Product",
        "Top Post Views" and "Top Post Price".
    """
    total_views = 0
    total_prices = []
    timestamps = []
    top_post = {"views": 0, "text": "", "product": "", "price": 0}

    for post in posts:
        # A key that is present but holds None would otherwise break the
        # comparisons and sums below, so normalise both fields up front.
        text = post["text"] or ""
        views = post.get("views") or 0
        timestamp = post.get("timestamp")

        if text.strip():
            entities = ner_pipeline(text)
            prices = extract_price(entities)
            product = extract_product(entities)
        else:
            # Nothing to tag: skip the model call and use the same results
            # the extractors produce for an empty entity list.
            prices = []
            product = "Unknown"

        # Strictly-greater keeps the earliest post when view counts tie.
        if views > top_post["views"]:
            top_post = {
                "views": views,
                "text": text,
                "product": product,
                "price": prices[0] if prices else 0,
            }

        total_views += views
        total_prices.extend(prices)
        if timestamp:
            timestamps.append(timestamp)

    # --- Metrics ---
    num_posts = len(posts)
    avg_views = total_views / num_posts if num_posts else 0
    avg_price = np.mean(total_prices) if total_prices else 0

    # Posting frequency per week
    if timestamps:
        min_date, max_date = min(timestamps), max(timestamps)
        # Floor the span at one week so a short history does not inflate
        # the rate (and a single timestamp does not divide by zero).
        weeks_active = max((max_date - min_date).days / 7, 1)
        post_per_week = num_posts / weeks_active
    else:
        post_per_week = 0

    # Equal-weight blend of average views and posting frequency.
    lending_score = 0.5 * avg_views + 0.5 * post_per_week

    return {
        "Avg Views/Post": round(avg_views, 2),
        "Posts/Week": round(post_per_week, 2),
        "Avg Price (ETB)": round(avg_price, 2),
        "Lending Score": round(lending_score, 2),
        "Top Product": top_post["product"],
        "Top Post Views": top_post["views"],
        "Top Post Price": top_post["price"],
    }

# Main function to score all vendors
def vendor_scorecard(vendor_posts):
    """Build a scorecard table with one row per vendor, best score first.

    Args:
        vendor_posts: Mapping of vendor name to that vendor's list of post
            dicts, as accepted by ``analyze_vendor_posts``.

    Returns:
        A pandas DataFrame with the columns "Vendor", "Avg Views/Post",
        "Posts/Week", "Avg Price (ETB)", "Lending Score", "Top Product",
        "Top Post Views" and "Top Post Price", sorted by "Lending Score"
        in descending order. An empty mapping yields an empty DataFrame
        that still carries these columns.
    """
    columns = [
        "Vendor",
        "Avg Views/Post",
        "Posts/Week",
        "Avg Price (ETB)",
        "Lending Score",
        "Top Product",
        "Top Post Views",
        "Top Post Price",
    ]

    summary = []
    for vendor, posts in vendor_posts.items():
        metrics = analyze_vendor_posts(posts)
        metrics["Vendor"] = vendor
        summary.append(metrics)

    # Passing the column list to the constructor fixes the column order and
    # keeps the frame well-formed when there are no vendors; selecting the
    # columns afterwards raised KeyError on an empty, column-less frame.
    df = pd.DataFrame(summary, columns=columns)
    return df.sort_values("Lending Score", ascending=False)

# Example usage:
# final_df = vendor_scorecard(vendor_posts)
# display(final_df)
# final_df.to_csv("vendor_scorecard.csv", index=False)
