In [34]:
!pip install rapidfuzz

70019.90s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0


In [None]:
from langchain_community.llms import Ollama
import pandas as pd
import glob
import os
import yaml
from rapidfuzz import process  # modern replacement for fuzzywuzzy

model = "gemma3"  # Specify your model here
# Load the YAML file
with open("data/templates/GHG_templates.yaml", "r") as f:
    templates = yaml.safe_load(f)

data_folder = "data/sample_GHG_data/"
data_files = glob.glob(os.path.join(data_folder, "*.xlsx"))

GHG_categories = list(templates.keys())  # Assuming YAML file has GHG categories as keys
print("GHG Categories:", GHG_categories)

def match_category(input_str, choices, threshold=80):
    """
    Match input string to the closest category.
    Returns best match if similarity >= threshold, else None.
    """
    best_match, score, _ = process.extractOne(input_str, choices)
    return best_match if score >= threshold else None

# Initialize LLM once
ollama = Ollama(
    base_url="http://localhost:11434",
    model=model
)

results = []

for file in data_files[:1]:
    df = pd.read_excel(file)

    # Extract ground truth
    if 'GHGCategory' in df.columns:
        gt_value = str(df['GHGCategory'].iloc[0])
    elif 'GHG Category' in df.columns:
        gt_value = str(df['GHG Category'].iloc[0])
    else:
        gt_value = 'Not GHG-related'

    # Prepare the prompt with sample data
    sample_data = df.head(3).to_string(index=False)

    prompt = f"""
    Here are the first 3 rows of an Excel file:

    {sample_data}

    Analyze this sample data and classify it into one of the following GHG categories:

    {GHG_categories}

    Important rules:
    - Respond with the exact category name from the list above.
    - If none of the categories apply, respond with "Not GHG-related".
    - Do not provide explanations, just the category name.
    """

    # Call LLM
    response = ollama.invoke(prompt).strip()
    print(f"{file} → {response} | Ground Truth: {gt_value}")

    # Compare response with ground truth
    matched = match_category(input_str=response, choices=[gt_value])
    process_template = 'Yes' if matched else 'No'

    # Collect result row
    results.append({
        "Filename": os.path.basename(file),
        "Response": response,
        "Ground Truth": gt_value,
        "process_template": process_template
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Save to Excel
output_excel = "output_data/classify_aug20.xlsx"
results_df.to_excel(output_excel, index=False)

print(f"Summary saved to {output_excel}")


GHG Categories: ['Scope_1_Fugitives', 'Scope_2_PurchasedEnergy', 'SScope_3_1_PurchasedGoods&Services_SpendBasedMethod']
data/sample_GHG_data/Scope_1_Direct_Emissions_Fugitives.xlsx → Scope_1_Fugitives | Ground Truth: Scope_1_Fugitives
Summary saved to output_data/classify_aug20.xlsx


# calculator payload mapping

In [None]:
import os
import json
import yaml
import pandas as pd
import ast
from langchain_community.llms import Ollama

model = "gemma3"  # Specify your model here
# Load results Excel
output_excel = pd.read_excel("output_data/classify_aug20.xlsx")

# Load YAML once
with open("data/templates/GHG_templates.yaml", "r") as f:
    templates = yaml.safe_load(f)

data_folder = "data/sample_GHG_data/"
ollama = Ollama(base_url="http://localhost:11434", model=model)

output_template_payload = {}

for idx, row in output_excel.iterrows():
    if row["process_template"] == "Yes":
        ground_truth = row["Ground Truth"]
        filename = row["Filename"]

        payload_template = templates.get(ground_truth, {})
        scope_data = pd.read_excel(os.path.join(data_folder, filename))
        n_rows = len(scope_data)

        # Convert dataframe rows to JSON-style records
        sample_data = scope_data.to_dict(orient="records")

        prompt = f"""
        You are a data processing assistant. 

        I have an Excel file containing **{ground_truth}** data.  
        Below is the payload template for each row:  

        {payload_template}

        Here are all {n_rows} rows from the Excel file as JSON records:
        {sample_data}

        For each row, generate **one dictionary** that fills in the template 
        using values from that row.  

        Return a **Python list of {n_rows} dictionaries**, one per row.  
        Do not add explanations. Do not wrap in code fences.  
        """

        # Get response
        response = ollama.invoke(prompt).strip()

        # Clean up if model still adds code fences
        if response.startswith("```"):
            response = response.strip("`")
            response = response.split("\n", 1)[-1]
            response = response.rsplit("\n", 1)[0]

        # Parse into Python list
        try:
            try:
                parsed = ast.literal_eval(response)
            except Exception:
                parsed = json.loads(response)  # fallback if it's JSON
        except Exception as e:
            print(f"⚠️ Failed to parse response for {filename}: {e}")
            parsed = response

        output_template_payload[filename] = parsed
        print(f"✅ Generated {len(parsed) if isinstance(parsed, list) else 'unknown'} payloads for {filename}")

# Save to JSON
output_json = "output_data/template_payload_aug20_v2.json"
with open(output_json, "w") as f:
    json.dump(output_template_payload, f, indent=4)

print(f"🎯 Saved generated payloads to {output_json}")


⚠️ Failed to parse response for Scope_1_Direct_Emissions_Fugitives.xlsx: Extra data: line 98 column 1 (char 1808)
✅ Generated unknown payloads for Scope_1_Direct_Emissions_Fugitives.xlsx
🎯 Saved generated payloads to output_data/template_payload_aug20_v2.json


In [40]:
output_template_payload

{'Scope_1_Direct_Emissions_Fugitives.xlsx': "```python\n[\n    {'row_uuid': 'testmulti', 'facility_code': 'general insurance', 'invoice_no': '1', 'invoice_date': '2022-01-01', 'start_date': '2022-01-01', 'end_date': '2022-12-31', 'activity_amount': 'Actual', 'unit_id': 'testmulti', 'emission_type': 'activity data type', 'vehicle_fuel': 'vehicle name', 'number_of_vehicle': 'number of vehicles', 'cost': 'cost', 'currency': 'currency', 'supplier': 'supplier', 'description': 'notes', 'tag_id': ['tag1', 'tag2', '...'], 'link': 'evidence_url', 'link_name': 'evidence_name', 'link_note': 'evidence_note', 'Updatetype': 'System'}\n]\n```"}