In [None]:
import pandas as pd
import re
import json
import requests
import logging
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import html

# ---------- Setup Logging ----------
# Only ERROR-level records (request / parse failures) are written to the log file.
logging.basicConfig(
    filename='parse_errors_5_1.log',
    level=logging.ERROR,
    format='%(asctime)s - %(message)s',
)

# ---------- Load Dataset ----------
file_path = '/Users/nonny/Downloads/Remove Disagreement Version(2).xlsx'
df = pd.read_excel(file_path)
df = df.rename(columns={'Column1': 'ID'})

# Give both review columns the same name so the two frames can be stacked.
df_positive = df[['ID', 'PositiveReview']].rename(columns={'PositiveReview': 'Review'})
df_negative = df[['ID', 'NegativeReview']].rename(columns={'NegativeReview': 'Review'})

# Keep only rows whose review is present and not whitespace-only.
positive_has_text = df_positive['Review'].notna() & (df_positive['Review'].str.strip() != "")
negative_has_text = df_negative['Review'].notna() & (df_negative['Review'].str.strip() != "")
df_positive = df_positive[positive_has_text]
df_negative = df_negative[negative_has_text]

# Stack positive then negative reviews, drop exact (ID, Review) duplicates,
# and order by ID with a fresh 0..n-1 index.
df_combined = pd.concat([df_positive, df_negative], ignore_index=True)
df_combined = df_combined.drop_duplicates()
df_combined = df_combined.sort_values(by='ID')
df_combined = df_combined.reset_index(drop=True)
# Uncomment to limit the run to the first 100 reviews:
# df_combined = df_combined.head(100)
print(f" Loaded {len(df_combined)} reviews")

# ---------- Prompt & Helper Functions ----------
def build_example_shots():
    """Return the few-shot chat messages used to prime the model.

    Five example reviews are provided. Each contributes a ``user`` message
    (the review text) followed by an ``assistant`` message holding the
    expected answer as a JSON string in the ``{"Topics":[{...}]}`` layout.

    Returns:
        A new list of ten ``{"role": ..., "content": ...}`` dicts, alternating
        ``user`` and ``assistant``.
    """

    def render(value):
        # None becomes the bare JSON literal null; any other value is quoted as-is.
        return 'null' if value is None else '"' + value + '"'

    def answer(topics):
        # Serialised by hand (not json.dumps) so the spacing of the example
        # answers is exactly the format shown to the model.
        parts = [
            '"%s":[{"text": %s,"label": %s}]' % (name, render(text), render(label))
            for name, text, label in topics
        ]
        return '{"Topics":[{' + ','.join(parts) + '}]}'

    # (review text, ordered (topic, text, label) triples of the expected answer)
    examples = [
        # Example 1
        (
            "The room is enough big. But the room was a little bit dirty.",
            [
                ("Room1", "The room is enough big.", "Positive"),
                ("Room2", "the room was a little bit dirty.", "Negative"),
                ("Staff", None, None),
                ("Location", None, None),
                ("Food", None, None),
                ("Price", None, None),
                ("Facility", None, None),
                ("Check-in", None, None),
                ("Check-out", None, None),
                ("Taxi-issue", None, None),
                ("Booking-issue", None, None),
                ("Off", None, None),
            ],
        ),
        # Example 2
        (
            "The room was very clean, well decorated and modern, although not big. It was cheap.",
            [
                ("Room1", "The room was very clean, well decorated and modern", "Positive"),
                ("Room2", "although not big", "Negative"),
                ("Price", "cheap", "Positive"),
                ("Staff", None, None),
                ("Location", None, None),
                ("Food", None, None),
                ("Facility", None, None),
                ("Check-in", None, None),
                ("Check-out", None, None),
                ("Taxi-issue", None, None),
                ("Booking-issue", None, None),
                ("Off", None, None),
            ],
        ),
        # Example 3
        (
            "Location. The hotel was new and close to the airport, which made traveling easy. However, there was a lot of street noise outside the window. Staff. The receptionist was polite and friendly. However, check-in took longer than expected. The hotel lobby was welcoming and spacious. The room had a comfortable bed, but the air conditioning was loud at night. The neighbors were noisy through the walls, and the WiFi in the room was weak and unreliable. The breakfast buffet was delicious; however, the coffee was terrible. The price was reasonable for the quality. The building was charming with historical architecture.",
            [
                ("Room1", "The room had a comfortable bed.", "Positive"),
                ("Room2", "The air conditioning was loud at night.", "Negative"),
                ("Room3", "The neighbors were noisy through the walls.", "Negative"),
                ("Room4", "the WiFi in the room was weak and unreliable.", "Negative"),
                ("Staff", "The receptionist was polite and friendly.", "Positive"),
                ("Location1", "close to the airport, which made traveling easy.", "Positive"),
                ("Location2", "there was a lot of street noise outside the window.", "Negative"),
                ("Food1", "The breakfast buffet was delicious.", "Positive"),
                ("Food2", "the coffee was terrible.", "Negative"),
                ("Price", "The price was reasonable for the quality.", "Positive"),
                ("Facility1", "The hotel was new.", "Positive"),
                ("Facility2", "The hotel lobby was welcoming and spacious.", "Positive"),
                ("Facility3", "The building was charming with historical architecture.", "Positive"),
                ("Check-in", "check-in took longer than expected.", "Negative"),
                ("Check-out", None, None),
                ("Taxi-issue", None, None),
                ("Booking-issue", None, None),
                ("Off", "Location. Staff.", None),
            ],
        ),
        # Example 4
        (
            "location, service, overall was good, Sure worth it to come back again",
            [
                ("Room", None, None),
                ("Staff", None, None),
                ("Location", None, None),
                ("Food", None, None),
                ("Price", None, None),
                ("Facility", None, None),
                ("Check-in", None, None),
                ("Check-out", None, None),
                ("Taxi-issue", None, None),
                ("Booking-issue", None, None),
                # The label here is the quoted string "Null", not a JSON null.
                ("Off", "location, service, overall was good, Sure worth it to come back again", "Null"),
            ],
        ),
        # Example 5
        (
            "The apartment was new. The breakfast was amazing and the price was quite reasonable. Overall, we definitely planning to return again soon!",
            [
                ("Room", None, None),
                ("Staff", None, None),
                ("Location", None, None),
                ("Food", "The breakfast was amazing and the price was quite reasonable.", "Positive"),
                ("Price", None, None),
                ("Facility", "The apartment was new.", "Positive"),
                ("Check-in", None, None),
                ("Check-out", None, None),
                ("Taxi-issue", None, None),
                ("Booking-issue", None, None),
                ("Off", None, None),
            ],
        ),
    ]

    shots = []
    for review, topics in examples:
        shots.append({"role": "user", "content": review})
        shots.append({"role": "assistant", "content": answer(topics)})
    return shots

def build_prompt(review_text):
    """Assemble the complete plain-text prompt for a single review.

    The prompt consists of the system instructions, the few-shot examples
    from ``build_example_shots()``, and finally ``review_text`` as the last
    user turn. Each message is rendered as ``"<Role>: <content>"`` (role
    capitalised) and messages are separated by a blank line.

    Args:
        review_text: The review to classify.

    Returns:
        The prompt as one string.
    """
    system_prompt = (
        'Please output the following [text] according to the [constraints] in the [output format].\n '
        '[constraints]* The output should only be in the [output format], and you must classify which part of the text corresponds to which Topic in the [Topics]. '
        'Additionally, determine whether each classified element is Positive or Negative. If there is no corresponding element, put Null for both `text` and `label`. '
        'No extra characters like newline, `json`, or backticks outside the [output format]. '
        'If there are multiple elements for the same Topic, number them. '
        'If they have the same label, group them together.* \n '
        '[Topics] Room, Staff, Location, Food, Price, Facility, Check-in, Check-out, Taxi-issue, Booking-issue, Off \n\n '
        '[output format] '
        '{"Topics":[{"Room":[{"text": "test","label": "Positive"}],'
        '"Staff":[{"text": null,"label": null}],"Location":[{"text": "test","label": "Positive"}],'
        '"Food":[{"text": "test","label": "Negative"}],"Price":[{"text": "test","label": "Positive"}],'
        '"Facility":[{"text": "test","label": "Negative"}],"Check-in":[{"text": "test","label": "Positive"}],'
        '"Check-out":[{"text": null,"label": null}],"Taxi-issue":[{"text": null,"label": null}],'
        '"Booking-issue":[{"text": null,"label": null}],"Off":[{"text": null,"label": null}]}]}'
    )

    # System turn first, then the worked examples, then the review to label.
    conversation = [{"role": "system", "content": system_prompt}]
    conversation.extend(build_example_shots())
    conversation.append({"role": "user", "content": review_text})

    rendered_turns = []
    for turn in conversation:
        rendered_turns.append(f"{turn['role'].capitalize()}: {turn['content']}")
    return "\n\n".join(rendered_turns)

def send_request(prompt, timeout=None):
    """Send *prompt* to the local ``/api/generate`` endpoint and return the generated text.

    Args:
        prompt: Complete prompt text to submit.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            The default ``None`` waits indefinitely, which is what the call
            did before this parameter existed.

    Returns:
        The ``response`` field of the JSON reply. Returns ``""`` when the
        request fails, the server answers with an HTTP error status, the
        reply is not valid JSON, or the reply has no ``response`` field.
        Failures that raise are written to the error log together with the
        prompt.
    """
    try:
        response = requests.post(
            "http://localhost:11434/api/generate",
            json={
                "model": "llama3:8b",
                "prompt": prompt,
                "stream": False,
                "options": {"temperature": 0, "top_p": 0.05}
            },
            timeout=timeout,
        )
        # Turn 4xx/5xx replies into a logged request error; otherwise their
        # JSON body (which has no 'response' key) would silently yield "".
        response.raise_for_status()
        return response.json().get('response', '')
    except Exception as e:
        # Best-effort boundary: one failed request must not abort the batch.
        logging.error(f"[Request Error]: {e}\nPrompt:\n{prompt}")
        return ""

def try_parse_response(response_text):
    """Extract and parse the first JSON object found in a model response.

    Newlines and tabs are removed, trailing commas before ``}`` or ``]`` are
    dropped, and parsing starts at the first ``{``. Any text before that
    brace, and any text after the end of the parsed object, is ignored.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The parsed object (a dict), or ``None`` when ``response_text`` is not
        a string, contains no ``{``, or the text starting at the first ``{``
        is not valid JSON.
    """
    if not isinstance(response_text, str):
        return None

    cleaned = response_text.replace('\n', '').replace('\t', '').strip()
    # Drop trailing commas such as `},]` or `],}` that strict JSON rejects.
    cleaned = re.sub(r',\s*([}\]])', r'\1', cleaned)

    start = cleaned.find('{')
    if start == -1:
        return None

    try:
        # raw_decode stops at the end of the first complete object, so
        # trailing chatter (even if it contains braces) no longer breaks
        # an otherwise valid answer.
        parsed, _end = json.JSONDecoder().raw_decode(cleaned, start)
    except (ValueError, RecursionError):
        return None
    return parsed

def extract_entries(review_id, full_prompt, response_text):
    """Flatten one model response into a list of per-topic rows.

    Args:
        review_id: ID of the review the response belongs to.
        full_prompt: Prompt that produced the response; copied into every row.
        response_text: Raw text returned by the model.

    Returns:
        A list of dicts with the keys ``ID``, ``FullPrompt``, ``Topics``,
        ``Text`` and ``NegPos``, one per entry under each topic. Missing or
        null ``text`` / ``label`` values become ``""``. Returns ``[]`` (after
        logging) when the response cannot be parsed, has no ``"Topics"`` key,
        or is not shaped as a list of ``{topic: [entry-dict, ...]}`` mappings.
    """
    parsed = try_parse_response(response_text)

    if not parsed or "Topics" not in parsed:
        logging.error(f"[Parse Error] ID {review_id} failed to parse.\nResponse:\n{response_text}")
        return []  # Don't append to final_results if it's broken

    entries = []
    try:
        for topic_group in parsed["Topics"]:
            for topic, value_list in topic_group.items():
                for val in value_list:
                    entries.append({
                        "ID": review_id,
                        "FullPrompt": full_prompt,
                        "Topics": topic,
                        "Text": val.get("text") or "",
                        "NegPos": val.get("label") or ""
                    })
    except (AttributeError, TypeError) as exc:
        # Valid JSON with the wrong shape (e.g. "Topics" is a dict or null, or
        # an entry is a plain string). Treat it like any other parse failure:
        # log it and return nothing rather than raising out of the worker.
        logging.error(
            f"[Parse Error] ID {review_id} has an unexpected structure ({exc}).\nResponse:\n{response_text}"
        )
        return []
    return entries

def process_review(row):
    """Run the prompt -> request -> parse pipeline for a single review row.

    Args:
        row: A row providing the ``'Review'`` text and its ``'ID'``.

    Returns:
        The list returned by ``extract_entries`` for this review.
    """
    review_text = row['Review']
    review_id = row['ID']
    print(f"[Processing] ID {review_id}...", flush=True)

    prompt = build_prompt(review_text)
    model_output = send_request(prompt)
    return extract_entries(review_id, prompt, model_output)

# ---------- Run ----------
num_runs = 3
batch_size = 100  # write a checkpoint after this many completed reviews
output_file = f"ABA_dataset_2_llama3_5_shot_{num_runs}.csv"


def _save_results(rows, path):
    """Write *rows* to *path* as CSV; do nothing when *rows* is empty.

    An empty DataFrame would be written as a CSV without any columns, and the
    resume branch below (which reads the ``ID`` column) could not use it.
    """
    if rows:
        pd.DataFrame(rows).to_csv(path, index=False)


# Resume support: rows already in the output file are kept, and their IDs are
# excluded from this run.
if os.path.exists(output_file):
    df_existing = pd.read_csv(output_file)
    processed_ids = set(df_existing["ID"].unique())
    final_results = df_existing.to_dict(orient="records")
    print(f" Resuming from {output_file} with {len(processed_ids)} already processed.")
else:
    processed_ids = set()
    final_results = []

# Check for missing reviews (ID from df_combined)
# NOTE(review): the positive and negative review of one source row share the
# same ID, so an ID is treated as done as soon as either of them has rows in
# the output file — confirm this is acceptable for interrupted runs.
df_to_process = df_combined[~df_combined["ID"].isin(processed_ids)]
if df_to_process.empty:
    print(" All reviews already processed. Skipping.")
else:
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Map each future to its review ID so a failure can be attributed.
        futures = {
            executor.submit(process_review, row): row["ID"]
            for _, row in df_to_process.iterrows()
        }
        progress = tqdm(as_completed(futures), total=len(futures), desc="Recovering Missing IDs")
        for completed, future in enumerate(progress, start=1):
            try:
                result = future.result()
            except Exception as exc:
                # One failing review must not abort the run and discard the
                # results collected so far; log it and move on.
                logging.error(f"[Worker Error] ID {futures[future]} raised: {exc!r}")
                result = []
            final_results.extend(result)

            # Checkpoint on the number of completed reviews. The row count
            # grows by a variable amount per review, so testing it against
            # batch_size would rarely hit an exact multiple.
            if completed % batch_size == 0:
                _save_results(final_results, output_file)
                print(f" Saved {len(final_results)} rows so far to {output_file}...")

    _save_results(final_results, output_file)
    print(f"Recovery complete. Total saved: {len(final_results)} rows → {output_file}")
