In [None]:
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import LeaveOneGroupOut
from scipy.stats import hmean
from tqdm import tqdm
import pandas as pd
import re
import time
import os
import openai
from openai.error import RateLimitError, APIConnectionError, InvalidRequestError

#### GPT-4 configuration

In [None]:
# Azure OpenAI connection settings, applied through the module-level attributes
# of the `openai` package.
azure_endpoint = "https://rumi-gpt4.openai.azure.com/"
# Read the secret from the environment so it never has to be saved inside the
# notebook; when the variable is unset this falls back to the original empty
# placeholder value.
api_key = os.environ.get("AZURE_OPENAI_API_KEY", "")
api_version = "2024-02-15-preview"
openai.api_type = "azure"
openai.api_base = azure_endpoint
openai.api_key = api_key
openai.api_version = api_version

In [None]:
# Hugging Face checkpoint whose model and tokenizer are passed to get_embedding()
# when retrieving similar few-shot examples.
model_name = "jinaai/jina-embeddings-v2-base-en"
# trust_remote_code=True allows transformers to run model code shipped with the
# checkpoint repository; only the model (not the tokenizer) is loaded with it here.
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to generate embeddings for a given text
def get_embedding(model, tokenizer, text, max_length=8192):
    """
    Embed `text` by mean-pooling the model's last hidden state.

    Parameters:
        model: Callable invoked as `model(**encoded)`; its result must expose
            `last_hidden_state`.
        tokenizer: Callable invoked with `text` and the tensor/truncation/padding
            options below; its result is unpacked into the model call.
        text: Input handed to the tokenizer unchanged.
        max_length (int): Truncation limit forwarded to the tokenizer.

    Returns:
        The pooled tensor after `squeeze()`, converted with `tolist()`.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=max_length,
    )
    # Inference only: no autograd graph is needed for a pooled embedding.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        # Plain mean over dim=1 (presumably the token axis — confirm against the model).
        # NOTE(review): no attention mask is applied, so padded positions would be
        # averaged in if `text` were ever a batch of unequal-length strings.
        pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().tolist()

In [None]:
def clean_text(text):
    """
    Normalize a string before it is embedded.

    Every character that is not an ASCII letter, a digit, or whitespace is
    removed; each run of whitespace is then collapsed to a single space and the
    ends are trimmed.
    """
    alnum_and_space_only = re.sub(r'[^A-Za-z0-9\s]', '', text)
    single_spaced = re.sub(r'\s+', ' ', alnum_and_space_only)
    return single_spaced.strip()

In [None]:
def load_reviews_data(datapath, debugging=False):
    """
    Read the review spreadsheet at `datapath` and return one dict per row.

    Parameters:
        datapath: Location of the Excel file, passed to `pd.read_excel`.
        debugging (bool): When True, only the first 5 rows are processed.

    Returns:
        list[dict]: One record per row. Each record carries the selected columns
        plus 'raw_chunk', a ready-to-paste few-shot example made of the sentence,
        its manual classification and its manual atypical aspects. The optional
        columns 'doc_id', 'chunk_id' and 'original_index' default to None when
        the row has no such label.
    """
    frame = pd.read_excel(datapath)
    if debugging:
        print("Debugging Mode: Using few document samples only")
        frame = frame.head(5)

    def as_record(row):
        # These three columns are used both on their own and inside 'raw_chunk'.
        sentence = row['split_decomposed_review']
        label = row['manual_ata_classification']
        aspects = row['manual_ata_extractive']
        return {
            "raw_chunk": f"{sentence}\n\nClassification: {label}\nAtypical Aspects: {aspects}",
            "name": row['name'],  # Debugging field
            "business_id": row['business_id'],  # Debugging field
            "review": row['review'],
            "d_review": row['decomposed_review'],
            "split_decomposed_review": sentence,
            "manual_ata_label": label,
            "manual_ext_aspects": aspects,
            "true_strong_weak": row['true_strong_weak'],
            #"true_strong": row['true_strong'],
            "abs_true_strong_weak": row['abs_true_strong_weak'],
            "doc_id": row.get("doc_id", None),
            "chunk_id": row.get("chunk_id", None),
            "original_index": row.get("original_index", None),
        }

    # tqdm wraps the row iterator so the bar advances once per converted row.
    rows = tqdm(frame.iterrows(), total=len(frame), desc="Processing reviews")
    return [as_record(row) for _, row in rows]

def process_results_to_excel(results, output_file):
    """
    Tidy the prediction records and write them to an Excel workbook.

    Parameters:
        results (list): Prediction dictionaries as returned by the inference
            functions.
        output_file (str): Destination path of the Excel file.

    Entries missing any required field are reported on stdout and left out.
    When a prediction contains the marker "Atypical:", only the text after its
    last occurrence (whitespace-trimmed) is exported.
    """
    passthrough_fields = ('name', 'business_id', 'review', 'split_decomposed_review', 'abs_true_strong_weak')
    required_fields = passthrough_fields + ('predicted_output',)
    marker = "Atypical:"

    rows = []
    for entry in results:
        if any(field not in entry for field in required_fields):
            print(f"Skipping incorrect entry: {entry}")
            continue

        prediction = entry['predicted_output']
        if marker in prediction:
            # The model may echo the prompt's marker; keep only what follows it.
            prediction = prediction.split(marker)[-1].strip()

        row = {field: entry[field] for field in passthrough_fields}
        row['atypical_aspects'] = prediction  # Cleaned output
        rows.append(row)

    pd.DataFrame(rows).to_excel(output_file, index=False)
    print(f"Data successfully exported to {output_file}")

In [None]:
def load_prompt_params(review_sentence):
    """
    Build the query-specific part of the prompt for one review sentence.

    Parameters:
        review_sentence (str): The sentence to be classified.

    Returns:
        str: The sentence followed by a blank line and the "Atypical:" cue that
        the model is expected to complete. The surrounding indentation is kept
        as in the original template.
    """
    # The original template appended an unmatched apostrophe (and a trailing
    # space) to every sentence; it is removed so the sentence reaches the model
    # unaltered. The layout is spelled out with explicit escapes so the
    # whitespace cannot be changed by an editor.
    return (
        "\n"
        "    \n"
        f"    {review_sentence}\n"
        "\n"
        "    Atypical:\n"
        "    "
    )

### GPT-4

In [None]:
def generate_model_response(prompt, prompt_params, test_instance, max_attempts=3, retry_wait=150):
    """
    Send one request to the "gpt-4" chat deployment and package the reply.

    Parameters:
        prompt (str): Instruction prompt, sent as the first user message.
        prompt_params (str): Query-specific text, sent as the second user message.
        test_instance (dict): Source record; its 'name', 'business_id', 'review',
            'split_decomposed_review' and 'abs_true_strong_weak' values are
            copied into the result.
        max_attempts (int): Total number of tries when the failure is transient
            (RateLimitError or APIConnectionError). At least one try is made.
        retry_wait (float): Seconds to sleep after a RateLimitError before the
            next try.

    Returns:
        dict | None: The copied fields plus 'predicted_output' (the reply text),
        or None when no reply could be obtained. Errors are printed, not raised.
    """
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant"},
        {"role": "user", "content": prompt},
        {"role": "user", "content": prompt_params}
    ]

    formatted_messages = "\n".join(f"{message['role']}: {message['content']}" for message in messages)
    print("Messages being sent to GPT:\n", formatted_messages)

    attempts = max(1, int(max_attempts))
    for attempt in range(1, attempts + 1):
        try:
            response = openai.ChatCompletion.create(
                messages=messages,
                engine="gpt-4",
                temperature=0,
                frequency_penalty=0,
                presence_penalty=0,
                max_tokens=100
            )
            result_content = response['choices'][0]['message']['content']
            return {
                "name": test_instance['name'],
                "business_id": test_instance['business_id'],
                "review": test_instance['review'],
                "split_decomposed_review": test_instance['split_decomposed_review'],
                "abs_true_strong_weak": test_instance['abs_true_strong_weak'],
                "predicted_output": result_content,
            }
        except InvalidRequestError as e:
            # The payload itself was rejected; resending it cannot succeed.
            print(f"Error encountered: {e}")
            return None
        except (RateLimitError, APIConnectionError) as e:
            print(f"Error encountered: {e}")
            if attempt == attempts:
                # Out of tries: give up without a pointless final sleep.
                break
            if isinstance(e, RateLimitError):
                # Previously the function slept and then returned None, losing
                # the instance; now the wait is followed by a real retry.
                time.sleep(retry_wait)
        except Exception as e:
            print("An unexpected error occurred:", e)
            return None
    return None

### Zero-Shot

In [None]:
def load_prompt(prompt_path=None):
    """
    Read the zero-shot prompt template and return it verbatim.

    Parameters:
        prompt_path (str | os.PathLike | None): Template file to read. When
            omitted, the notebook's original hard-coded location is used, so
            existing `load_prompt()` calls behave as before.

    Returns:
        str: The complete contents of the template file (no formatting applied).

    Raises:
        OSError: If the file cannot be opened.
    """
    if prompt_path is None:
        prompt_path = '/Users/innerpiece92/Desktop/NLP_Workspace/AArec/prompt/extracting/prompt_for_extractive_ata_classification_restaurants_zs.txt'

    # Explicit encoding so the prompt decodes the same way on every platform.
    with open(prompt_path, 'r', encoding='utf-8') as file:
        return file.read()

def zero_shot_inference(data):
    """
    Run zero-shot inference over every instance in `data` (no cross-validation).

    Parameters:
        data (iterable of dict): Instances; each must provide
            'split_decomposed_review' plus the fields that
            generate_model_response copies into its result.

    Returns:
        list[dict]: The non-empty results of generate_model_response, in input
        order. Instances for which no response was obtained are omitted and
        counted in a warning printed at the end.
    """
    print("Performing zero-shot inference using all data")
    system_labels = []
    skipped = 0

    # The template is identical for every instance, so read it from disk once
    # instead of on each iteration.
    prompt = load_prompt()

    for test_instance in data:
        test_sentence = test_instance['split_decomposed_review']
        prompt_params = load_prompt_params(test_sentence)

        system_label = generate_model_response(prompt, prompt_params, test_instance)

        if system_label:
            print(system_label['predicted_output'])
            system_labels.append(system_label)
        else:
            skipped += 1

    if skipped:
        # Make dropped instances visible instead of silently shrinking the output.
        print(f"Warning: {skipped} instance(s) produced no prediction and were skipped")

    return system_labels

### Fixed Few-Shot

In [None]:
def load_prompt_fs(prompt_path=None, few_shot_path=None):
    """
    Read the few-shot prompt template and fill in the fixed few-shot examples.

    Parameters:
        prompt_path (str | os.PathLike | None): Template containing a
            `{few_shot_examples}` placeholder. Defaults to the notebook's
            original hard-coded location.
        few_shot_path (str | os.PathLike | None): File whose full text replaces
            the placeholder. Defaults to the notebook's original hard-coded
            location.

    Returns:
        str: The template with `{few_shot_examples}` substituted via `str.format`.

    Raises:
        OSError: If either file cannot be opened.
        KeyError / IndexError / ValueError: From `str.format` if the template
            contains other, unescaped braces.
    """
    if prompt_path is None:
        prompt_path = '/Users/innerpiece92/Desktop/NLP_Workspace/AArec/prompt/extracting/prompt_for_extractive_ata_classification_restaurants_fs.txt'
    if few_shot_path is None:
        few_shot_path = '/Users/innerpiece92/Desktop/NLP_Workspace/AArec/prompt/extracting/few_shot_for_extractive_ata_classification_restaurants.txt'

    # Explicit encoding so both files decode the same way on every platform;
    # the two reads are independent, so they no longer need to be nested.
    with open(prompt_path, 'r', encoding='utf-8') as file:
        prompt = file.read()
    with open(few_shot_path, 'r', encoding='utf-8') as fewshot_file:
        few_shot_examples = fewshot_file.read()

    return prompt.format(few_shot_examples=few_shot_examples)

def fixed_few_shot_inference(data):
    """
    Run fixed few-shot inference over every instance in `data` (no cross-validation).

    The same prompt, built by load_prompt_fs() from the fixed example file, is
    used for all instances.

    Parameters:
        data (iterable of dict): Instances; each must provide
            'split_decomposed_review' plus the fields that
            generate_model_response copies into its result.

    Returns:
        list[dict]: The non-empty results of generate_model_response, in input
        order. Instances for which no response was obtained are omitted and
        counted in a warning printed at the end.
    """
    # The original message (and docstring) said "zero-shot", copied from the
    # zero-shot variant; it now names the mode that is actually running.
    print("Performing fixed few-shot inference using all data")
    system_labels = []
    skipped = 0

    # The prompt does not depend on the instance, so build it once rather than
    # re-reading both files on every iteration.
    prompt = load_prompt_fs()

    for test_instance in data:
        test_sentence = test_instance['split_decomposed_review']
        prompt_params = load_prompt_params(test_sentence)

        system_label = generate_model_response(prompt, prompt_params, test_instance)

        if system_label:
            print(system_label['predicted_output'])
            system_labels.append(system_label)
        else:
            skipped += 1

    if skipped:
        # Make dropped instances visible instead of silently shrinking the output.
        print(f"Warning: {skipped} instance(s) produced no prediction and were skipped")

    return system_labels

### RAG Few-Shot w/ labelwise examples

In [None]:
def precompute_embeddings_and_index(data, model, tokenizer):
    """
    Attach a sentence embedding to each usable item and group items by label.

    Parameters:
        data (iterable of dict): Items providing 'split_decomposed_review'
            (a string) and, optionally, 'manual_ata_label'.
        model, tokenizer: Forwarded to get_embedding.

    Returns:
        dict: {"<pos>": [...], "<neg>": [...]} holding the original item dicts.
        Items with an empty sentence, a label other than "<pos>"/"<neg>", or a
        sentence already seen under the same label are left out.

    Side effects:
        Each kept item gains a 'sentence_embedding' entry (the item dict is
        mutated in place). An item that already carries one is not re-embedded,
        so calling this repeatedly over overlapping data — e.g. once per
        cross-validation fold — embeds each sentence only once. Reload the data
        if the embedding model changes, otherwise stored vectors are reused.
    """
    grouped_data = {"<pos>": [], "<neg>": []}
    seen_sentences = {"<pos>": set(), "<neg>": set()}  # Track duplicates per label

    for item in data:
        sentence = item['split_decomposed_review'].strip()
        label = item.get('manual_ata_label')

        if not sentence or label not in grouped_data:
            continue
        if sentence in seen_sentences[label]:  # Avoid duplicates
            continue

        # Reuse the vector stored by an earlier call instead of running the
        # model again on the same sentence.
        if item.get('sentence_embedding') is None:
            item['sentence_embedding'] = get_embedding(model, tokenizer, clean_text(sentence))

        grouped_data[label].append(item)
        seen_sentences[label].add(sentence)

    # Debugging: Print the number of examples per label
    print(f"Precomputed {len(grouped_data['<pos>'])} <pos> examples and {len(grouped_data['<neg>'])} <neg> examples.\n")

    return grouped_data

def calculate_top_k_examples(test_sentence, grouped_data, model, tokenizer, top_k=4):
    """
    Rank the stored examples of each label by similarity to `test_sentence`.

    Parameters:
        test_sentence (str): Sentence to embed (after clean_text) as the query.
        grouped_data (dict): Must have "<pos>" and "<neg>" lists whose items
            provide 'sentence_embedding' and 'split_decomposed_review'.
        model, tokenizer: Forwarded to get_embedding.
        top_k (int): Number of best matches kept per label.

    Returns:
        dict: {"<pos>": [...], "<neg>": [...]} where each list holds up to
        `top_k` `(similarity, item)` tuples, highest similarity first.
    """
    query_vector = get_embedding(model, tokenizer, clean_text(test_sentence))
    ranked_by_label = {"<pos>": [], "<neg>": []}

    for label in ("<pos>", "<neg>"):
        print(f"Calculating similarities for label: {label}")

        scored = []
        for candidate in grouped_data[label]:
            # cosine_similarity returns a 1x1 matrix for a single pair of vectors.
            pairwise = cosine_similarity([query_vector], [candidate['sentence_embedding']])
            similarity = pairwise[0][0]
            scored.append((similarity, candidate))  # Keep the full item alongside its score

            # Debugging: Print similarity for each example
            print(f"Review: {candidate['split_decomposed_review'][:50]}... | Similarity: {similarity:.4f}")

        # Order by the score only (items are never compared) and keep the best top_k.
        best = sorted(scored, key=lambda pair: pair[0], reverse=True)[:top_k]
        ranked_by_label[label] = best

        # Debugging: Print top-k examples with similarity scores
        print(f"\nTop-{top_k} examples for label '{label}':")
        for rank, (similarity, candidate) in enumerate(best, 1):
            print(f"{rank}. Similarity: {similarity:.4f} | Sentence: {candidate['split_decomposed_review'][:50]}...\n")

    return ranked_by_label

def load_prompt_rag(few_shot_examples, prompt_path=None):
    """
    Read the few-shot prompt template and fill in the supplied examples.

    Parameters:
        few_shot_examples (str): Text substituted for the template's
            `{few_shot_examples}` placeholder.
        prompt_path (str | os.PathLike | None): Template file to read. Defaults
            to the notebook's original hard-coded location.

    Returns:
        str: The template with the placeholder substituted via `str.format`.

    Raises:
        OSError: If the file cannot be opened.
        KeyError / IndexError / ValueError: From `str.format` if the template
            contains other, unescaped braces.
    """
    if prompt_path is None:
        prompt_path = '/Users/innerpiece92/Desktop/NLP_Workspace/AArec/prompt/extracting/prompt_for_extractive_ata_classification_restaurants_fs.txt'

    # Explicit encoding so the template decodes the same way on every platform.
    with open(prompt_path, 'r', encoding='utf-8') as file:
        prompt = file.read()

    return prompt.format(few_shot_examples=few_shot_examples)

def logocv_business_grouped_inference(data, model, embed_tokenizer=None, top_k=4):
    """
    Run retrieval-based few-shot inference with Leave-One-Group-Out
    Cross-Validation (LOGO-CV), grouping instances by `business_id`.

    For each fold, the instances of one business are the test set and all
    others are the retrieval pool. For every test sentence the `top_k` most
    similar "<neg>" and "<pos>" pool examples are placed in the prompt
    (negatives first).

    Parameters:
        data (list of dict): Instances providing 'business_id',
            'split_decomposed_review', 'raw_chunk', and the fields used by the
            helper functions.
        model: Embedding model forwarded to the embedding helpers.
        embed_tokenizer: Tokenizer paired with `model`. When omitted, the
            module-level `tokenizer` is used, as the function did originally.
        top_k (int): Number of examples per label. A label with fewer matches
            is padded with "Placeholder example for <label>" strings.

    Returns:
        list[dict]: The non-empty results of generate_model_response, in fold
        order.
    """
    print("Performing inference using Leave-One-Group-Out Cross-Validation (LOGO-CV).")
    system_labels = []

    # Make the tokenizer dependency explicit; only fall back to the notebook
    # global when the caller does not supply one.
    active_tokenizer = tokenizer if embed_tokenizer is None else embed_tokenizer

    # Extract groups (business_id) from the data
    groups = [item['business_id'] for item in data]
    logo = LeaveOneGroupOut()

    for fold_idx, (train_indices, test_indices) in enumerate(logo.split(data, groups=groups)):
        test_business_id = groups[test_indices[0]]
        print(f"\nProcessing Fold {fold_idx + 1}: Testing on Business ID {test_business_id}...")

        train_data = [data[i] for i in train_indices]
        test_data = [data[i] for i in test_indices]

        # Precompute embeddings and group by <pos> and <neg>
        grouped_data = precompute_embeddings_and_index(train_data, model, active_tokenizer)

        for test_instance in test_data:
            test_sentence = test_instance['split_decomposed_review']
            print(f"\nProcessing Test Sentence: {test_sentence[:50]}...\n")

            # Retrieve the top_k most similar examples for <pos> and <neg>
            top_k_matches = calculate_top_k_examples(
                test_sentence, grouped_data, model, active_tokenizer, top_k=top_k
            )

            # Prepare few-shot examples directly from top_k_matches
            few_shot_examples = {"<pos>": [], "<neg>": []}

            for label in top_k_matches:
                few_shot_examples[label].extend(
                    item['raw_chunk'] for _, item in top_k_matches[label]
                )

                # Pad to the same count that was requested from retrieval, so
                # the prompt always holds top_k slots per label.
                while len(few_shot_examples[label]) < top_k:
                    few_shot_examples[label].append(f"Placeholder example for {label}")

            few_shot_formatted = "\n\n".join(
                [f"Example {i+1}:\n\n{chunk}" for i, chunk in enumerate(few_shot_examples["<neg>"] + few_shot_examples["<pos>"])]
            )

            prompt = load_prompt_rag(few_shot_formatted)
            prompt_params = load_prompt_params(test_sentence)

            system_label = generate_model_response(prompt, prompt_params, test_instance)

            if system_label:
                print(f"Predicted Output: {system_label['predicted_output']}\n")
                system_labels.append(system_label)

    return system_labels

### RAG Few-Shot

### Zero-Shot Inference

In [None]:
# Zero-shot run: load every row of the restaurant review spreadsheet
# (debugging=False disables the 5-row sample) and query the model once per row.
reviews_path = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/datasets/extracting_atypical_aspects_of_items_from_reviews/restaurants/test/rcb_using_decomposed_review_classification_v3/restaurant_reviews_split_ata_classified.xlsx"
data_to_process = load_reviews_data(reviews_path, debugging=False)
# results_zs holds only the instances for which a response was obtained.
results_zs = zero_shot_inference(data_to_process)

In [None]:
# Write the zero-shot predictions to an Excel file in the same directory as the input data.
output_file = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/datasets/extracting_atypical_aspects_of_items_from_reviews/restaurants/test/rcb_using_decomposed_review_classification_v3/system_abstractive_ata_restaurants_reviews_zs.xlsx"
process_results_to_excel(results_zs, output_file)

### Fixed Few-Shot Inference

In [None]:
# Fixed few-shot run over the same spreadsheet as the zero-shot run; the data is
# reloaded here, so this cell does not depend on the zero-shot cell having run.
reviews_path = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/datasets/extracting_atypical_aspects_of_items_from_reviews/restaurants/test/rcb_using_decomposed_review_classification_v3/restaurant_reviews_split_ata_classified.xlsx"
data_to_process = load_reviews_data(reviews_path, debugging=False)
# results_fs holds only the instances for which a response was obtained.
results_fs = fixed_few_shot_inference(data_to_process)

In [None]:
# Write the fixed few-shot predictions to an Excel file in the same directory as the input data.
output_file = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/datasets/extracting_atypical_aspects_of_items_from_reviews/restaurants/test/rcb_using_decomposed_review_classification_v3/system_abstractive_ata_restaurants_reviews_fs.xlsx"
process_results_to_excel(results_fs, output_file)

### Leave-One-Group-Out Inference w/ labelwise

In [None]:
# Retrieval few-shot run with leave-one-business-out cross-validation.
# NOTE: this reads the "pr_"-prefixed spreadsheet, not the file used by the
# zero-shot and fixed few-shot runs above.
reviews_path = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/datasets/extracting_atypical_aspects_of_items_from_reviews/restaurants/test/rcb_using_decomposed_review_classification_v3/pr_restaurant_reviews_split_ata_classified.xlsx"
data_to_process = load_reviews_data(reviews_path, debugging=False)
# `model` is the module-level embedding model loaded earlier in the notebook.
results_logocv = logocv_business_grouped_inference(data_to_process, model)

In [None]:
# Write the LOGO-CV retrieval few-shot predictions to an Excel file in the same directory as the input data.
output_file = "/Users/innerpiece92/Desktop/NLP_Workspace/AArec/datasets/extracting_atypical_aspects_of_items_from_reviews/restaurants/test/rcb_using_decomposed_review_classification_v3/pr_system_abstractive_ata_restaurants_reviews_rag.xlsx"
process_results_to_excel(results_logocv, output_file)