<a href="https://colab.research.google.com/github/solosolve-ai/solosolve-ai-demo/blob/main/Gemini_SFT_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

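Please ensure you have imported a Gemini API key from AI Studio and stored it as a Colab secret named `GOOGLE_API_KEY` (the name the setup cells read).
You can do this directly in the Secrets tab on the left; make sure notebook access is enabled for the secret.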

After doing so, please run the setup cell below.

In [1]:
!pip install -U -q "google"
!pip install -U -q "google.genai"
!pip install tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.3/45.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.3/196.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m


In [1]:
import os
from google.colab import userdata
from google.colab import drive
# Copy the Colab secret named GOOGLE_API_KEY into the GEMINI_API_KEY environment
# variable, which is the name the later cells read the key from.
os.environ["GEMINI_API_KEY"] = userdata.get("GOOGLE_API_KEY")

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount("/content/drive")
# Please ensure that uploaded files are available in the AI Studio folder or change the working folder.
# os.chdir raises if this folder does not exist on the mounted Drive.
os.chdir("/content/drive/MyDrive/Google AI Studio")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Generated Code

In [2]:
# To run this code you need to install the following dependencies:
# pip install google-genai

import base64
import os
from google import genai
from google.genai import types
from tqdm.auto import tqdm # <--- ADD THIS LINE


def generate(prompt="""INSERT_INPUT_HERE"""):
    """Stream a Gemini completion for ``prompt`` and print it to stdout.

    Args:
        prompt: User text sent as the single "user" turn. The default keeps the
            original AI Studio placeholder so that ``generate()`` with no
            arguments behaves exactly as before; pass real text to get a
            useful answer.

    Returns:
        None. The streamed text is written to stdout as it arrives.
    """
    # The API key is read from the environment (set by the setup cell).
    client = genai.Client(
        api_key=os.environ.get("GEMINI_API_KEY"),
    )

    model = "gemini-2.5-flash-preview-04-17"
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text=prompt),
            ],
        ),
    ]
    generate_content_config = types.GenerateContentConfig(
        response_mime_type="text/plain",
    )

    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        # A streamed chunk may carry no text (chunk.text is then None);
        # skip it instead of printing the literal string "None".
        if chunk.text:
            print(chunk.text, end="")

if __name__ == "__main__":
    generate()


It looks like you've included the placeholder `INSERT_INPUT_HERE` in your prompt.

Please replace `INSERT_INPUT_HERE` with the actual text, question, or task you'd like me to help you with.

I'm ready when you provide your input!

In [3]:
# Snippet 1: Installs and Setup (Modified for datasets)
# Installs the packages imported by the following cells: google-generativeai
# (imported as `google.generativeai`), datasets (`load_dataset`) and pandas.
# NOTE(review): versions are unpinned and -U upgrades them, so a rerun may
# resolve to different releases than the ones this notebook was written against.
!pip install -U -q "google-generativeai" "datasets" "pandas" "ipywidgets" # ipywidgets for TQDM progress bars in datasets

In [4]:
# Snippet 1: Setup, Definitions, and Prompt Templates

import os
import json
import time
import pandas as pd
from google.colab import userdata
from google.colab import drive
import google.generativeai as genai
# from google.generativeai import types # Only if your genai version needs it for GenerationConfig
from datasets import load_dataset # If you load data in this snippet
import random
from datetime import datetime, timedelta # If you use these here

# --- Google Drive and API Key Setup ---
# Copy the Colab secret GOOGLE_API_KEY into the GEMINI_API_KEY environment variable.
# Any failure is reported with print() and execution continues without a key.
try:
    os.environ["GEMINI_API_KEY"] = userdata.get("GOOGLE_API_KEY")
    if not os.environ["GEMINI_API_KEY"]:
        raise ValueError("GOOGLE_API_KEY not found in Colab secrets.")
    print("GEMINI_API_KEY loaded.")
except Exception as e:
    print(f"Error loading GEMINI_API_KEY: {e}")
    print("Please ensure GOOGLE_API_KEY is set in Colab Secrets (sidebar -> secrets).")
    # For local testing without secrets, you might temporarily set it:
    # os.environ["GEMINI_API_KEY"] = "YOUR_ACTUAL_API_KEY_HERE"

# Mount Drive and make the AmazonFashionSFT folder the working directory.
# If anything here fails, GOOGLE_AI_STUDIO_FOLDER falls back to "." so that
# outputs land in the current (ephemeral) Colab directory instead.
try:
    drive.mount("/content/drive", force_remount=True) # force_remount can be helpful
    GOOGLE_AI_STUDIO_FOLDER = "/content/drive/MyDrive/Google AI Studio/AmazonFashionSFT" # Specific for Fashion
    os.makedirs(GOOGLE_AI_STUDIO_FOLDER, exist_ok=True)
    os.chdir(GOOGLE_AI_STUDIO_FOLDER)
    print(f"Working directory changed to: {os.getcwd()}")
except Exception as e:
    print(f"Error mounting drive or changing directory: {e}")
    print(f"Will use current Colab ephemeral storage for outputs: {os.getcwd()}")
    GOOGLE_AI_STUDIO_FOLDER = "."

# --- Configure Gemini Client ---
# A custom attribute stored on the imported genai module records that this cell
# already called genai.configure(), so re-running the cell skips the call.
try:
    # Configure once per session is usually enough
    if not getattr(genai, '_is_configured_s1_main', False): # Unique flag for this cell
        # NOTE(review): if the key failed to load above, this configures the
        # client with the literal placeholder "MISSING_KEY" rather than failing here.
        genai.configure(api_key=os.environ.get("GEMINI_API_KEY", "MISSING_KEY")) # Use .get for safety
        genai._is_configured_s1_main = True
        print("Gemini client configured in Snippet 1.")
    else:
        print("Gemini client already configured.")
except Exception as e:
    print(f"Error configuring Gemini client in Snippet 1: {e}")

# --- SFT Dataset Generation Configuration (Global Constants) ---
# Upper bound on the number of curated complaints sampled for one SFT run.
SAMPLES_TO_GENERATE_FROM_CURATED = 100
# Output folder for generated SFT files; created immediately if missing.
OUTPUT_SFT_FOLDER = os.path.join(GOOGLE_AI_STUDIO_FOLDER, "sft_output_amazon_fashion")
os.makedirs(OUTPUT_SFT_FOLDER, exist_ok=True)
# The SFT_DATASET_FILE will be specifically named for the revised run in Snippet 3
DEFAULT_SFT_FILENAME = "amazon_fashion_sft_data.jsonl" # A base name
REVISED_SFT_FILENAME = "amazon_fashion_sft_data_revised.jsonl"

# For Snippet 2 (Data Loading)
# Parquet cache of curated complaints, written and re-read by the data-loading cell.
CURATED_COMPLAINTS_FILE = os.path.join(GOOGLE_AI_STUDIO_FOLDER, "curated_fashion_complaints.parquet")
# The data-loading cell saves up to MAX_SFT_EXAMPLES * 2 rows into the curated file.
MAX_SFT_EXAMPLES = 1000 # Max examples to curate if file doesn't exist
# Passed as random_state to DataFrame.sample() in the data-loading cell.
SEED = 42
RAW_DATASET_NAME = "McAuley-Lab/Amazon-Reviews-2023"
RAW_REVIEW_CONFIG = "raw_review_Amazon_Fashion"
RAW_META_CONFIG = "raw_meta_Amazon_Fashion"
# Reviews with rating strictly below this value are treated as potential complaints.
BAD_RATING_THRESHOLD = 3.1
# Reviews whose text is shorter than this many characters are filtered out.
MIN_REVIEW_TEXT_LENGTH = 50

print(f"Output SFT files will be saved in: {OUTPUT_SFT_FOLDER}")
print(f"Curated complaints will be at: {CURATED_COMPLAINTS_FILE}")

# --- Mock Amazon Policy and Categories ---
# Policy title -> policy text; each entry is interpolated into the master prompt.
AMAZON_POLICY_DICT = {
    "Return Window": "Most items sold on Amazon.com can be returned within 30 days of delivery. Some products have different policies or requirements associated with them.",
    "Non-Returnable Items": "Some items cannot be returned, including: digital music, grocery products, customized products, and items classified as hazardous materials.",
    "Damaged or Defective": "If an item arrives damaged, defective, or is not the item you ordered, please contact Customer Service immediately for a refund or replacement. Photographic evidence may be required.",
    "Fashion Item Returns": "Fashion items (clothing, shoes, jewelry, watches) must be returned in new and unworn condition, with all original packaging, tags, and certificates of authenticity (if applicable). Items showing signs of wear or use, or those that have been altered, resized, or damaged by the customer, may not be eligible for return or may incur a restocking fee.",
    "Refund Process": "Once we receive your return, it will be processed within 5-7 business days. Refunds will be issued to the original payment method."
}
# Closed vocabularies that are joined into the JSON schema text shown to the model.
COMPLAINT_CATEGORIES = ["Sizing Issue", "Damaged Item", "Not as Described", "Shipping Problem", "Policy Inquiry", "Late Delivery", "Wrong Item Received", "Quality Issue", "Return Process Issue"]
DECISION_TYPES = ["Full_Refund_No_Return", "Full_Refund_With_Return", "Partial_Refund_No_Return", "Partial_Refund_With_Return", "Exchange_Offered", "Deny_Request_Policy_Violation", "Further_Information_Required", "Escalate_To_Human_Agent", "Provide_Policy_Information"]
EMOTIONAL_TONES_FOR_FORMAL_ANSWER = ["Empathetic_Standard", "Neutral_Direct", "Understanding_Apologetic", "Firm_Polite", "Helpful_Informative"]

# --- Constructing the JSON Schema String and the Master Prompt Template ---

# Step 1: Create the raw JSON schema string with its dynamic parts interpolated.
# This is an f-string, so literal JSON braces are written doubled ({{ and }}) and
# come out as single braces in the resulting text; single-brace expressions such as
# {_complaint_categories_str} are replaced with the joined option lists.
_complaint_categories_str = ', '.join(COMPLAINT_CATEGORIES)
_decision_types_str = ', '.join(DECISION_TYPES)
_emotional_tones_str = ', '.join(EMOTIONAL_TONES_FOR_FORMAL_ANSWER)

_raw_json_schema_text = f"""{{
  "is_actionable_complaint": "<boolean (true if the complaint describes an issue that Amazon can or should act upon based on policy and context; false if it's a general comment, question not needing action, or clearly outside policy)>",
  "complaint_category": "<choose ONE most relevant category from: {_complaint_categories_str}>",
  "complaint_summary": "<concise summary of the complaint, 1-2 sentences. If product images were analyzed and relevant, integrate visual findings briefly.>",
  "key_entities_from_complaint": ["<list of 3-5 key nouns/phrases from complaint text, e.g., 'dress too small', 'broken zipper', 'late delivery'>"],
  "image_analysis_summary": "<Based on 'Image URLs Provided by Customer' (if any): describe visual evidence relevant to the complaint. If no images were provided or they are irrelevant, state 'No relevant images provided/analyzed'. THIS FIELD IS CRITICAL.>",
  "information_completeness_assessment": {{
    "is_complete": "<boolean (is all information needed to apply policy and make a decision present in the complaint and provided context? E.g., order ID, specific defect details, desired outcome if not clear)>",
    "missing_information_prompt": "<If is_complete is false, formulate a polite and specific question to the customer to get the missing information, e.g., 'Could you please provide your order ID so I can look into this?'. Use 'NA' if is_complete is true.>"
  }},
  "decision_recommendation": "<based on policy, complaint, and history, choose ONE decision type from: {_decision_types_str}>",
  "suggested_refund_percentage": "<integer (e.g., 0, 50, 100), logically derived from the policy, complaint severity, and decision_recommendation. E.g., 100 for Full_Refund, 0 for Deny_Request.>",
  "return_instructions_if_applicable": "<specific instructions if a return is needed (e.g., 'Please use the pre-paid label sent to your email to return the item.'), or 'NA' if no return is applicable or decision is pending information.>",
  "reasoning_for_decision": "<VERY DETAILED. Explain step-by-step how you reached the decision. Explicitly cite relevant 'Amazon Return Policy Snippets' by their title (e.g., 'As per the Return Window policy...'). Connect complaint specifics, user history (if relevant), and image_analysis_summary (if relevant) to the policy and your decision. If policy is ambiguous or information is missing, explain that.>",
  "formal_answer_to_customer": "[TONE: <choose ONE tone from: {_emotional_tones_str}>] Dear Customer, ... <craft a full, polite, professional response text for the customer. This response should reflect the decision, reasoning, required actions (if any), and information completeness assessment (e.g., ask for more info if needed).>"
}}"""
print("_raw_json_schema_text prepared.")

# Step 2: Create the ESCAPED version for embedding into REVISED_MASTER_PROMPT_TEMPLATE.
# Every brace in the rendered schema text is doubled again. The master template
# is presumably filled with str.format() later (not visible in this cell), and
# str.format() collapses doubled braces back to single literal braces.
ESCAPED_JSON_SCHEMA_FOR_MASTER_PROMPT = _raw_json_schema_text.replace('{', '{{').replace('}', '}}')
print("ESCAPED_JSON_SCHEMA_FOR_MASTER_PROMPT prepared.")

# Step 3: Define REVISED_MASTER_PROMPT_TEMPLATE using the ESCAPED schema.
# This is an f-string evaluated once, here:
#   - single-brace expressions ({AMAZON_POLICY_DICT.get(...)}, the escaped schema)
#     are interpolated immediately;
#   - doubled braces ({{product_title}} etc.) are emitted as single-brace
#     placeholders, intended to be filled by a later str.format() call.
# Fix: the product title line now carries its "Product Title:" label and no longer
# leaks an HTML comment into the prompt text sent to the model.
REVISED_MASTER_PROMPT_TEMPLATE = f"""
You are an AI assistant, an Amazon Resolution Expert, tasked with analyzing Amazon Fashion customer complaints.
Your goal is to generate a perfect, gold-standard, structured JSON response to guide customer service actions.
Adhere strictly to the provided JSON schema and all guidelines.

**CONTEXT FOR ANALYSIS:**

1.  **AMAZON RETURN POLICY SNIPPETS (Simulated RAG):**
    Policy Title: Return Window
    Policy Text: "{AMAZON_POLICY_DICT.get('Return Window', 'Not Available')}"

    Policy Title: Non-Returnable Items
    Policy Text: "{AMAZON_POLICY_DICT.get('Non-Returnable Items', 'Not Available')}"

    Policy Title: Damaged or Defective
    Policy Text: "{AMAZON_POLICY_DICT.get('Damaged or Defective', 'Not Available')}"

    Policy Title: Fashion Item Returns
    Policy Text: "{AMAZON_POLICY_DICT.get('Fashion Item Returns', 'Not Available')}"

    Policy Title: Refund Process
    Policy Text: "{AMAZON_POLICY_DICT.get('Refund Process', 'Not Available')}"

2.  **DECISION GUIDELINES & REFUND RULES:**
    - Prioritize exchanges for sizing issues if the item is eligible.
    - Full refunds are typically for defective/damaged items, or items not as described where an exchange is not feasible or desired by policy.
    - Deny requests that clearly violate return policy (e.g., worn items, past return window without valid reason).
    - If information is missing to make a clear decision, request it.
    - Consider user history for context but apply policy consistently.

3.  **PRODUCT CONTEXT:**
    Product Title: {{product_title}}
    Product ASIN: {{product_asin}}
    Product Price: ${{product_price}}
    Main Category: {{product_main_category}}
    Store: {{product_store}}
    Key Features: {{product_features}}
    Average Rating: {{product_avg_rating}}/5 ({{product_rating_number}} reviews)
    Often Bought Together With (ASINs): {{product_bought_together}}

4.  **TEMPORAL CONTEXT:**
    Simulated Purchase Date: {{purchase_date_str}}
    Current Date (Review Date): {{complaint_timestamp_iso}}
    (Assess if the complaint falls within a typical 30-day return window from the simulated purchase date, considering the 'Return Window' policy.)

5.  **CUSTOMER'S INTERACTION HISTORY (Simulated MCP):**
    {{user_history_str}}

6.  **CURRENT COMPLAINT:**
    Complaint Rating Given by User: {{complaint_rating_given}}
    Complaint Title: "{{complaint_title_text}}"
    Complaint Body: "{{complaint_body_text}}"

7.  **IMAGE ANALYSIS TASK:**
    Image URLs Provided (Product Images for context): {{image_urls_str}}
    **Instruction:** If image URLs are provided AND seem relevant to understanding the product or a potential defect described, YOU MUST analyze these images (conceptually, based on URLs) and reflect your findings in the 'image_analysis_summary' field of your JSON output. If no URLs are provided, they are irrelevant (e.g., stock photos not showing a defect), or you cannot analyze them, state 'No relevant images provided/analyzed' or 'Product images reviewed, no specific defect visible relevant to complaint.'

**YOUR TASK:**
Based on ALL the information above, generate a SINGLE, VALID JSON object adhering EXACTLY to the schema below.
Do NOT include any text outside of this JSON object.

**REQUIRED JSON OUTPUT SCHEMA:**
{ESCAPED_JSON_SCHEMA_FOR_MASTER_PROMPT}

Response (JSON only):
"""
print("REVISED_MASTER_PROMPT_TEMPLATE prepared.")

# --- Define generate_sft_entry_with_gemini function ---
def generate_sft_entry_with_gemini(model_instance_for_gen, prompt_text_for_gen):
    """Send one prompt to a Gemini model and return the generated text.

    Args:
        model_instance_for_gen: Object exposing
            ``generate_content(contents=..., generation_config=...)``.
            A falsy value short-circuits to ``None``.
        prompt_text_for_gen: Fully formatted prompt text.

    Returns:
        The text of the first candidate with all of its text parts concatenated,
        or ``None`` when no model was given, the response carries no text, or
        any exception occurs (the error is printed, never raised).
    """
    if not model_instance_for_gen:
        print("Gemini model instance not provided to generation function.") # Use print for non-loop context
        return None
    try:
        # Resolve GenerationConfig without assuming where the installed SDK exposes
        # it: touching genai.GenerationConfig directly raised AttributeError on
        # versions that only provide genai.types.GenerationConfig.
        gen_config_class = getattr(genai, 'GenerationConfig', None)
        legacy_types = getattr(genai, 'types', None)
        if legacy_types is not None and hasattr(legacy_types, 'GenerationConfig'):
            gen_config_class = legacy_types.GenerationConfig
        if gen_config_class is None:
            raise AttributeError("GenerationConfig not found on genai or genai.types")

        generation_config_obj = gen_config_class(
            temperature=0.3,
            response_mime_type="text/plain"
        )
        response = model_instance_for_gen.generate_content(
            contents=prompt_text_for_gen,
            generation_config=generation_config_obj
        )

        candidates = getattr(response, 'candidates', None) if response else None
        first_content = getattr(candidates[0], 'content', None) if candidates else None
        parts = getattr(first_content, 'parts', None) if first_content is not None else None
        if parts:
            # Join every text part; returning only parts[0] truncated responses
            # that the API split across several parts.
            text_chunks = [part.text for part in parts if getattr(part, 'text', None)]
            if text_chunks:
                return "".join(text_chunks)

        feedback_msg = "N/A"
        if response and hasattr(response, 'prompt_feedback') and response.prompt_feedback:
            feedback_msg = str(response.prompt_feedback)
        elif response and hasattr(response, 'candidates') and not response.candidates:
            feedback_msg = "No candidates returned."
        print(f"Warning: No content parts in Gemini response. Feedback: {feedback_msg}")
        return None
    except Exception as exc:
        # Best-effort by design: report and return None so a batch loop can continue.
        print(f"Error during Gemini generate_content: {exc}")
        return None
print("generate_sft_entry_with_gemini function defined.")

GEMINI_API_KEY loaded.
Mounted at /content/drive
Working directory changed to: /content/drive/MyDrive/Google AI Studio/AmazonFashionSFT
Gemini client configured in Snippet 1.
Output SFT files will be saved in: /content/drive/MyDrive/Google AI Studio/AmazonFashionSFT/sft_output_amazon_fashion
Curated complaints will be at: /content/drive/MyDrive/Google AI Studio/AmazonFashionSFT/curated_fashion_complaints.parquet
_raw_json_schema_text prepared.
ESCAPED_JSON_SCHEMA_FOR_MASTER_PROMPT prepared.
REVISED_MASTER_PROMPT_TEMPLATE prepared.
generate_sft_entry_with_gemini function defined.


In [5]:
from google.colab import output
# Turn on Colab's custom widget manager (support for third-party widgets) for
# this session; called before the cells that show tqdm/datasets progress bars.
output.enable_custom_widget_manager()

In [6]:
# Snippet 2: Data Loading (Reviews and Metadata Separately)

# --- Constants for data loading ---
# Reuse whatever Snippet 1 already defined and fall back to the documented
# defaults only when a name is missing. Re-assigning the literals unconditionally
# silently discarded Snippet 1's values - most importantly the "." fallback for
# GOOGLE_AI_STUDIO_FOLDER when Drive could not be mounted, and any sample counts
# the user tuned there.

RAW_DATASET_NAME = globals().get("RAW_DATASET_NAME", "McAuley-Lab/Amazon-Reviews-2023")
RAW_REVIEW_CONFIG = globals().get("RAW_REVIEW_CONFIG", "raw_review_Amazon_Fashion")
RAW_META_CONFIG = globals().get("RAW_META_CONFIG", "raw_meta_Amazon_Fashion")
BAD_RATING_THRESHOLD = globals().get("BAD_RATING_THRESHOLD", 3.1)
MIN_REVIEW_TEXT_LENGTH = globals().get("MIN_REVIEW_TEXT_LENGTH", 50)

# Constants that were originally in Snippet 1 and are needed here:
GOOGLE_AI_STUDIO_FOLDER = globals().get(
    "GOOGLE_AI_STUDIO_FOLDER",
    "/content/drive/MyDrive/Google AI Studio/AmazonFashionSFT",  # Or your defined path
)
CURATED_COMPLAINTS_FILE = globals().get(
    "CURATED_COMPLAINTS_FILE",
    os.path.join(GOOGLE_AI_STUDIO_FOLDER, "curated_fashion_complaints.parquet"),
)
MAX_SFT_EXAMPLES = globals().get("MAX_SFT_EXAMPLES", 1000) # As defined in Snippet 1
SEED = globals().get("SEED", 42) # As defined in Snippet 1
SAMPLES_TO_GENERATE_FROM_CURATED = globals().get("SAMPLES_TO_GENERATE_FROM_CURATED", 100) # As defined in Snippet 1


print("\n--- Step 1: Loading Raw Review Data & Curating Complaints ---")

# Load full review dataset
# On any failure the error is printed and review_dataset_raw is set to None,
# which leaves df_reviews_full as an empty DataFrame below.
try:
    print(f"Loading reviews from '{RAW_DATASET_NAME}', config '{RAW_REVIEW_CONFIG}'...")
    # NOTE(review): trust_remote_code=True permits the dataset repository's own
    # loading code to run in this runtime - confirm this is acceptable and still
    # supported by the installed `datasets` version.
    review_dataset_raw = load_dataset(RAW_DATASET_NAME, RAW_REVIEW_CONFIG, split='full', trust_remote_code=True)
    print(f"Successfully loaded {len(review_dataset_raw)} raw reviews.")
except Exception as e:
    print(f"Failed to load review dataset: {e}")
    review_dataset_raw = None

# Start from an empty frame so later cells can test `.empty` even if loading failed.
df_reviews_full = pd.DataFrame()
if review_dataset_raw:
    # Materialises the whole split as a pandas DataFrame in memory.
    df_reviews_full = review_dataset_raw.to_pandas()
    print(f"Converted raw reviews to Pandas DataFrame with shape: {df_reviews_full.shape}")

# Initialize df_complaints_for_sft to an empty DataFrame
df_complaints_for_sft = pd.DataFrame()


def _save_curated_and_sample(filtered_df, n_to_save, saved_verb):
    """Persist a seeded sample of complaints and draw the SFT subset from it.

    Args:
        filtered_df: DataFrame of candidate complaints.
        n_to_save: Number of rows to sample and write to CURATED_COMPLAINTS_FILE.
        saved_verb: Leading word of the log line ("Saved" or "Re-saved").

    Returns:
        Tuple ``(saved_df, sft_sample_df)``: the rows written to disk and the
        subset (at most SAMPLES_TO_GENERATE_FROM_CURATED rows) for this run.
    """
    saved_df = filtered_df.sample(n_to_save, random_state=SEED)
    saved_df.to_parquet(CURATED_COMPLAINTS_FILE)
    print(f"{saved_verb} {len(saved_df)} curated complaints to {CURATED_COMPLAINTS_FILE}")
    sft_sample_df = saved_df.sample(min(len(saved_df), SAMPLES_TO_GENERATE_FROM_CURATED), random_state=SEED)
    return saved_df, sft_sample_df


# Apply your curation logic
if not df_reviews_full.empty:
    # A complaint is a low-rated review whose text is long enough to be informative.
    df_complaints_filtered = df_reviews_full[
        (df_reviews_full['rating'] < BAD_RATING_THRESHOLD) &
        (df_reviews_full['text'].astype(str).str.len() >= MIN_REVIEW_TEXT_LENGTH)
    ].copy()
    print(f"Found {len(df_complaints_filtered)} potential complaints after filtering.")

    # Ensure the target directory for CURATED_COMPLAINTS_FILE exists
    os.makedirs(os.path.dirname(CURATED_COMPLAINTS_FILE), exist_ok=True)

    # Logic for saving/loading curated complaints
    curated_file_missing = not os.path.exists(CURATED_COMPLAINTS_FILE)
    # The size is only inspected when the file exists and there is data to write.
    curated_file_empty_and_refillable = (
        not curated_file_missing
        and len(df_complaints_filtered) > 0
        and os.path.getsize(CURATED_COMPLAINTS_FILE) == 0
    )

    if curated_file_missing or curated_file_empty_and_refillable:
        # Create or overwrite if file doesn't exist, or if it exists but is empty and we have new filtered complaints
        print(f"'{CURATED_COMPLAINTS_FILE}' does not exist or is empty, and new complaints are available. Creating/Overwriting...")
        num_to_sample_for_saving = min(len(df_complaints_filtered), MAX_SFT_EXAMPLES * 2)
        if num_to_sample_for_saving > 0:
            df_complaints_sample_to_save, df_complaints_for_sft = _save_curated_and_sample(
                df_complaints_filtered, num_to_sample_for_saving, "Saved"
            )
        else:
            print("No complaints to sample after filtering. Curated file not created/updated.")
    else:
        # The file exists (otherwise the branch above would have been taken).
        print(f"Loading existing curated complaints from {CURATED_COMPLAINTS_FILE}")
        try:
            df_complaints_loaded = pd.read_parquet(CURATED_COMPLAINTS_FILE)
            if not df_complaints_loaded.empty:
                df_complaints_for_sft = df_complaints_loaded.sample(min(len(df_complaints_loaded), SAMPLES_TO_GENERATE_FROM_CURATED), random_state=SEED)
            else:
                print(f"Loaded curated complaints file '{CURATED_COMPLAINTS_FILE}' is empty.")
        except Exception as e:
            print(f"Error loading curated complaints file '{CURATED_COMPLAINTS_FILE}': {e}. Will attempt to regenerate if possible.")
            # Fallback: if loading fails and we have filtered data, try to regenerate
            if len(df_complaints_filtered) > 0:
                num_to_sample_for_saving = min(len(df_complaints_filtered), MAX_SFT_EXAMPLES * 2)
                df_complaints_sample_to_save, df_complaints_for_sft = _save_curated_and_sample(
                    df_complaints_filtered, num_to_sample_for_saving, "Re-saved"
                )

    if not df_complaints_for_sft.empty:
        print(f"Using {len(df_complaints_for_sft)} complaint samples for SFT data generation.")
    else:
        print("No complaint samples available for SFT data generation after curation/loading process.")

else:
    print("Review DataFrame (df_reviews_full) is empty. Cannot proceed with complaint curation.")


print("\n--- Step 2: Loading Raw Metadata ---")
# Stays an empty DataFrame if loading fails or 'parent_asin' is missing, so
# later cells can test `.empty` to decide whether a merge is possible.
all_meta_df_unique = pd.DataFrame() # Initialize
try:
    print(f"Loading metadata from '{RAW_DATASET_NAME}', config '{RAW_META_CONFIG}'...")
    # NOTE(review): trust_remote_code=True permits the dataset repository's own
    # loading code to run in this runtime - confirm this is acceptable.
    meta_dataset_raw = load_dataset(RAW_DATASET_NAME, RAW_META_CONFIG, split='full', trust_remote_code=True)
    all_meta_df = meta_dataset_raw.to_pandas()
    print(f"Successfully loaded {len(all_meta_df)} metadata entries. Shape: {all_meta_df.shape}")

    if 'parent_asin' in all_meta_df.columns:
        # Keep the first metadata row per parent_asin so a later merge on this
        # key cannot duplicate complaint rows.
        all_meta_df_unique = all_meta_df.drop_duplicates(subset=['parent_asin'], keep='first')
        print(f"Unique metadata entries by parent_asin: {len(all_meta_df_unique)}")
    else:
        print("Error: 'parent_asin' column not found in metadata. Cannot deduplicate or merge effectively.")

except Exception as e:
    # Best-effort: report the failure and continue with the empty frame.
    print(f"Failed to load metadata dataset: {e}")


--- Step 1: Loading Raw Review Data & Curating Complaints ---
Loading reviews from 'McAuley-Lab/Amazon-Reviews-2023', config 'raw_review_Amazon_Fashion'...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Successfully loaded 2500939 raw reviews.
Converted raw reviews to Pandas DataFrame with shape: (2500939, 10)
Found 549327 potential complaints after filtering.
Loading existing curated complaints from /content/drive/MyDrive/Google AI Studio/AmazonFashionSFT/curated_fashion_complaints.parquet
Using 100 complaint samples for SFT data generation.

--- Step 2: Loading Raw Metadata ---
Loading metadata from 'McAuley-Lab/Amazon-Reviews-2023', config 'raw_meta_Amazon_Fashion'...
Successfully loaded 826108 metadata entries. Shape: (826108, 16)
Unique metadata entries by parent_asin: 826108


Support for third party widgets will remain active for the duration of the session. To disable support:

In [None]:
# Snippet 3: Merging Data and SFT Generation (Revised with DeepSeek Feedback)

import pandas as pd
import json
import time
import random
import os
from tqdm.auto import tqdm
import google.generativeai as genai
from datetime import datetime, timedelta # For temporal context

# --- Ensure Gemini is configured (ideally done in Snippet 1) ---
# Configure only when a key is present in the environment and it is not the
# documented placeholder string. A custom attribute stored on the genai module
# marks that this cell already configured the client, so re-running it is a no-op.
# NOTE(review): this flag is distinct from the one Snippet 1 sets, so the first
# run of this cell calls genai.configure() again even if Snippet 1 already did.
if "GEMINI_API_KEY" in os.environ and os.environ["GEMINI_API_KEY"] != "YOUR_API_KEY_HERE_IF_NOT_IN_SECRETS":
    try:
        if not getattr(genai, '_is_configured_s3_revised', False):
            genai.configure(api_key=os.environ["GEMINI_API_KEY"])
            genai._is_configured_s3_revised = True
            print("Gemini configured in Snippet 3 (Revised).")
    except Exception as e:
        # Report and continue; later steps check for a usable model instance.
        print(f"Error configuring Gemini in Snippet 3 (Revised): {e}")
else:
    print("GEMINI_API_KEY not available for configuration in Snippet 3 (Revised).")

# --- Retrieve necessary variables from global scope (set in Snippet 1 & 2) ---
# Each lookup falls back to an empty value so this cell still runs (and skips the
# downstream work) when an earlier snippet was not executed.
df_complaints_for_sft = globals().get('df_complaints_for_sft', pd.DataFrame())
all_meta_df_unique = globals().get('all_meta_df_unique', pd.DataFrame())
df_reviews_full = globals().get('df_reviews_full', pd.DataFrame())

# --- Constants from Snippet 1 needed for SFT generation loop ---
# (These were defined in Snippet 1 and are expected to be in globals())
AMAZON_POLICY_DICT = globals().get("AMAZON_POLICY_DICT", {})
COMPLAINT_CATEGORIES = globals().get("COMPLAINT_CATEGORIES", [])
DECISION_TYPES = globals().get("DECISION_TYPES", [])
EMOTIONAL_TONES_FOR_FORMAL_ANSWER = globals().get("EMOTIONAL_TONES_FOR_FORMAL_ANSWER", [])

SAMPLES_TO_GENERATE_FROM_CURATED = globals().get("SAMPLES_TO_GENERATE_FROM_CURATED", 0)
SFT_DATASET_FILE = globals().get("SFT_DATASET_FILE", "default_sft_output_revised.jsonl") # New filename
OUTPUT_SFT_FOLDER = globals().get("OUTPUT_SFT_FOLDER", ".")
if SFT_DATASET_FILE == "default_sft_output_revised.jsonl": # Update if default was used
    # Honor REVISED_SFT_FILENAME from Snippet 1 when present instead of repeating
    # the literal here; the fallback keeps the previous file name.
    SFT_DATASET_FILE = os.path.join(
        OUTPUT_SFT_FOLDER,
        globals().get("REVISED_SFT_FILENAME", "amazon_fashion_sft_data_revised.jsonl"),
    )


# --- Revised EXPECTED_JSON_SCHEMA_FOR_PROMPT and MASTER_PROMPT_TEMPLATE ---
# (This should be defined in Snippet 1, retrieving from globals here for robustness)
_raw_json_schema_text_from_globals = globals().get("_raw_json_schema_text", "")
if not _raw_json_schema_text_from_globals:
    # Without the schema the prompt below would silently ask for JSON with no
    # schema at all; make that visible instead of generating degraded data.
    print("Warning: '_raw_json_schema_text' not found in globals (run Snippet 1 first). The prompt will contain an EMPTY JSON schema.")
# Double every brace so a later str.format() on the template restores them as literals.
ESCAPED_JSON_SCHEMA_FOR_MASTER_PROMPT = _raw_json_schema_text_from_globals.replace('{', '{{').replace('}', '}}')

# --- New MASTER_PROMPT_TEMPLATE incorporating DeepSeek feedback ---
# f-string evaluated once here: single-brace expressions are interpolated now,
# doubled braces ({{product_title}} etc.) become single-brace placeholders.
REVISED_MASTER_PROMPT_TEMPLATE = f"""
You are an AI assistant, an Amazon Resolution Expert, tasked with analyzing Amazon Fashion customer complaints.
Your goal is to generate a perfect, gold-standard, structured JSON response to guide customer service actions.
Adhere strictly to the provided JSON schema and all guidelines.

**CONTEXT FOR ANALYSIS:**

1.  **AMAZON RETURN POLICY SNIPPETS (Simulated RAG):**
    Policy Title: Return Window
    Policy Text: "{AMAZON_POLICY_DICT.get('Return Window', 'Not Available')}"

    Policy Title: Non-Returnable Items
    Policy Text: "{AMAZON_POLICY_DICT.get('Non-Returnable Items', 'Not Available')}"

    Policy Title: Damaged or Defective
    Policy Text: "{AMAZON_POLICY_DICT.get('Damaged or Defective', 'Not Available')}"

    Policy Title: Fashion Item Returns
    Policy Text: "{AMAZON_POLICY_DICT.get('Fashion Item Returns', 'Not Available')}"

    Policy Title: Refund Process
    Policy Text: "{AMAZON_POLICY_DICT.get('Refund Process', 'Not Available')}"

2.  **DECISION GUIDELINES & REFUND RULES:**
    - Prioritize exchanges for sizing issues if the item is eligible.
    - Full refunds are typically for defective/damaged items, or items not as described where an exchange is not feasible or desired by policy.
    - Deny requests that clearly violate return policy (e.g., worn items, past return window without valid reason).
    - If information is missing to make a clear decision, request it.
    - Consider user history for context but apply policy consistently.

3.  **PRODUCT CONTEXT:**
    Product Title: {{product_title}}
    Product ASIN: {{product_asin}}
    Product Price: ${{product_price}}
    Main Category: {{product_main_category}}
    Store: {{product_store}}
    Key Features: {{product_features}}
    Average Rating: {{product_avg_rating}}/5 ({{product_rating_number}} reviews)
    Often Bought Together With (ASINs): {{product_bought_together}}

4.  **TEMPORAL CONTEXT:**
    Simulated Purchase Date: {{purchase_date_str}}
    Current Date (Review Date): {{complaint_timestamp_iso}}
    (Assess if the complaint falls within a typical 30-day return window from the simulated purchase date, considering the 'Return Window' policy.)

5.  **CUSTOMER'S INTERACTION HISTORY (Simulated MCP):**
    {{user_history_str}}

6.  **CURRENT COMPLAINT:**
    Complaint Rating Given by User: {{complaint_rating_given}}
    Complaint Title: "{{complaint_title_text}}"
    Complaint Body: "{{complaint_body_text}}"

7.  **IMAGE ANALYSIS TASK:**
    Image URLs Provided (Product Images for context): {{image_urls_str}}
    **Instruction:** If image URLs are provided AND seem relevant to understanding the product or a potential defect described, YOU MUST analyze these images (conceptually, based on URLs) and reflect your findings in the 'image_analysis_summary' field of your JSON output. If no URLs are provided, they are irrelevant (e.g., stock photos not showing a defect), or you cannot analyze them, state 'No relevant images provided/analyzed' or 'Product images reviewed, no specific defect visible relevant to complaint.'

**YOUR TASK:**
Based on ALL the information above, generate a SINGLE, VALID JSON object adhering EXACTLY to the schema below.
Do NOT include any text outside of this JSON object.

**REQUIRED JSON OUTPUT SCHEMA:**
{ESCAPED_JSON_SCHEMA_FOR_MASTER_PROMPT}

Response (JSON only):
"""

# --- Define generate_sft_entry_with_gemini if not already defined ---
if 'generate_sft_entry_with_gemini' not in globals():
    print("Redefining 'generate_sft_entry_with_gemini' in Snippet 3 (Revised) for safety.")
    def generate_sft_entry_with_gemini(model_instance_for_gen, prompt_text_for_gen):
        """Send one prompt to a Gemini model and return the generated text.

        Fallback definition used only when no function of this name exists yet.
        Messages go through ``tqdm.write`` so they do not break progress bars.

        Args:
            model_instance_for_gen: Object exposing
                ``generate_content(contents=..., generation_config=...)``.
                A falsy value short-circuits to ``None``.
            prompt_text_for_gen: Fully formatted prompt text.

        Returns:
            The first candidate's text parts concatenated, or ``None`` when no
            model was given, the response has no text, or an exception occurs.
        """
        if not model_instance_for_gen:
            tqdm.write("Gemini model instance not provided to generation function.")
            return None
        try:
            # Resolve GenerationConfig without assuming where the installed SDK
            # exposes it; accessing genai.GenerationConfig directly raised
            # AttributeError when only genai.types.GenerationConfig exists.
            gen_config_class = getattr(genai, 'GenerationConfig', None)
            legacy_types = getattr(genai, 'types', None)
            if legacy_types is not None and hasattr(legacy_types, 'GenerationConfig'):
                gen_config_class = legacy_types.GenerationConfig
            if gen_config_class is None:
                raise AttributeError("GenerationConfig not found on genai or genai.types")
            generation_config_obj = gen_config_class(temperature=0.3, response_mime_type="text/plain")
            response = model_instance_for_gen.generate_content(
                contents=prompt_text_for_gen,
                generation_config=generation_config_obj
            )
            candidates = getattr(response, 'candidates', None) if response else None
            first_content = getattr(candidates[0], 'content', None) if candidates else None
            parts = getattr(first_content, 'parts', None) if first_content is not None else None
            if parts:
                # Join every text part; returning only parts[0] truncated
                # responses that were split across several parts.
                text_chunks = [part.text for part in parts if getattr(part, 'text', None)]
                if text_chunks:
                    return "".join(text_chunks)
            feedback_msg = "N/A"
            if response and hasattr(response, 'prompt_feedback') and response.prompt_feedback:
                feedback_msg = str(response.prompt_feedback)
            elif response and hasattr(response, 'candidates') and not response.candidates:
                feedback_msg = "No candidates returned."
            tqdm.write(f"Warning: No content parts in Gemini response. Feedback: {feedback_msg}")
            return None
        except Exception as exc:
            # Best-effort by design: report and return None so the batch loop continues.
            tqdm.write(f"Error during Gemini generate_content: {exc}")
            return None

print("\n--- Step 3: Merging Curated Complaints with Metadata (Revised) ---")
# Left-join the curated complaints onto the de-duplicated product metadata.
# merged_df_for_sft stays empty when inputs or key columns are missing.
merged_df_for_sft = pd.DataFrame()
review_merge_key = None
meta_parent_asin_col_in_merged = 'parent_asin'

if not df_complaints_for_sft.empty and not all_meta_df_unique.empty:
    # Prefer 'parent_asin' on the review side, fall back to 'asin'.
    if 'parent_asin' in df_complaints_for_sft.columns:
        review_merge_key = 'parent_asin'
    elif 'asin' in df_complaints_for_sft.columns:
        print("Using 'asin' from reviews as merge key for metadata.")
        review_merge_key = 'asin'
    else:
        print("Error: Neither 'parent_asin' nor 'asin' found in curated complaints DataFrame. Cannot merge.")

    if review_merge_key and 'parent_asin' in all_meta_df_unique.columns:
        # indicator=True adds a temporary '_merge' column that tells which rows
        # found a metadata match. It is needed because, when both sides join on
        # the same column name, the key column is shared and is non-null for
        # every row, so counting non-null keys over-reports successful matches.
        merged_df_for_sft = pd.merge(
            df_complaints_for_sft, all_meta_df_unique,
            left_on=review_merge_key, right_on='parent_asin',
            how='left', suffixes=('_review', '_meta'), indicator=True,
        )
        metadata_match_mask = merged_df_for_sft['_merge'] == 'both'
        # Drop the helper column so the merged frame keeps its original schema.
        merged_df_for_sft = merged_df_for_sft.drop(columns='_merge')
        print(f"Merged DataFrame for SFT shape: {merged_df_for_sft.shape}")
        print("Columns in merged_df_for_sft:", merged_df_for_sft.columns.tolist())

        # Work out which column of the merged frame holds the metadata parent_asin.
        if review_merge_key == 'parent_asin' and 'parent_asin_meta' in merged_df_for_sft.columns:
            meta_parent_asin_col_in_merged = 'parent_asin_meta'
        elif 'parent_asin' in merged_df_for_sft.columns:
            meta_parent_asin_col_in_merged = 'parent_asin'
        else:
            print(f"Warning: Could not definitively identify metadata's parent_asin column. Defaulting to '{meta_parent_asin_col_in_merged}'.")

        if meta_parent_asin_col_in_merged not in merged_df_for_sft.columns:
            print(f"Error: Identified metadata parent_asin column ('{meta_parent_asin_col_in_merged}') not found!")
        else:
            print(f"Using '{meta_parent_asin_col_in_merged}' as the metadata parent_asin column.")
            num_successfully_merged = metadata_match_mask.sum()
            print(f"Number of complaints successfully merged with metadata: {num_successfully_merged}")
            # Rows lacking a metadata title or parent_asin are unusable for prompting.
            critical_meta_cols = ['title_meta', meta_parent_asin_col_in_merged]
            if not all(col in merged_df_for_sft.columns for col in critical_meta_cols):
                print(f"Warning: Not all critical_meta_cols ({critical_meta_cols}) found. Skipping dropna.")
            else:
                merged_df_for_sft.dropna(subset=critical_meta_cols, inplace=True)
                print(f"Shape after dropping rows with missing critical metadata: {merged_df_for_sft.shape}")
    else:
        print("Cannot merge due to missing key columns or empty input DataFrames.")
else:
    print("Curated complaints (df_complaints_for_sft) or unique metadata (all_meta_df_unique) is empty. Skipping merge.")

def get_user_history_str(user_id, current_complaint_timestamp, full_reviews_df, max_history=2):
    """Summarise a user's earlier reviews as a short text block for the prompt.

    Args:
        user_id: Value compared against the ``user_id`` column of ``full_reviews_df``.
        current_complaint_timestamp: Timestamp of the complaint being processed,
            parsed as epoch milliseconds. Only reviews strictly older than it are
            reported.
        full_reviews_df: DataFrame of reviews. It must provide ``user_id`` and
            ``timestamp`` columns (the latter parsed as epoch milliseconds);
            ``title``, ``rating`` and ``asin`` are optional and shown as ``N/A``
            when absent.
        max_history: Maximum number of prior reviews to list, most recent first.

    Returns:
        str: Either a header line followed by one bullet per prior review, or a
        one-line explanation of why no history could be produced.
    """
    # Both columns are indexed below; without either one the lookup cannot run,
    # so report it as unavailable instead of raising KeyError.
    required_columns = {'user_id', 'timestamp'}
    if full_reviews_df.empty or not required_columns.issubset(full_reviews_df.columns):
        return "User history lookup unavailable (no full review data)."

    user_reviews = full_reviews_df[full_reviews_df['user_id'] == user_id].copy()
    if user_reviews.empty:
        return "No prior complaint history found for this user in our records."

    # Unparseable timestamps become NaT and therefore never compare as "older".
    user_reviews['timestamp_dt'] = pd.to_datetime(user_reviews['timestamp'], unit='ms', errors='coerce')
    current_ts_dt = pd.to_datetime(current_complaint_timestamp, unit='ms', errors='coerce')
    if pd.isna(current_ts_dt):
        return "Current complaint timestamp invalid, cannot reliably fetch history."

    past_reviews = user_reviews[user_reviews['timestamp_dt'] < current_ts_dt].sort_values(by='timestamp_dt', ascending=False)
    if past_reviews.empty:
        return "No prior complaint history found for this user (older than current complaint)."

    history_lines = [f"User has {len(past_reviews)} prior review(s) on record (showing up to {max_history}):"]
    for _, row_hist in past_reviews.head(max_history).iterrows():
        review_dt = row_hist.get('timestamp_dt', pd.NaT)
        review_date = review_dt.strftime('%Y-%m-%d') if pd.notna(review_dt) else 'N/A'
        history_lines.append(
            f"- Review Title: \"{row_hist.get('title', 'N/A')}\", Rating: {row_hist.get('rating', 'N/A')}, "
            f"Product ASIN: {row_hist.get('asin', 'N/A')} (Timestamp: {review_date})"
        )
    return "\n".join(history_lines).strip()

# --- Run state for the SFT generation pass ---
# Collected output records, the wall-clock start used by the summary, and the
# success/failure tallies updated inside the generation loop.
sft_generated_data = []
total_start_time = time.time()
successful_generations = failed_generations = 0
gemini_model_instance = None

# Build the Gemini model handle only when a real key is configured: an absent
# variable and the placeholder value both count as "not configured".
if os.environ.get("GEMINI_API_KEY") in (None, "YOUR_API_KEY_HERE_IF_NOT_IN_SECRETS"):
    print("GEMINI_API_KEY not configured properly. Gemini calls will be skipped.")
else:
    try:
        MODEL_TO_USE = "gemini-2.5-flash-preview-04-17"
        gemini_model_instance = genai.GenerativeModel(MODEL_TO_USE)
        print(f"Gemini model instance '{MODEL_TO_USE}' initialized.")
    except Exception as init_error:
        # Best-effort: leave gemini_model_instance as None so generation is skipped below.
        print(f"Error initializing Gemini model: {init_error}")

# Early warnings about prerequisites defined in earlier cells.
if not (REVISED_MASTER_PROMPT_TEMPLATE and ESCAPED_JSON_SCHEMA_FOR_MASTER_PROMPT):
    print("ERROR: REVISED_MASTER_PROMPT_TEMPLATE or its schema component is empty. Ensure Snippet 1 has run successfully.")
if SAMPLES_TO_GENERATE_FROM_CURATED == 0:
    print("Warning: SAMPLES_TO_GENERATE_FROM_CURATED is 0.")

def _strip_markdown_code_fence(raw_text):
    """Return ``raw_text`` with a surrounding Markdown code fence removed, if present.

    An opening "```" or "```json" marker is dropped, and the closing "```" is
    dropped only when it is actually there, so an unterminated fence no longer
    costs the payload its last three characters.
    """
    cleaned = raw_text.strip()
    if not cleaned.startswith("```"):
        return cleaned
    cleaned = cleaned[3:]
    if cleaned.startswith("json"):
        cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def _is_missing(value):
    """True for scalar nulls (None/NaN/NaT); containers are never treated as missing."""
    # pd.isna() returns an array for list-like input, so only ask it about scalars.
    return bool(pd.api.types.is_scalar(value) and pd.isna(value))


def _first_available(row, column_names, default=None):
    """Return the first non-null value of ``row`` among ``column_names``, else ``default``."""
    for column_name in column_names:
        if column_name in row.index:
            value = row[column_name]
            if not _is_missing(value):
                return value
    return default


def _coerce_to_list(value):
    """Return ``value`` as a list when it is list-like (list, tuple, array), else ``[]``."""
    if isinstance(value, (list, tuple)):
        return list(value)
    to_list = getattr(value, "tolist", None)  # e.g. numpy arrays / pandas Series
    if callable(to_list):
        converted = to_list()
        if isinstance(converted, list):  # numpy scalars return a bare scalar here
            return converted
    return []


def _join_list_field(value, empty_text="Not available"):
    """Render a list-like cell as a comma-separated string, or ``empty_text`` if empty."""
    items = _coerce_to_list(value)
    # str() keeps the join from raising when an item is not a string.
    return ", ".join(str(item) for item in items) if items else empty_text


def _collect_large_image_urls(row, max_images=3):
    """Return up to ``max_images`` URLs from the 'large' entry of the first dict-valued image column."""
    for column_name in ('images_meta', 'images_review', 'images'):
        candidate = row[column_name] if column_name in row.index else None
        if isinstance(candidate, dict):
            large_urls = _coerce_to_list(candidate.get('large'))
            return [url for url in large_urls if isinstance(url, str) and url][:max_images]
    return []


# Generate one SFT record per curated complaint: build the prompt from the merged
# row, call Gemini, and store either the parsed JSON or the raw text on parse failure.
if not merged_df_for_sft.empty and gemini_model_instance and REVISED_MASTER_PROMPT_TEMPLATE and ESCAPED_JSON_SCHEMA_FOR_MASTER_PROMPT and SAMPLES_TO_GENERATE_FROM_CURATED > 0:
    num_samples_to_process = min(len(merged_df_for_sft), SAMPLES_TO_GENERATE_FROM_CURATED)
    print(f"\nAttempting to generate {num_samples_to_process} SFT entries using REVISED prompt...")

    # Loop-invariant: which columns carry the product identifier in the merged frame.
    current_review_merge_key = review_merge_key if review_merge_key else 'parent_asin'
    current_meta_parent_asin_col = meta_parent_asin_col_in_merged if meta_parent_asin_col_in_merged else 'parent_asin'

    for _row_label, row in tqdm(merged_df_for_sft.head(num_samples_to_process).iterrows(), total=num_samples_to_process, desc="Generating Revised SFT Entries"):
        user_id = row.get('user_id', 'UnknownUser')
        current_timestamp_val = _first_available(row, ['timestamp_review', 'timestamp'])
        user_history_str = get_user_history_str(user_id, current_timestamp_val, df_reviews_full)

        # Temporal context (timestamps are parsed as epoch milliseconds).
        complaint_datetime = pd.to_datetime(current_timestamp_val, unit='ms', errors='coerce')
        complaint_timestamp_iso = complaint_datetime.isoformat() if pd.notna(complaint_datetime) else 'N/A'
        # Simulate purchase date as 1 to 30 days before review
        simulated_purchase_datetime = complaint_datetime - timedelta(days=random.randint(1, 30)) if pd.notna(complaint_datetime) else pd.NaT
        purchase_date_str = simulated_purchase_datetime.strftime('%Y-%m-%d') if pd.notna(simulated_purchase_datetime) else 'Unknown'

        # Image URLs (limited to 3 images).
        product_image_urls = _collect_large_image_urls(row, max_images=3)
        image_urls_str = ", ".join(product_image_urls) if product_image_urls else "No product images available"

        # Additional product context.
        product_features_str = _join_list_field(row.get('features', []))
        product_bought_together_str = _join_list_field(row.get('bought_together', []))

        # Null cells fall back to the same defaults as absent columns, so the
        # prompt never contains a literal "nan"/"None".
        filled_prompt = REVISED_MASTER_PROMPT_TEMPLATE.format(
            user_history_str=user_history_str,
            product_title=_first_available(row, ['title_meta', 'title_review'], 'N/A'),
            product_asin=_first_available(row, [current_meta_parent_asin_col, current_review_merge_key], 'N/A'),
            product_price=_first_available(row, ['price_meta', 'price'], 'N/A'),
            product_main_category=_first_available(row, ['main_category_meta', 'main_category'], 'N/A'),
            product_store=_first_available(row, ['store_meta', 'store'], 'N/A'),
            product_features=product_features_str,
            product_avg_rating=_first_available(row, ['average_rating'], '?'),
            product_rating_number=_first_available(row, ['rating_number'], 0),
            product_bought_together=product_bought_together_str,
            purchase_date_str=purchase_date_str,
            complaint_timestamp_iso=complaint_timestamp_iso,
            complaint_rating_given=_first_available(row, ['rating_review', 'rating'], 'N/A'),
            complaint_title_text=_first_available(row, ['title_review', 'title'], 'N/A'),
            complaint_body_text=_first_available(row, ['text_review', 'text'], ''),
            image_urls_str=image_urls_str
        )

        generated_text = generate_sft_entry_with_gemini(gemini_model_instance, filled_prompt)
        if generated_text:
            clean_text = _strip_markdown_code_fence(generated_text)
            try:
                parsed_json_output = json.loads(clean_text)
                sft_generated_data.append({"id": f"sft_sample_revised_{row.name}", "input_prompt_to_gemini": filled_prompt, "gemini_json_output": parsed_json_output})
                successful_generations += 1
            except json.JSONDecodeError as json_e:
                failed_generations += 1
                tqdm.write(f"Failed to parse JSON for sample (Index: {row.name}): {json_e}")
                # Keep the raw reply so failed samples can be inspected or repaired later.
                sft_generated_data.append({"id": f"sft_sample_revised_{row.name}_PARSE_ERROR", "input_prompt_to_gemini": filled_prompt, "gemini_raw_output": clean_text})
        else:
            # A falsy return from the helper is counted as a failed generation.
            failed_generations += 1
            tqdm.write(f"Gemini returned no text for sample (Index: {row.name}).")
        # Randomised pause between API calls.
        time.sleep(random.uniform(0.5, 1.5))
else:
    # Print reasons for skipping
    if not REVISED_MASTER_PROMPT_TEMPLATE or not ESCAPED_JSON_SCHEMA_FOR_MASTER_PROMPT: print("ERROR: REVISED_MASTER_PROMPT_TEMPLATE or its schema component is empty.")
    if SAMPLES_TO_GENERATE_FROM_CURATED == 0: print("Warning: SAMPLES_TO_GENERATE_FROM_CURATED is 0.")
    if merged_df_for_sft.empty: print("merged_df_for_sft is empty.")
    if not gemini_model_instance: print("Gemini model instance is not initialized.")
    print("\nSkipping SFT generation due to missing data, uninitialized model, missing prompt template, or 0 samples to generate.")

# --- Summary of the generation run ---
total_time_taken = time.time() - total_start_time
print("\n--- SFT Data Generation Summary (Revised Prompt) ---")
print(f"Total time taken: {total_time_taken:.2f} seconds")
print(f"Successfully generated entries: {successful_generations}")
print(f"Failed generations/parses: {failed_generations}")

# --- Persist the records as JSON Lines and preview the first one ---
if sft_generated_data:
    print(f"\nSaving {len(sft_generated_data)} generated SFT entries to {SFT_DATASET_FILE}...")
    # os.makedirs('') raises FileNotFoundError, so only create a directory when
    # the target path actually has a directory component.
    output_directory = os.path.dirname(SFT_DATASET_FILE)
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)
    # Explicit encoding keeps the file identical regardless of the platform default.
    with open(SFT_DATASET_FILE, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(entry) + '\n' for entry in sft_generated_data)
    print("SFT data saved.")

    print("\n--- First generated SFT entry (Revised Prompt) ---")
    entry = sft_generated_data[0]
    print("Input Prompt (truncated):")
    prompt_display = str(entry.get("input_prompt_to_gemini", ""))
    print(prompt_display[:1000] + "...") # Print more of the prompt
    print("\nGemini Output (JSON or Raw on error):")
    # Parse-error records carry the raw reply instead of parsed JSON.
    if "gemini_json_output" in entry:
        print(json.dumps(entry["gemini_json_output"], indent=2))
    else:
        print(entry.get("gemini_raw_output", "Error: No output recorded"))
    print("-" * 50)
else:
    print("\nNo SFT data was generated to save or print (Revised Prompt).")
print(f"\nCheck the folder '{OUTPUT_SFT_FOLDER}' for the output file: {SFT_DATASET_FILE}")

Gemini configured in Snippet 3 (Revised).

--- Step 3: Merging Curated Complaints with Metadata (Revised) ---
Merged DataFrame for SFT shape: (100, 25)
Columns in merged_df_for_sft: ['rating', 'title_review', 'text', 'images_review', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase', 'main_category', 'title_meta', 'average_rating', 'rating_number', 'features', 'description', 'price', 'images_meta', 'videos', 'store', 'categories', 'details', 'bought_together', 'subtitle', 'author']
Using 'parent_asin' as the metadata parent_asin column.
Number of complaints successfully merged with metadata: 100
Shape after dropping rows with missing critical metadata: (100, 25)
Gemini model instance 'gemini-2.5-flash-preview-04-17' initialized.

Attempting to generate 100 SFT entries using REVISED prompt...


Generating Revised SFT Entries:   0%|          | 0/100 [00:00<?, ?it/s]