# Gathering Data

In [None]:
from time import sleep

import pandas as pd
from dotenv import load_dotenv

load_dotenv()

# Profiles to run through the model. The 'Unnamed: 0' column is dropped when
# the file has one; errors='ignore' makes the drop a no-op otherwise.
profiles_for_processing = (
    pd.read_csv('../data/data_for_report/profiles_for_processing.csv')
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Companion table read from relevant_questions.csv (presumably the survey
# question metadata — verify against the file).
relevant_questions = pd.read_csv('../data/data_for_report/relevant_questions.csv')

In [None]:
# profiles_for_processing['id'] = profiles_for_processing.index
# Model input: every profile column except the true age.
rows = profiles_for_processing.drop(columns=['d_age'])
# Ground truth: the true age of each profile, keyed by row_index.
ages = profiles_for_processing[['row_index', 'd_age']]

In [None]:
# profiles_for_processing['index']

# Running predictions


## Gemini 2.5 Flash: concurrent requests with retries, no batching

In [None]:
def create_prompt(row: pd.Series) -> str:
    """Build the age-prediction prompt for a single profile.

    Args:
        row: One profile. Its ``to_dict()`` representation is embedded
            verbatim under the "User Information" heading.

    Returns:
        The prompt text. It asks for a JSON-only reply with the keys
        ``predicted_age``, ``confidence`` and ``explanation`` and lists the
        survey questions the answers refer to.
    """
    user_info = row.to_dict()
    return f"""You are an expert demographic analyst tasked with predicting a person's age based on their survey responses and characteristics.

    **Instructions:**
    1. Analyze the provided information carefully
    2. Consider patterns in responses, interests, values, and life circumstances
    3. Provide a specific age estimate (as a number)
    4. Give a brief explanation for your prediction
    5. Be realistic - most online dating users are between 18-65 years old

    **User Information:**
    {user_info}

    **Required Response Format (JSON):**
    {{
        "predicted_age": [number between 18-100],
        "confidence": [number between 0.0-1.0],
        "explanation": "[brief explanation of reasoning]"
    }}

    Questions you will receive answers to:
    question,text,option_1,option_2,option_3,option_4,N,Type,Order,Keywords
    q35,"Regardless of future plans, what's more interesting to you right now?",Sex,Love,,,50384,N,,sex/intimacy
    q41,How important is religion/God in your life?,Extremely important,Somewhat important,Not very important,Not at all important,54140,O,,religion/superstition
    q9688,Could you date someone who does drugs?,No,"Yes, but only soft stuff like marijuana",Yes,,55697,O,,preference
    q16053,How willing are you to meet someone from OkCupid in person?,Totally willing!,"Hesitant, but I'd certainly consider it.",I'm not interested in meeting in person.,,58043,O,,preference
    q20930,Rate your self-confidence:,"Very, very high",Higher than average,Average,Below average,53737,O,,descriptive
    q35660,How often are you open with your feelings?,Always,Usually,Rarely,Never,49489,O,,descriptive
    q41953,About how long do you want your next relationship to last?,One night,A few months to a year,Several years,The rest of my life,48614,O,,preference
    q44639,Do you like scary movies?,Yes,No,,,54964,O,,preference
    q179268,Are you either vegetarian or vegan?,Yes,No,,,54202,O,,politics; descriptive
    q358077,Could you date someone who was really messy?,Yes,No,,,55695,O,,preference
    d_religion_type,Religion type,,,,,66365,,,
    d_drugs,Drugs,,,,,55697,,,
    lf_want,Type of match,,,,,66365,,,

    **Important:**
    - Provide ONLY the JSON response, no additional text
    - The predicted_age must be a specific number, not a range
    - Confidence should reflect how certain you are (1.0 = very certain, 0.5 = moderate, 0.1 = uncertain)"""

In [None]:
# Function to run prediction for a single row
def predict_single(
    row: pd.Series,
    model: str = "gemini-2.5-flash",
    retries: int = 6,
    retry_delay: float = 10,
) -> dict:
    """Ask the model for an age prediction for one profile, retrying on failure.

    Relies on the notebook-level names ``client``, ``json``, ``sleep`` and
    ``create_prompt`` being defined before the first call.

    Args:
        row: Profile to predict for; must contain ``row_index``.
        model: Model name passed to ``client.models.generate_content``.
        retries: Total number of attempts before giving up.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        A dict with ``row_index``, ``predicted_age``, ``confidence`` and
        ``explanation``. When every attempt fails (or ``retries`` is not
        positive) the prediction fields are ``None`` and ``explanation``
        starts with the "⚠️" marker that later cells filter on.
    """
    row_index = row["row_index"]
    last_error = None

    for attempt in range(1, retries + 1):
        try:
            response = client.models.generate_content(
                model=model,
                contents=create_prompt(row),
                config={"response_mime_type": "application/json"},
            )
            data = json.loads(response.text)
            # A JSON array or scalar parses fine but has no .get(); treat it
            # as a failed attempt with an explicit reason.
            if not isinstance(data, dict):
                raise ValueError(f"expected a JSON object, got {type(data).__name__}")
            return {
                "row_index": row_index,
                "predicted_age": data.get("predicted_age"),
                "confidence": data.get("confidence"),
                "explanation": data.get("explanation"),
            }
        except Exception as e:
            last_error = e
            # No point waiting after the final attempt.
            if attempt < retries:
                sleep(retry_delay)

    # Single failure path: either every attempt raised, or no attempt was
    # allowed at all (retries <= 0).
    if last_error is None:
        reason = "after running out of retries"
    else:
        reason = f"after retries: {last_error}"
    print(f"⚠️ Failed for row {row_index} {reason}")
    return {
        "row_index": row_index,
        "predicted_age": None,
        "confidence": None,
        "explanation": f"⚠️ Failed {reason}",
    }

In [None]:
import pandas as pd
import json
from google import genai
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from time import sleep

# API client used by predict_single.
client = genai.Client()

# First pass: one request per profile, at most five in flight at a time.
# Results are collected in completion order, not in the order of `rows`.
predictions = []
with ThreadPoolExecutor(max_workers=5) as executor:  # tune max_workers depending on limits
    futures = {}
    for _, row in rows.iterrows():
        futures[executor.submit(predict_single, row)] = row["row_index"]

    progress = tqdm(as_completed(futures), total=len(futures), desc="Predicting ages")
    for finished in progress:
        predictions.append(finished.result())

predicted_ages = pd.DataFrame(predictions)

## Retrying the failed rows with Gemini 2.5 Flash

In [None]:
# Rows whose first-pass prediction failed carry the "⚠️" marker that
# predict_single writes into `explanation`. Read them from `predicted_ages`
# (the first-pass output) so this cell works on a fresh top-to-bottom run;
# na=False keeps a missing explanation from breaking the boolean mask.
failed = predicted_ages[predicted_ages.explanation.str.contains('⚠️', na=False)]
failed_rows = rows[rows.row_index.isin(failed.row_index)]

# NOTE(review): the retry uses gemini-2.5-flash, the same model as the first
# pass — pass a different model name here if a stronger one was intended.
retrying_predictions = []
with ThreadPoolExecutor(max_workers=5) as executor:  # tune max_workers depending on limits
    futures = {executor.submit(predict_single, row, "gemini-2.5-flash"): row["row_index"] for _, row in failed_rows.iterrows()}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Predicting ages"):
        retrying_predictions.append(future.result())

retrying_predicted_ages = pd.DataFrame(retrying_predictions)
# predicted_ages.to_csv('../data/data_for_report/llm_predicted_ages_v3.csv', index=False)

## Merging the results and saving

In [None]:
# Overlay the retried predictions onto the first-pass results, matching rows
# on row_index. Neither input frame is mutated in place, so the cell can be
# re-run safely.
# When nothing was retried, retrying_predicted_ages is an empty frame without
# a row_index column; skip the overlay instead of failing in set_index.
if not retrying_predicted_ages.empty:
    first_pass = predicted_ages.set_index('row_index')
    # DataFrame.update only copies non-NA values, so a retry that failed again
    # does not wipe out an earlier value.
    first_pass.update(retrying_predicted_ages.set_index('row_index'))
    # Bring row_index back as a regular column for the later merge.
    predicted_ages = first_pass.reset_index()

predicted_ages

In [None]:
# Attach the true age to every prediction, then store the absolute error in
# years as `difference` and save the combined table.
with_true_age = predicted_ages.merge(ages, on='row_index')
abs_error = (with_true_age['predicted_age'] - with_true_age['d_age']).abs()
merged = with_true_age.assign(difference=abs_error)
merged.to_csv('../data/data_for_report/llm_predicted_ages_v3_with_diff.csv', index=False)

In [None]:
# Summary statistics of the absolute prediction error.
merged['difference'].describe()

# Graphing

## Graph 1

In [None]:
import plotly.express as px

# Endpoints of the y = x reference line: the observed range of real ages.
age_lo = merged['d_age'].min()
age_hi = merged['d_age'].max()

axis_labels = {
    "d_age": "Real Age",
    "predicted_age": "Predicted Age",
    "confidence": "Confidence (0-1)",
}

# Real age on x, predicted age on y, points colored by model confidence.
fig = px.scatter(
    merged,
    x="d_age",
    y="predicted_age",
    color="confidence",
    color_continuous_scale="Viridis",
    hover_data=["row_index", "difference", "explanation"],
    labels=axis_labels,
    title="Predicted Age vs Real Age with Confidence Coloring",
)

# Points on the dashed diagonal are perfect predictions.
fig.add_shape(
    type="line",
    x0=age_lo,
    y0=age_lo,
    x1=age_hi,
    y1=age_hi,
    line={"color": "gray", "dash": "dash"},
    name="Perfect Prediction Line",
)

# Figure size and axis / colorbar titles.
fig.update_layout(
    width=800,
    height=600,
    legend_title_text='Confidence',
    coloraxis_colorbar={"title": "Confidence"},
    xaxis={"title": "Real Age"},
    yaxis={"title": "Predicted Age"},
)

fig.show()

## Graph 2

In [None]:
# `difference` is the absolute error |predicted - real| computed when `merged`
# is built, so the labels say "absolute" rather than implying a signed error.
fig = px.scatter(
    merged,
    x="d_age",
    y="difference",
    color="confidence",
    color_continuous_scale="RdYlGn",
    hover_data=["predicted_age", "row_index", "confidence"],
    labels={
        "d_age": "Real Age",
        "difference": "Absolute Prediction Error |Predicted - Real|",
        "confidence": "Confidence"
    },
    title="Absolute Prediction Error vs Real Age (Colored by Confidence)"
)

# y = 0 is a perfect prediction and the lower bound of an absolute error.
fig.add_hline(y=0, line_dash="dash", line_color="gray")

fig.update_layout(
    height=600,
    width=900,
    xaxis_title="Real Age",
    yaxis_title="Absolute Prediction Error (years)",
)

fig.show()


# Playground

In [None]:
import pandas as pd
import plotly.express as px

# Synthetic stand-in data with the same columns as the real results table.
# It only illustrates the plot layouts; swap in the real frame when available.
sample_positions = range(100)
predicted_sample = [25 + (i % 10) for i in sample_positions]
actual_sample = [24 + (i % 12) for i in sample_positions]

data = {
    'row_index': range(1, 101),
    'predicted_age': predicted_sample,
    'confidence': [0.8 - (i % 15) * 0.02 for i in sample_positions],
    'explanation': ['Feature importance - X'] * 100,
    'd_age': actual_sample,
    'difference': [p - a for p, a in zip(predicted_sample, actual_sample)],
}

df = pd.DataFrame(data)

# Baseline for comparison: always predict the mean actual age.
mean_age = df['d_age'].mean()
df['mean_age_pred'] = mean_age

# Plot 1: predicted vs actual age, colored by confidence.
fig1 = px.scatter(
    df,
    x='d_age',
    y='predicted_age',
    color='confidence',
    title='Predicted Age vs Actual Age Colored by Confidence',
    labels={'d_age': 'Actual Age', 'predicted_age': 'Predicted Age'},
)

# Plot 2: distribution of the signed error stored in `difference`.
fig2 = px.histogram(
    df,
    x='difference',
    nbins=30,
    title='Distribution of Prediction Errors (Difference)',
    labels={'difference': 'Prediction Error (Predicted Age - Actual Age)'},
)

# Plot 3: absolute error of the model against that of the mean-age baseline.
df['abs_error_model'] = df['difference'].abs()
df['abs_error_mean'] = (df['mean_age_pred'] - df['d_age']).abs()
fig3 = px.scatter(
    df,
    x='abs_error_mean',
    y='abs_error_model',
    title='Comparison: Model Prediction Error vs Mean Age Prediction Error',
    labels={'abs_error_mean': 'Mean Age Prediction Error', 'abs_error_model': 'Model Prediction Error'},
    trendline='ols',
)

for figure in (fig1, fig2, fig3):
    figure.show()

In [None]:
# Column names of `rows` without 'Unnamed: 0'. That column is already removed
# when the profiles are loaded, so tolerate its absence instead of raising.
rows.columns.drop('Unnamed: 0', errors='ignore')

In [None]:
# Walk `rows` in consecutive slices of BATCH_SIZE rows and print each slice.
# NOTE(review): BATCH_SIZE is not defined in any visible cell — confirm it is
# set before this cell runs.
# `start` and `batch` keep their last values after the loop and are read by
# later cells.
for start in tqdm(range(0, len(rows), BATCH_SIZE), desc="Predicting ages in batches"):
    batch = rows.iloc[start:start+BATCH_SIZE]
    print(batch)

In [None]:
batch

In [None]:
# Send the current `batch` to gemini-2.5-flash-lite in a single request and
# ask for a JSON reply.
# NOTE(review): create_batch_prompt is not defined in any visible cell —
# confirm where it comes from before running.
response = client.models.generate_content(
        model="gemini-2.5-flash-lite",
        contents=create_batch_prompt(batch),
        config={"response_mime_type": "application/json"},
    )


In [None]:
response

In [None]:
# True ages keyed by row_index, so each result is matched to its own profile
# rather than to whatever row sits at that position in `ages`.
age_by_row_index = ages.set_index('row_index')['d_age']

try:
    batch_results = json.loads(response.text)
except Exception as e:
    print(f"⚠️ Error parsing batch {start}: {e}")
else:
    # Only walk the results when parsing succeeded; otherwise batch_results
    # would be undefined or left over from an earlier run.
    for res in batch_results:
        idx = res.get("row_index")
        print({
            "row_index": idx,
            "predicted_age": res.get("predicted_age"),
            # None when the model returned a row_index that is not in `ages`.
            "real_age": age_by_row_index.get(idx),
            "confidence": res.get("confidence"),
            "explanation": res.get("explanation"),
        })


In [None]:
# Parse the JSON text of the last response; there is no error handling here,
# so malformed JSON raises.
batch_results = json.loads(response.text)

In [None]:
batch_results[0]

In [None]:
# True ages keyed by row_index, so each result is matched to its own profile
# rather than to whatever row sits at that position in `ages`.
age_by_row_index = ages.set_index('row_index')['d_age']

for res in batch_results:
    idx = res.get("row_index")
    print({
        "row_index": idx,
        "predicted_age": res.get("predicted_age"),
        # None when the model returned a row_index that is not in `ages`.
        "real_age": age_by_row_index.get(idx),
        "confidence": res.get("confidence"),
        "explanation": res.get("explanation"),
    })

In [None]:
rows