# Gathering Data

In [3]:
from time import sleep

import pandas as pd
from dotenv import load_dotenv

load_dotenv()

profiles_for_processing = pd.read_csv('../data/data_for_report/profiles_for_processing.csv')
relevant_questions = pd.read_csv('../data/data_for_report/relevant_questions.csv')

In [4]:
profiles_for_processing

Unnamed: 0.1,Unnamed: 0,index,q35,q358077,q179268,q41,q44639,q41953,q35660,gender,gender2,d_religion_type,d_drugs,q20930,q16053,lf_want,q9688,d_age,gender.1
0,0,20893,Love,Yes,No,Not at all important,No,Several years,Rarely,Man,Man,Atheism,Never,Average,Totally willing!,Everyone,"Yes, but only soft stuff like marijuana",27.0,Man
1,1,53819,Love,Yes,No,Not at all important,No,A few months to a year,Always,Man,Man,Atheism,Never,Average,"Hesitant, but I'd certainly consider it.",Women,"Yes, but only soft stuff like marijuana",28.0,Man
2,2,4026,Love,No,No,Not at all important,Yes,The rest of my life,Usually,Woman,Woman,Atheism,Never,Below average,"Hesitant, but I'd certainly consider it.",Men,"Yes, but only soft stuff like marijuana",23.0,Woman
3,3,15198,Love,Yes,No,Not at all important,Yes,Several years,Rarely,Woman,Woman,Atheism,Never,Average,"Hesitant, but I'd certainly consider it.",Everyone,No,19.0,Woman
4,4,43113,Love,Yes,No,Extremely important,No,The rest of my life,Usually,Woman,Woman,Judaism,Never,Average,Totally willing!,Men,No,33.0,Woman
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,16326,Love,No,No,Somewhat important,Yes,Several years,Rarely,Woman,Woman,Christianity,Never,Average,"Hesitant, but I'd certainly consider it.",Men who like women,No,33.0,Woman
196,196,29876,Love,No,No,Somewhat important,No,The rest of my life,Always,Man,Man,Christianity,Never,Average,Totally willing!,Women,"Yes, but only soft stuff like marijuana",38.0,Man
197,197,32341,Love,Yes,No,Not at all important,Yes,The rest of my life,Usually,Woman,Woman,Atheism,Never,Higher than average,Totally willing!,Everyone,"Yes, but only soft stuff like marijuana",31.0,Woman
198,198,67396,Love,No,No,Somewhat important,No,The rest of my life,Usually,Woman,Woman,Christianity,Never,Average,Totally willing!,Men,No,44.0,Woman


In [6]:
profiles_for_processing['id'] = profiles_for_processing.index
rows, ages = profiles_for_processing.drop(['d_age'], axis=1, inplace=False), profiles_for_processing[['id', 'd_age']]

In [11]:
profiles_for_processing['index']

0      20893
1      53819
2       4026
3      15198
4      43113
       ...  
195    16326
196    29876
197    32341
198    67396
199    29981
Name: index, Length: 200, dtype: int64

# Running predictions


In [4]:
from google import genai
from pydantic import BaseModel

class AgeGuess(BaseModel):
    age: int
    reasoning: str
    confidence: float

def create_prompt(row: pd.Series) -> str:
    prompt = f"""You are an expert demographic analyst tasked with predicting a person's age based on their survey responses and characteristics.

    **Instructions:**
    1. Analyze the provided information carefully
    2. Consider patterns in responses, interests, values, and life circumstances
    3. Provide a specific age estimate (as a number)
    4. Give a brief explanation for your prediction
    5. Be realistic - most online dating users are between 18-65 years old

    **User Information:**
    {row.to_dict()}

    **Required Response Format (JSON):**
    {{
        "predicted_age": [number between 18-100],
        "confidence": [number between 0.0-1.0],
        "explanation": "[brief explanation of reasoning]"
    }}

    Questions you will receive answers to:
    question,text,option_1,option_2,option_3,option_4,N,Type,Order,Keywords
    q35,"Regardless of future plans, what's more interesting to you right now?",Sex,Love,,,50384,N,,sex/intimacy
    q41,How important is religion/God in your life?,Extremely important,Somewhat important,Not very important,Not at all important,54140,O,,religion/superstition
    q9688,Could you date someone who does drugs?,No,"Yes, but only soft stuff like marijuana",Yes,,55697,O,,preference
    q16053,How willing are you to meet someone from OkCupid in person?,Totally willing!,"Hesitant, but I'd certainly consider it.",I'm not interested in meeting in person.,,58043,O,,preference
    q20930,Rate your self-confidence:,"Very, very high",Higher than average,Average,Below average,53737,O,,descriptive
    q35660,How often are you open with your feelings?,Always,Usually,Rarely,Never,49489,O,,descriptive
    q41953,About how long do you want your next relationship to last?,One night,A few months to a year,Several years,The rest of my life,48614,O,,preference
    q44639,Do you like scary movies?,Yes,No,,,54964,O,,preference
    q179268,Are you either vegetarian or vegan?,Yes,No,,,54202,O,,politics; descriptive
    q358077,Could you date someone who was really messy?,Yes,No,,,55695,O,,preference
    d_religion_type,Religion type,,,,,66365,,,
    d_drugs,Drugs,,,,,55697,,,
    lf_want,Type of match,,,,,66365,,,

    **Important:**
    - Provide ONLY the JSON response, no additional text
    - The predicted_age must be a specific number, not a range
    - Confidence should reflect how certain you are (1.0 = very certain, 0.5 = moderate, 0.1 = uncertain)"""

    return prompt

In [None]:
import pandas as pd
import json
from google import genai
from tqdm.notebook import tqdm  # nice progress bar in Jupyter

# assuming you already have:
# rows -> pd.DataFrame with answers
# ages -> pd.Series with real ages

client = genai.Client()

predictions = []

for idx, row in tqdm(rows.iterrows(), total=len(rows), desc="Predicting ages"):
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=create_prompt(row),
        config={
            "response_mime_type": "application/json"
        },
    )

    # Parse JSON safely
    try:
        result = json.loads(response.text)  # Gemini returns JSON as string
        predicted_age = result.get("predicted_age")
        confidence = result.get("confidence")
        explanation = result.get("explanation")
    except Exception as e:
        print(f"⚠️ Error parsing row {idx}: {e}")
        predicted_age, confidence, explanation = None, None, None

    predictions.append({

        "predicted_age": predicted_age,
        "real_age": ages.loc[idx],
        "confidence": confidence,
        "explanation": explanation
    })

# Build final DataFrame
predicted_ages = pd.DataFrame(predictions)


Predicting ages:   0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
predicted_ages.to_csv('../data/data_for_report/llm_predicted_ages.csv', index=True)

In [10]:
import pandas as pd
import json
from google import genai
from tqdm.notebook import tqdm
from time import sleep

client = genai.Client()

BATCH_SIZE = 10

def create_batch_prompt(batch_rows: pd.DataFrame) -> str:
    prompt = f"""You are an expert demographic analyst tasked with predicting a person's age.

**Instructions:**
- For each profile, give one JSON object with predicted_age, confidence, explanation.
- Return a JSON list, one entry per profile, in the same order.

**Profiles:**
{batch_rows.to_dict(orient="records")}

**Required Response Format (JSON array):**
[
  {{
    "row_index": <row index>,
    "predicted_age": <number>,
    "confidence": <float>,
    "explanation": "<string>"
  }},
  ...
]"""
    return prompt


predictions = []
for start in tqdm(range(0, len(rows), BATCH_SIZE), desc="Predicting ages in batches"):
    batch = rows.iloc[start:start+BATCH_SIZE]

    retries = 4
    while retries > 0:
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash-lite",
                contents=create_batch_prompt(batch),
                config={"response_mime_type": "application/json"},
            )
            break
        except Exception as e:
            retries -= 1
            print(f"⚠️ Error {e}, retries left: {retries}")
            sleep(retries * 2)

    try:
        batch_results = json.loads(response.text)
    except Exception as e:
        print(f"⚠️ Error parsing batch {start}: {e}")
        continue

    for res in batch_results:
        idx = res.get("row_index")
        predictions.append({
            "row_index": idx,
            "predicted_age": res.get("predicted_age"),
            "confidence": res.get("confidence"),
            "explanation": res.get("explanation"),
        })

predicted_ages = pd.DataFrame(predictions)
# predicted_ages.to_csv('../data/data_for_report/llm_predicted_ages.csv', index=True)

Predicting ages in batches:   0%|          | 0/100 [00:00<?, ?it/s]

⚠️ Error Server disconnected without sending a response., retries left: 3
⚠️ Error parsing batch 130: Invalid \escape: line 42 column 403 (char 4607)
⚠️ Error Server disconnected without sending a response., retries left: 3


## Now with retries

In [None]:
import pandas as pd
import json
from google import genai
from tqdm.notebook import tqdm
from time import sleep

client = genai.Client()

BATCH_SIZE = 1

predictions = []
for start in tqdm(range(0, len(rows), BATCH_SIZE), desc="Predicting ages in batches"):
    batch = rows.iloc[start:start+BATCH_SIZE]

    retries = 4
    while retries > 0:
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash-lite",
                contents=create_batch_prompt(batch),
                config={"response_mime_type": "application/json"},
            )
            break
        except Exception as e:
            retries -= 1
            print(f"⚠️ Error {e}, retries left: {retries}")
            sleep(retries * 2)

    try:
        batch_results = json.loads(response.text)
    except Exception as e:
        print(f"⚠️ Error parsing batch {start}: {e}")
        continue

    for res in batch_results:
        idx = res.get("row_index")
        predictions.append({
            "row_index": idx,
            "predicted_age": res.get("predicted_age"),
            "confidence": res.get("confidence"),
            "explanation": res.get("explanation"),
        })

predicted_ages = pd.DataFrame(predictions)
# predicted_ages.to_csv('../data/data_for_report/llm_predicted_ages.csv', index=True)

In [32]:
rows.columns

Index(['Unnamed: 0', 'index', 'q35', 'q358077', 'q179268', 'q41', 'q44639',
       'q41953', 'q35660', 'gender', 'gender2', 'd_religion_type', 'd_drugs',
       'q20930', 'q16053', 'lf_want', 'q9688', 'gender.1', 'id'],
      dtype='object')

## Retries + No Batching + Concurrent

In [15]:
import pandas as pd
import json
from google import genai
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from time import sleep

# Initialize client
client = genai.Client()

def create_prompt(row: pd.Series) -> str:
    prompt = f"""You are an expert demographic analyst tasked with predicting a person's age based on their survey responses and characteristics.

    **Instructions:**
    1. Analyze the provided information carefully
    2. Consider patterns in responses, interests, values, and life circumstances
    3. Provide a specific age estimate (as a number)
    4. Give a brief explanation for your prediction
    5. Be realistic - most online dating users are between 18-65 years old

    **User Information:**
    {row.to_dict()}

    **Required Response Format (JSON):**
    {{
        "predicted_age": [number between 18-100],
        "confidence": [number between 0.0-1.0],
        "explanation": "[brief explanation of reasoning]"
    }}

    Questions you will receive answers to:
    question,text,option_1,option_2,option_3,option_4,N,Type,Order,Keywords
    q35,"Regardless of future plans, what's more interesting to you right now?",Sex,Love,,,50384,N,,sex/intimacy
    q41,How important is religion/God in your life?,Extremely important,Somewhat important,Not very important,Not at all important,54140,O,,religion/superstition
    q9688,Could you date someone who does drugs?,No,"Yes, but only soft stuff like marijuana",Yes,,55697,O,,preference
    q16053,How willing are you to meet someone from OkCupid in person?,Totally willing!,"Hesitant, but I'd certainly consider it.",I'm not interested in meeting in person.,,58043,O,,preference
    q20930,Rate your self-confidence:,"Very, very high",Higher than average,Average,Below average,53737,O,,descriptive
    q35660,How often are you open with your feelings?,Always,Usually,Rarely,Never,49489,O,,descriptive
    q41953,About how long do you want your next relationship to last?,One night,A few months to a year,Several years,The rest of my life,48614,O,,preference
    q44639,Do you like scary movies?,Yes,No,,,54964,O,,preference
    q179268,Are you either vegetarian or vegan?,Yes,No,,,54202,O,,politics; descriptive
    q358077,Could you date someone who was really messy?,Yes,No,,,55695,O,,preference
    d_religion_type,Religion type,,,,,66365,,,
    d_drugs,Drugs,,,,,55697,,,
    lf_want,Type of match,,,,,66365,,,

    **Important:**
    - Provide ONLY the JSON response, no additional text
    - The predicted_age must be a specific number, not a range
    - Confidence should reflect how certain you are (1.0 = very certain, 0.5 = moderate, 0.1 = uncertain)"""

    return prompt


# Function to run prediction for a single row
def predict_single(row: pd.Series):
    retries = 5
    while retries > 0:
        try:
            response = client.models.generate_content(
                model="gemini-2.5-flash",
                contents=create_prompt(row),
                config={"response_mime_type": "application/json"},
            )
            data = json.loads(response.text)
            return {
                "id": row["id"],
                "predicted_age": data.get("predicted_age"),
                "confidence": data.get("confidence"),
                "explanation": data.get("explanation"),
            }
        except Exception as e:
            retries -= 1
            if retries == 0:
                return {
                    "id": row["id"],
                    "predicted_age": None,
                    "confidence": None,
                    "explanation": f"⚠️ Failed after retries: {e}"
                }
            sleep((6-retries) * 2)

# Load your dataframe
# rows = pd.read_csv("...")

predictions = []
with ThreadPoolExecutor(max_workers=5) as executor:  # tune max_workers depending on limits
    futures = {executor.submit(predict_single, row): row["id"] for _, row in rows.iterrows()}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Predicting ages"):
        predictions.append(future.result())

predicted_ages = pd.DataFrame(predictions)
# predicted_ages.to_csv('../data/data_for_report/llm_predicted_ages_v2.csv', index=False)

Predicting ages: 100%|██████████| 200/200 [13:22<00:00,  4.01s/it]


In [20]:
merged = predicted_ages.merge(ages, on='id')
merged['difference'] = (merged['predicted_age'] - merged['d_age']).abs()
merged.to_csv('../data/data_for_report/llm_predicted_ages_v2_with_diff.csv', index=False)

# Playground

In [8]:
for start in tqdm(range(0, len(rows), BATCH_SIZE), desc="Predicting ages in batches"):
    batch = rows.iloc[start:start+BATCH_SIZE]
    print(batch)

Predicting ages in batches:   0%|          | 0/200 [00:00<?, ?it/s]

   Unnamed: 0  index   q35 q358077 q179268                   q41 q44639  \
0           0   7221  Love      No      No    Somewhat important     No   
1           1  58741  Love     Yes      No  Not at all important     No   
2           2   7492   Sex      No      No    Somewhat important    Yes   
3           3  61673  Love      No      No  Not at all important    Yes   
4           4  54206  Love      No      No  Not at all important     No   

                   q41953   q35660 gender gender2 d_religion_type d_drugs  \
0  A few months to a year   Rarely    Man     Man    Christianity   Never   
1           Several years  Usually  Woman   Woman     Agnosticism   Never   
2               One night  Usually    Man     Man    Christianity   Never   
3     The rest of my life  Usually    Man     Man               -   Never   
4           Several years  Usually  Woman   Woman               -   Never   

                q20930            q16053   lf_want  \
0              Average  Totally 

In [9]:
batch

Unnamed: 0.1,Unnamed: 0,index,q35,q358077,q179268,q41,q44639,q41953,q35660,gender,gender2,d_religion_type,d_drugs,q20930,q16053,lf_want,q9688,gender.1
995,995,43825,Sex,Yes,Yes,Not at all important,Yes,The rest of my life,Always,Woman,Woman,-,Never,"Very, very high",Totally willing!,Men,Yes,Woman
996,996,27038,Love,No,No,Somewhat important,No,Several years,Usually,Man,Man,Christianity,Never,Higher than average,Totally willing!,Women,No,Man
997,997,46315,Sex,Yes,No,Not at all important,Yes,A few months to a year,Always,Woman,Woman,Other,Sometimes,Higher than average,Totally willing!,Everyone,"Yes, but only soft stuff like marijuana",Woman
998,998,22872,Love,No,No,Not at all important,No,The rest of my life,Always,Man,Man,Atheism,Never,Average,Totally willing!,Women,No,Man
999,999,24319,Love,Yes,No,Not very important,Yes,Several years,Always,Woman,Woman,Judaism,Never,Higher than average,Totally willing!,Everyone,No,Woman


In [10]:
response = client.models.generate_content(
        model="gemini-2.5-flash-lite",
        contents=create_batch_prompt(batch),
        config={"response_mime_type": "application/json"},
    )


In [11]:
response

GenerateContentResponse(
  automatic_function_calling_history=[],
  candidates=[
    Candidate(
      content=Content(
        parts=[
          Part(
            text="""[
  {
    "row_index": 43825,
    "predicted_age": 25,
    "confidence": 0.7,
    "explanation": "The respondent's answers suggest a younger demographic. For example, 'The rest of my life' for relationship duration and 'Always' for frequency of sex are common in younger individuals exploring relationships. While 'Not at all important' for importance of children can be present at any age, combined with other factors it leans younger."
  },
  {
    "row_index": 27038,
    "predicted_age": 35,
    "confidence": 0.75,
    "explanation": "This profile indicates a mid-range age. 'Several years' for relationship duration, 'Usually' for frequency of sex, and 'Somewhat important' for children are indicative of someone who has had some experience in relationships and is considering more serious commitments or has established a 

In [14]:
try:
    batch_results = json.loads(response.text)
except Exception as e:
    print(f"⚠️ Error parsing batch {start}: {e}")

for res in batch_results:
    idx = res.get("row_index")
    print({
        "row_index": idx,
        "predicted_age": res.get("predicted_age"),
        "real_age": ages.loc[idx],
        "confidence": res.get("confidence"),
        "explanation": res.get("explanation"),
    })


KeyError: 43825

In [15]:
batch_results = json.loads(response.text)

In [17]:
batch_results[0]

{'row_index': 43825,
 'predicted_age': 25,
 'confidence': 0.7,
 'explanation': "The respondent's answers suggest a younger demographic. For example, 'The rest of my life' for relationship duration and 'Always' for frequency of sex are common in younger individuals exploring relationships. While 'Not at all important' for importance of children can be present at any age, combined with other factors it leans younger."}

In [18]:
for res in batch_results:
    idx = res.get("row_index")
    print({
        "row_index": idx,
        "predicted_age": res.get("predicted_age"),
        "real_age": ages.loc[idx],
        "confidence": res.get("confidence"),
        "explanation": res.get("explanation"),
    })

KeyError: 43825

In [25]:
rows

Unnamed: 0.1,Unnamed: 0,index,q35,q358077,q179268,q41,q44639,q41953,q35660,gender,gender2,d_religion_type,d_drugs,q20930,q16053,lf_want,q9688,gender.1
0,0,7221,Love,No,No,Somewhat important,No,A few months to a year,Rarely,Man,Man,Christianity,Never,Average,Totally willing!,Women,"Yes, but only soft stuff like marijuana",Man
1,1,58741,Love,Yes,No,Not at all important,No,Several years,Usually,Woman,Woman,Agnosticism,Never,Higher than average,Totally willing!,Everyone,No,Woman
2,2,7492,Sex,No,No,Somewhat important,Yes,One night,Usually,Man,Man,Christianity,Never,"Very, very high",Totally willing!,Women,No,Man
3,3,61673,Love,No,No,Not at all important,Yes,The rest of my life,Usually,Man,Man,-,Never,Higher than average,Totally willing!,Women,No,Man
4,4,54206,Love,No,No,Not at all important,No,Several years,Usually,Woman,Woman,-,Never,Higher than average,Totally willing!,Men,No,Woman
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,43825,Sex,Yes,Yes,Not at all important,Yes,The rest of my life,Always,Woman,Woman,-,Never,"Very, very high",Totally willing!,Men,Yes,Woman
996,996,27038,Love,No,No,Somewhat important,No,Several years,Usually,Man,Man,Christianity,Never,Higher than average,Totally willing!,Women,No,Man
997,997,46315,Sex,Yes,No,Not at all important,Yes,A few months to a year,Always,Woman,Woman,Other,Sometimes,Higher than average,Totally willing!,Everyone,"Yes, but only soft stuff like marijuana",Woman
998,998,22872,Love,No,No,Not at all important,No,The rest of my life,Always,Man,Man,Atheism,Never,Average,Totally willing!,Women,No,Man
