In [4]:
import random
import json

# Pool of first names. The generation loop picks names[i % len(names)] and
# appends the loop index, so the resulting user names are unique.
names = ["Alice", "Bob", "Charlie", "Diana", "Eva", "Frank", "Grace", "Henry", "Ivy", "Jack",
         "Kira", "Liam", "Mona", "Nate", "Olivia", "Paul", "Quinn", "Rosa", "Sam", "Tina",
         "Uma", "Viktor", "Wendy", "Xander", "Yara", "Zane", "Ava", "Ben", "Cleo", "Dan",
         "Elle", "Finn", "Gwen", "Hugo", "Ines", "Jules", "Kate", "Leo", "Mira", "Noah",
         "Omar", "Pia", "Rex", "Sophie", "Tom", "Ursula", "Vera", "Will", "Zoe", "Gianni",
         "Lucia", "Miguel", "Anya", "Chloe", "Deniz", "Erik", "Fatima", "Gustav", "Hana", "Imran",
         "Jana", "Karim", "Lars", "Mei", "Nia", "Oli", "Priya", "Qasim", "Riya", "Sven", "Tariq",
         "Ulrik", "Viola", "Wei", "Xenia", "Yuki", "Zara", "Arjun", "Bianca", "Cedric", "Daisy",
         "Emil", "Farah", "Gino", "Helena", "Ida", "Jonas", "Khalid", "Lea", "Matteo", "Nadia",
         "Oksana", "Pascal", "Ruben", "Selma", "Tarek", "Valerie", "Waleed", "Yasir", "Zahra"]

# Cities; each generated user gets one of these via random.choice.
locations = ["New York", "San Francisco", "Chicago", "Berlin", "London", "Paris", "Tokyo", "Sydney", "Dubai", "Toronto",
             "Barcelona", "Rome", "Munich", "Amsterdam", "Stockholm", "Beijing", "Moscow", "Delhi", "Seoul", "Cape Town"]

# Interest vocabulary; each generated user gets a random sample of 4-7 entries.
interests = [
    "hiking", "AI", "photography", "traveling", "robotics", "cooking", "painting", "reading", "dancing", "writing",
    "music", "movies", "fitness", "yoga", "swimming", "cycling", "gardening", "gaming", "volunteering", "meditation",
    "chess", "podcasting", "history", "finance", "languages"
]

# Two-letter language codes; each generated user gets a random sample of 1-3 entries.
languages_all = ["en", "es", "fr", "de", "zh", "ja", "ru", "it", "hi", "ar", "pt", "tr", "ko"]

def random_bio(name, interests, location, main_trait):
    """Build a one-sentence bio for a user from one of five fixed templates.

    Args:
        name: Display name inserted into the sentence.
        interests: Non-empty sequence of interest strings; individual templates
            use either one randomly chosen entry or the first two entries.
        location: City name inserted into the sentence.
        main_trait: Trait label used verbatim by the "... soul from ..." template.

    Returns:
        One of the five candidate sentences, chosen uniformly at random.

    Raises:
        IndexError: If ``interests`` is empty (``random.choice`` on an empty sequence).
    """
    # Four of the five templates mention a single random interest. Draw them
    # up front, in template order, before the final template pick.
    loved, avid, peaceful, keen = (random.choice(interests) for _ in range(4))
    first_two = ', '.join(interests[:2])

    templates = (
        f"{name} is a {main_trait} soul from {location} who loves {loved}.",
        f"Avid about {avid}, {name} enjoys exploring {location}.",
        f"Passionate about {first_two}, {name} is always seeking new experiences in {location}.",
        f"{name} finds peace in {peaceful} and is curious about people in {location}.",
        f"With a keen interest in {keen}, {name} thrives on creativity and learning.",
    )
    return random.choice(templates)

# Seed the RNG so that Restart Kernel -> Run All regenerates exactly the same
# profiles (and therefore the same matches downstream) on every run.
SEED = 42
random.seed(SEED)

# Personality dimensions: Big Five + a few more. Order matters only in that it
# fixes the order of the random draws and of the keys in the saved JSON.
TRAITS = (
    "extraversion", "openness", "agreeableness", "conscientiousness", "neuroticism",
    "creativity", "curiosity", "risk_taking", "empathy", "self_discipline",
)

users = []
for i in range(100):
    user_name = names[i % len(names)] + str(i)  # unique names
    age = random.randint(20, 40)
    location = random.choice(locations)
    user_interests = random.sample(interests, k=random.randint(4, 7))
    # psychological perspective: every trait score is drawn uniformly from
    # [0.2, 0.95] and rounded to two decimals
    personality = {trait: round(random.uniform(0.2, 0.95), 2) for trait in TRAITS}
    languages = random.sample(languages_all, k=random.randint(1, 3))
    # The highest-scoring trait (first one on ties) is passed to the bio generator
    main_trait = max(personality, key=personality.get)
    bio = random_bio(user_name, user_interests, location, main_trait)
    user = {
        "name": user_name,
        "age": age,
        "location": location,
        "interests": user_interests,
        "personality": personality,
        "languages": languages,
        "bio": bio
    }
    users.append(user)

# Save to file for LLM use
with open("psychological_user_profiles.json", "w", encoding="utf-8") as f:
    json.dump(users, f, indent=4)

# Print a few sample users
for user in users[:3]:
    print(json.dumps(user, indent=2))


{
  "name": "Alice0",
  "age": 36,
  "location": "Cape Town",
  "interests": [
    "hiking",
    "volunteering",
    "history",
    "swimming",
    "dancing"
  ],
  "personality": {
    "extraversion": 0.56,
    "openness": 0.72,
    "agreeableness": 0.42,
    "conscientiousness": 0.47,
    "neuroticism": 0.42,
    "creativity": 0.2,
    "curiosity": 0.48,
    "risk_taking": 0.93,
    "empathy": 0.42,
    "self_discipline": 0.77
  },
  "languages": [
    "en",
    "de",
    "it"
  ],
  "bio": "With a keen interest in volunteering, Alice0 thrives on creativity and learning."
}
{
  "name": "Bob1",
  "age": 24,
  "location": "Rome",
  "interests": [
    "swimming",
    "volunteering",
    "reading",
    "movies",
    "gardening",
    "podcasting"
  ],
  "personality": {
    "extraversion": 0.72,
    "openness": 0.35,
    "agreeableness": 0.3,
    "conscientiousness": 0.52,
    "neuroticism": 0.8,
    "creativity": 0.53,
    "curiosity": 0.39,
    "risk_taking": 0.48,
    "empathy": 0.67,


In [3]:
# Number of profiles currently held in `users`.
len(users)

100

## Step 2: Set Up the Matching System
We’ll build:

- Embeddings for each user's interests and bio
- A composite scorer
- A matching loop

In [3]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 1: Compute embeddings for interests + bio
# Encode all interest strings and all bios in two batched calls instead of two
# single-sentence calls per user. With a list input and convert_to_tensor=True,
# encode() returns one 2-D tensor whose i-th row belongs to the i-th input text.
interest_texts = [", ".join(user["interests"]) for user in users]
bio_texts = [user["bio"] for user in users]

if users:  # nothing to encode (and nothing to attach) when there are no users
    interest_embeddings = model.encode(interest_texts, convert_to_tensor=True)
    bio_embeddings = model.encode(bio_texts, convert_to_tensor=True)

    # Attach each user's embedding rows under the keys the scorer reads.
    for user, emb_interest, emb_bio in zip(users, interest_embeddings, bio_embeddings):
        user["emb_interest"] = emb_interest
        user["emb_bio"] = emb_bio

# Step 2: Composite scoring function
def score_users(u, v, weights):
    """Return a weighted compatibility score for two user profiles.

    The score is a weighted sum of four similarities:
      * cosine similarity of the interest embeddings,
      * cosine similarity of the bio embeddings,
      * personality similarity: 1 - ||u - v|| / sqrt(number of traits),
      * language overlap: Jaccard index of the two language sets.

    Args:
        u, v: User dicts with keys "name", "emb_interest", "emb_bio",
            "personality" (trait name -> number) and "languages" (iterable).
        weights: Dict with keys "interests", "bio", "personality", "language".

    Returns:
        -1 if both dicts carry the same "name" (self-match sentinel),
        otherwise the weighted score.

    Raises:
        KeyError: If ``v`` lacks a personality trait that ``u`` has.
    """
    # Skip self-match
    if u["name"] == v["name"]:
        return -1

    # Interests similarity
    sim_int = util.cos_sim(u["emb_interest"], v["emb_interest"]).item()

    # Bio similarity
    sim_bio = util.cos_sim(u["emb_bio"], v["emb_bio"]).item()

    # Personality similarity. Look both profiles up by trait name so the two
    # vectors are aligned even when the dicts were built in different key order.
    trait_names = list(u["personality"])
    traits_u = np.array([u["personality"][trait] for trait in trait_names])
    traits_v = np.array([v["personality"][trait] for trait in trait_names])
    sim_per = 1 - np.linalg.norm(traits_u - traits_v) / np.sqrt(len(traits_u))

    # Language overlap (Jaccard). Two empty language sets share nothing, so
    # score them 0 instead of dividing by zero.
    lang_u = set(u["languages"])
    lang_v = set(v["languages"])
    lang_union = lang_u | lang_v
    sim_lang = len(lang_u & lang_v) / len(lang_union) if lang_union else 0.0

    # Final weighted score
    score = (
        weights["interests"] * sim_int +
        weights["bio"] * sim_bio +
        weights["personality"] * sim_per +
        weights["language"] * sim_lang
    )
    return score

# Step 3: Match each user to the best other user
# Relative importance of each similarity component in the composite score.
weights = {"interests": 0.4, "bio": 0.2, "personality": 0.3, "language": 0.1}

for user in users:
    best_match = None
    # score_users returns -1 for a self-match, so starting at -1 with a strict
    # ">" comparison means a user is never matched with themselves.
    best_score = -1
    for other_user in users:
        score = score_users(user, other_user, weights)
        if score > best_score:
            best_score = score
            best_match = other_user

    # No candidate beat the sentinel (e.g. `users` holds a single profile):
    # report that instead of failing on best_match['name'].
    if best_match is None:
        print(f"{user['name']} has no match")
        continue

    print(f"{user['name']} is best matched with {best_match['name']} (Score: {best_score:.2f})")


2025-08-07 18:27:13.140013: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-07 18:27:13.179433: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-07 18:27:13.179461: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-07 18:27:13.180410: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-07 18:27:13.186864: I tensorflow/core/platform/cpu_feature_guar

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Alice is best matched with Diana (Score: 0.77)
Bob is best matched with Alice (Score: 0.64)
Charlie is best matched with Diana (Score: 0.56)
Diana is best matched with Alice (Score: 0.77)
