# Artist Similarity

This notebook loads artist data, generates embeddings with Sentence Transformers, stores them in PostgreSQL with pgvector, and lets you run similarity searches.

## 1. Setup

Install dependencies once per environment.

In [None]:
# Uncomment and run once per environment to install the notebook's dependencies.
# Tip: inside Jupyter, `%pip install -r requirements.txt` targets the running kernel's environment.
# !python3 -m pip install -r requirements.txt

Configure PostgreSQL connection details. Update these placeholders with your actual database credentials or load them from environment variables.

In [None]:
import os

# PostgreSQL connection settings. Each value is read from the environment
# variable of the same name and falls back to a local-development default.
_env = os.environ
PGHOST = _env.get("PGHOST", "localhost")
PGPORT = _env.get("PGPORT", "5432")
PGDATABASE = _env.get("PGDATABASE", "postgres")
PGUSER = _env.get("PGUSER", "postgres")
# Empty string when unset; get_connection() passes an empty password as None.
PGPASSWORD = _env.get("PGPASSWORD", "")

## 2. Imports and Constants

In [None]:
import pandas as pd
import psycopg
from pgvector.psycopg import register_vector
from psycopg.rows import dict_row
from sentence_transformers import SentenceTransformer

# Sentence Transformers checkpoint used by encode_documents() by default.
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
# Dimension of the `embedding VECTOR(...)` column created by ensure_schema();
# it has to match the size of the vectors the embedding model produces.
VECTOR_DIM = 768
# CSV file the artist records are loaded from.
DATA_PATH = "artist_training_data.csv"

## 3. Helper Functions

In [None]:
def get_connection():
    """Open a psycopg connection with the pgvector adapters registered.

    Connection settings come from the PGHOST / PGPORT / PGDATABASE / PGUSER /
    PGPASSWORD notebook constants. An empty password is passed as ``None``.

    The ``vector`` extension is created (if missing) *before* the adapters are
    registered: ``register_vector`` looks up the ``vector`` type in the
    database and raises when the extension has not been installed yet, which
    would otherwise make the very first run against a fresh database fail
    before ``ensure_schema`` gets a chance to create it.

    Returns:
        An open connection ready to send and receive pgvector values.

    Raises:
        Any exception raised while connecting, creating the extension, or
        registering the adapters. If the failure happens after the connection
        was opened, the connection is closed before the exception propagates.
    """
    conn = psycopg.connect(
        host=PGHOST,
        port=PGPORT,
        dbname=PGDATABASE,
        user=PGUSER,
        password=PGPASSWORD or None,  # "" -> None
    )
    try:
        # The vector type must exist before register_vector() can look it up.
        conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
        # Commit so the extension persists and the connection is handed back
        # outside of any open transaction.
        conn.commit()
        register_vector(conn)
    except Exception:
        # Do not leak a half-configured connection.
        conn.close()
        raise
    return conn

In [None]:
def ensure_schema(conn):
    """Create the database objects the notebook relies on, then commit.

    Runs, in order and only if the object is missing: the ``vector``
    extension, the ``artist_embeddings`` table (with an ``embedding`` column
    of ``VECTOR_DIM`` dimensions), and a unique index on ``name``.

    Args:
        conn: open database connection; ``conn.commit()`` is called at the end.
    """
    create_extension_sql = "CREATE EXTENSION IF NOT EXISTS vector"
    create_table_sql = f"""
            CREATE TABLE IF NOT EXISTS artist_embeddings (
                id SERIAL PRIMARY KEY,
                name TEXT UNIQUE,
                genre TEXT,
                location TEXT,
                email TEXT,
                instagram TEXT,
                nonprofit_interest TEXT,
                artist_bio TEXT,
                nonprofit_reasoning TEXT,
                embedding VECTOR({VECTOR_DIM})
            )
            """
    create_index_sql = (
        "CREATE UNIQUE INDEX IF NOT EXISTS idx_artist_embeddings_name ON artist_embeddings (name)"
    )

    with conn.cursor() as cursor:
        for statement in (create_extension_sql, create_table_sql, create_index_sql):
            cursor.execute(statement)
    conn.commit()

In [None]:
def build_documents(records, separator="\n"):
    """Turn artist records into one text document per record for embedding.

    For each record, every non-empty field among name, genre, location,
    nonprofit_interest, artist_bio and nonprofit_reasoning becomes a
    ``"Label: value"`` fragment (label = field name with underscores replaced
    by spaces, title-cased). Fragments are joined with ``separator`` so that
    adjacent fields do not run into each other.

    Args:
        records: iterable of dict-like rows (anything with ``.get``), e.g.
            ``DataFrame.to_dict("records")``.
        separator: text placed between fragments. Defaults to a newline.

    Returns:
        A list of strings, one per record, in input order. A record with no
        usable fields yields an empty string.
    """
    fields = (
        "name",
        "genre",
        "location",
        "nonprofit_interest",
        "artist_bio",
        "nonprofit_reasoning",
    )
    # Labels depend only on the field name, so compute them once.
    labelled_fields = [(key, key.replace("_", " ").title()) for key in fields]

    docs = []
    for row in records:
        parts = []
        for key, label in labelled_fields:
            value = row.get(key)
            # Skip missing keys, NaN/None cells and empty strings.
            if pd.notna(value) and value:
                parts.append(f"{label}: {value}")
        docs.append(separator.join(parts))
    return docs

In [None]:
def encode_documents(documents, model_name=MODEL_NAME):
    """Load a Sentence Transformers model and embed the given documents.

    Args:
        documents: texts to embed.
        model_name: checkpoint to load; defaults to ``MODEL_NAME``.

    Returns:
        A ``(model, embeddings)`` pair: the loaded model (so callers can embed
        queries with the same model) and the embeddings, requested as a NumPy
        array with normalization enabled.
    """
    encoder = SentenceTransformer(model_name)
    vectors = encoder.encode(
        documents,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return encoder, vectors

In [None]:
def upsert_embeddings(conn, df, embeddings):
    """Insert or update one ``artist_embeddings`` row per DataFrame row.

    Rows are keyed on ``name``: when a row with the same name already exists,
    all other columns (including the embedding) are overwritten. Missing
    values (NaN/None) are stored as SQL NULL, and metadata columns absent from
    ``df`` are stored as NULL as well. The transaction is committed at the end.

    The INSERT statement is generated from a single column list so the number
    of ``%s`` placeholders always matches the number of parameters.

    Args:
        conn: open psycopg connection with the pgvector adapters registered.
        df: DataFrame of artists; must contain a ``name`` column.
        embeddings: iterable of vectors aligned row-for-row with ``df``.

    Raises:
        ValueError: if ``df`` has no ``name`` column, or if the number of
            embeddings differs from the number of rows.
    """
    metadata_columns = (
        "name",
        "genre",
        "location",
        "email",
        "instagram",
        "nonprofit_interest",
        "artist_bio",
        "nonprofit_reasoning",
    )

    def sanitize(value):
        # pandas marks missing cells as NaN; store them as NULL instead.
        if pd.isna(value):
            return None
        return value

    if "name" not in df.columns:
        raise ValueError("df must contain a 'name' column (it is the upsert key)")

    vectors = list(embeddings)
    if len(vectors) != len(df):
        # zip() would silently drop the unmatched tail.
        raise ValueError(
            f"got {len(vectors)} embeddings for {len(df)} rows; they must match"
        )

    insert_columns = metadata_columns + ("embedding",)
    column_list = ", ".join(insert_columns)
    placeholders = ", ".join(["%s"] * len(insert_columns))
    assignments = ", ".join(
        f"{column} = EXCLUDED.{column}"
        for column in insert_columns
        if column != "name"
    )
    statement = (
        f"INSERT INTO artist_embeddings ({column_list}) "
        f"VALUES ({placeholders}) "
        f"ON CONFLICT (name) DO UPDATE SET {assignments}"
    )

    rows = [
        tuple(sanitize(record.get(column)) for column in metadata_columns) + (vector,)
        for record, vector in zip(df.to_dict("records"), vectors)
    ]

    with conn.cursor() as cur:
        if rows:
            cur.executemany(statement, rows)
    conn.commit()

In [None]:
def search_similar(conn, model, query, limit=5):
    """Return the stored artists closest to a free-text query.

    The query is embedded with ``model`` (NumPy output, normalization on) and
    compared against ``artist_embeddings.embedding`` using pgvector's ``<=>``
    operator (cosine distance per the pgvector documentation); smaller
    distances come first.

    Args:
        conn: open psycopg connection with the pgvector adapters registered.
        model: object whose ``encode`` embeds a list of texts.
        query: text to search for.
        limit: maximum number of rows to return.

    Returns:
        A list of dict rows holding the artist metadata columns plus a
        ``distance`` key.
    """
    nearest_sql = """
            SELECT
                name,
                genre,
                location,
                email,
                instagram,
                nonprofit_interest,
                artist_bio,
                nonprofit_reasoning,
                embedding <=> %s AS distance
            FROM artist_embeddings
            ORDER BY embedding <=> %s
            LIMIT %s
            """
    query_vectors = model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    query_vector = query_vectors[0]
    # The vector is bound twice: once for the selected distance, once for ORDER BY.
    params = (query_vector, query_vector, limit)

    with conn.cursor(row_factory=dict_row) as cursor:
        cursor.execute(nearest_sql, params)
        matches = cursor.fetchall()
    return matches

## 4. Load and Embed Dataset

In [None]:
# Read the artist CSV into a DataFrame.
artists_df = pd.read_csv(DATA_PATH)
# Build one text document per artist row; these are what gets embedded.
documents = build_documents(artists_df.to_dict("records"))
# Cell output: number of documents built.
len(documents)

In [None]:
# Load the embedding model and encode every document. `model` is kept so the
# similarity queries below can embed their text with the same model.
model, embeddings = encode_documents(documents)
# Cell output: shape of the embeddings array.
embeddings.shape

In [None]:
# Make sure the extension/table/index exist, then insert or update one row per artist.
with get_connection() as conn:
    ensure_schema(conn)
    upsert_embeddings(conn, artists_df, embeddings)

## 5. Run Similarity Queries

In [None]:
# Free-text description to search for, and the maximum number of matches to return.
query = "folk singer passionate about environmental conservation"
limit = 5
with get_connection() as conn:
    results = search_similar(conn, model, query, limit)
# Cell output: the raw result rows.
results

In [None]:
# Print a ranked, human-readable summary of each match in `results`.
for idx, row in enumerate(results, start=1):
    distance = row.get("distance", 0.0)
    print(f"{idx}. {row['name']} (distance={distance:.4f})")
    for key, value in row.items():
        # Name and distance are already in the heading line; empty fields are omitted.
        if key not in ("name", "distance") and value not in (None, ""):
            print(f"   {key.replace('_', ' ').title()}: {value}")