In [12]:
from dotenv import load_dotenv
load_dotenv()
import os
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
import json

In [4]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pc.create_index(
    name="linkedin-profiles", 
    dimension=1536,  # text-embedding-3-small dimension
    metric="cosine", 
    spec=ServerlessSpec(cloud="aws", region="us-east-1")
)

In [5]:
sample_profile = {
    "contact": {
        "name": "John Doe",
        "email": "john@example.com",
        "location": "San Francisco Bay Area"
    },
    "summary": "Experienced software engineer...",
    "experience": [
        {
            "title": "Senior Software Engineer",
            "company": "Tech Corp",
            "dates": {"start": "Jan 2020", "end": "Present"},
            "description": "Led development of..."
        }
    ],
    "education": [
        {
            "school": "Stanford University",
            "degree": "Master's",
            "field": "Computer Science",
            "dates": {"start": "Sep 2015", "end": "Jun 2017"}
        }
    ],
    "skills": ["Python", "Machine Learning", "Cloud Computing"],
    "certifications": [
        {
            "name": "AWS Solutions Architect",
            "issuer": "Amazon Web Services",
            "date": "Jan 2023",
            "expires": "Jan 2026"
        }
    ]
}

In [7]:
def profile_to_text(profile):
    """Convert profile sections to a single text for embedding"""
    text_parts = []
    
    # Add contact info
    if "contact" in profile:
        contact = profile["contact"]
        text_parts.append(f"Name: {contact.get('name', '')}")
        text_parts.append(f"Location: {contact.get('location', '')}")
    
    # Add summary
    if "summary" in profile:
        text_parts.append(f"Summary: {profile['summary']}")
    
    # Add experience
    if "experience" in profile:
        text_parts.append("Experience:")
        for exp in profile["experience"]:
            text_parts.append(
                f"{exp['title']} at {exp['company']} "
                f"({exp['dates']['start']} - {exp['dates']['end']})\n"
                f"{exp['description']}"
            )
    
    # Add education
    if "education" in profile:
        text_parts.append("Education:")
        for edu in profile["education"]:
            text_parts.append(
                f"{edu['degree']} in {edu['field']} from {edu['school']} "
                f"({edu['dates']['start']} - {edu['dates']['end']})"
            )
    
    # Add skills
    if "skills" in profile:
        text_parts.append(f"Skills: {', '.join(profile['skills'])}")
    
    # Add certifications
    if "certifications" in profile:
        text_parts.append("Certifications:")
        for cert in profile["certifications"]:
            text_parts.append(
                f"{cert['name']} from {cert['issuer']} "
                f"(Issued: {cert['date']}, Expires: {cert.get('expires', 'N/A')})"
            )
    
    return "\n\n".join(text_parts)

In [13]:
processed_profiles = []
client = OpenAI()

# Convert profile to embeddings
# ... existing code ...

# Convert profile to embeddings
text = profile_to_text(sample_profile)
response = client.embeddings.create(
    input=text,
    model="text-embedding-3-small",
)
embedding = response.data[0].embedding

# Create vector record with serialized profile
processed_profiles.append({
    "values": embedding,
    "id": sample_profile["contact"]["name"],  # Use name as ID
    "metadata": {
        "profile": json.dumps(sample_profile),  # Convert profile to JSON string
        "text": text  # Store processed text for reference
    }
})

# ... existing code ...

In [9]:
processed_profiles[0]

{'values': [0.019412713,
  -0.022112979,
  0.040163394,
  0.015471786,
  0.020823663,
  -0.018634258,
  -0.0041324995,
  0.05239973,
  -0.021018276,
  -0.015897503,
  0.016809756,
  -0.007985242,
  -0.06539019,
  0.010490893,
  0.04619642,
  0.07964564,
  -0.045247678,
  0.013610793,
  -0.01345267,
  0.014900109,
  0.015994811,
  0.019096466,
  0.019826267,
  -0.020020882,
  0.01881671,
  -0.02251437,
  0.0019050854,
  0.040747236,
  0.0015614706,
  -0.0063492707,
  0.042474434,
  -0.012880992,
  -0.042474434,
  0.054637786,
  0.03814428,
  -0.005087323,
  0.03006781,
  -0.01134841,
  0.0027823672,
  -0.017867967,
  -0.0008506746,
  -0.046318054,
  0.022258937,
  0.034276333,
  -0.020446599,
  0.004558217,
  -0.019704634,
  -0.0069635203,
  0.038776774,
  0.05906525,
  0.010089503,
  -0.029824544,
  0.04855611,
  0.020337129,
  -0.05527028,
  -0.021821057,
  0.044907104,
  0.0037402315,
  -0.0078088734,
  -0.028997436,
  0.01378108,
  -0.01773417,
  -0.017466577,
  0.020227658,
  -0.00

In [14]:
index = pc.Index('linkedin-profiles')
index.upsert(
    vectors=processed_profiles,
    namespace="profiles"  # Use a dedicated namespace for profiles
)

{'upserted_count': 1}

In [15]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}