#### Extracting User Interests and Topics

**Objective**

Analyze LinkedIn profile text to extract structured lists of interests and topics. Use OpenAI’s GPT-4 model for semantic understanding and embeddings.

In [None]:
from langchain.document_loaders import TextLoader
from langchain.chains import load_summarize_chain
from langchain.llms import OpenAI


def extract_user_interests_and_topics(linkedin_profile_text):
    """
    Extract user interests and learning topics from LinkedIn profile text.

    The profile text is condensed with a LangChain "refine" summarization
    chain, the same LLM is then asked to list five interests and five topics
    from that summary, and the reply is handed to
    ``parse_interests_and_topics``.

    Args:
        linkedin_profile_text (str): The text of the user's LinkedIn profile.

    Returns:
        dict: ``{"interests": [...], "topics": [...]}``. Both lists are empty
        when the profile text is empty or whitespace-only. No embeddings are
        computed by this function.
    """
    # Nothing to analyse: skip the LLM round-trips entirely.
    if not linkedin_profile_text or not linkedin_profile_text.strip():
        return {"interests": [], "topics": []}

    # Imported locally under an alias: a later cell in this file rebinds the
    # bare name `OpenAI` to the client class from the `openai` package, which
    # would otherwise break this function when it is called again.
    from langchain.docstore.document import Document
    from langchain.llms import OpenAI as LangChainOpenAI

    # TextLoader reads from a file path and cannot be given raw text, so the
    # profile is wrapped in a Document directly.
    documents = [Document(page_content=linkedin_profile_text)]

    # Load summarization chain
    # NOTE(review): "gpt-4" is a chat model; confirm the installed LangChain
    # version accepts it through this completion-style wrapper.
    llm = LangChainOpenAI(model="gpt-4", temperature=0.5, max_tokens=1500)
    chain = load_summarize_chain(llm, chain_type="refine")

    # Generate summarized insights
    summary = chain.run(documents)

    # Ask for a fixed, line-oriented format so the reply can be parsed.
    interests_prompt = (
        f"""Given the following text, extract five key personal interests as keywords
        and five potential topics for further learning.
        Text: {summary}
        Format:
        - Interests: 1, 2, 3, 4, 5
        - Topics: 1, 2, 3, 4, 5
        """
    )
    insights = llm(interests_prompt)

    # Turn the free-text reply into two lists
    interests, topics = parse_interests_and_topics(insights)
    return {"interests": interests, "topics": topics}


def parse_interests_and_topics(insights):
    """Parse the "Interests" and "Topics" lines of an LLM reply into two lists.

    The expected shape is the one requested in the extraction prompt::

        - Interests: a, b, c
        - Topics: x, y, z

    Labels are matched case-insensitively; leading indentation, list bullets
    and markdown emphasis around the label are ignored. Everything after the
    first colon is split on commas, and blank items are dropped.

    Args:
        insights (str): Raw text returned by the LLM.

    Returns:
        tuple: ``(interests, topics)``, each a list of stripped strings. A
        list is empty when no matching line is present.
    """
    parsed = {"interests": [], "topics": []}

    for raw_line in (insights or "").splitlines():
        # Drop indentation and a leading bullet before looking for the label.
        line = raw_line.strip().lstrip("-*\u2022 \t")
        label, colon, values = line.partition(":")
        key = label.strip(" *_\t").casefold()
        if not colon or key not in parsed:
            continue
        parsed[key].extend(
            item.strip() for item in values.split(",") if item.strip()
        )

    return parsed["interests"], parsed["topics"]

# Example usage: run the extraction on a one-sentence sample profile and print
# the resulting {"interests": ..., "topics": ...} dictionary.
# Note: this executes at module/cell level and invokes the LLM-backed
# extract_user_interests_and_topics, so it needs working OpenAI credentials.
linkedin_text = "Experienced data scientist skilled in machine learning, data analysis, and cloud computing."
user_data = extract_user_interests_and_topics(linkedin_text)
print(user_data)

#### Embedding and Storing RSS Feed Content

**Objective**

Embed RSS feed content using OpenAI’s embedding models and store them in Weaviate for efficient retrieval.

In [None]:
import weaviate
from openai import OpenAI


def embed_and_store_rss_content(rss_content_list, weaviate_client, collection_name):
    """
    Embed RSS content and store embeddings in Weaviate vector database.

    Each string is embedded with OpenAI's ``text-embedding-ada-002`` model and
    written to Weaviate as an object whose only property is ``content``, with
    the embedding attached as the object's vector.

    Args:
        rss_content_list (list): List of RSS content strings.
        weaviate_client (weaviate.Client): Weaviate client instance.
        collection_name (str): Name of the Weaviate collection to store embeddings.

    Returns:
        None
    """
    # Nothing to do: avoid creating an API client or opening a batch.
    if not rss_content_list:
        return

    # Imported locally under an alias so the function does not depend on
    # which `OpenAI` name (LangChain's or the openai package's) is currently
    # bound at module level.
    from openai import OpenAI as OpenAIClient

    # The model is chosen per request, not on the client constructor.
    openai_client = OpenAIClient()

    # The batch context manager sends any pending objects when the block exits.
    with weaviate_client.batch as batch:
        for content in rss_content_list:
            response = openai_client.embeddings.create(
                model="text-embedding-ada-002",
                input=content,
            )
            # Weaviate expects the raw list of floats, not the response object.
            embedding = response.data[0].embedding

            batch.add_data_object(
                data_object={"content": content},
                class_name=collection_name,
                vector=embedding,
            )

# Example usage: embed two sample headlines and store them in the "RSSContent"
# collection of a Weaviate instance at http://localhost:8080.
# `client` is bound at module level here and is reused by the later cells.
rss_content = ["Breaking news about AI breakthroughs", "New advancements in quantum computing"]
client = weaviate.Client("http://localhost:8080")
embed_and_store_rss_content(rss_content, client, "RSSContent")

#### Retrieving Personalized Content

**Objective**

Retrieve RSS feed content relevant to a user’s interests using hybrid queries in Weaviate.

In [None]:
from typing import List, Dict


def retrieve_personalized_rss_feeds(weaviate_client, user_interests, collection_name, alpha=0.5, limit=5, properties=("content",)):
    """
    Retrieve personalized RSS feeds based on user interests.

    One hybrid query is issued per interest. The query explicitly requests
    the object properties and the ``_additional { score }`` field, because
    only requested fields are present in a GraphQL response.

    Args:
        weaviate_client (weaviate.Client): Weaviate client instance.
        user_interests (list): List of user interests as keywords.
        collection_name (str): Weaviate collection name.
        alpha (float): Hybrid query parameter for relevance weighting.
        limit (int): Number of results per interest.
        properties (Sequence[str]): Object properties to request. Defaults to
            ``("content",)``, the only property written by
            ``embed_and_store_rss_content``. Pass ``("title", "content")``
            for collections that also store a title.

    Returns:
        dict: Retrieved RSS content categorized by interest. Each entry is a
        list of ``{"title", "content", "score"}`` dictionaries; a property
        that was not requested or is missing on an object is ``None``.
    """
    # NOTE(review): the hybrid query is given only the query text. Objects in
    # this file are stored with externally computed vectors, so confirm the
    # collection has a vectorizer configured (or pass a query vector).
    requested = list(properties)
    personalized_feeds = {}

    for interest in user_interests:
        response = (
            weaviate_client.query
            .get(collection_name, requested)
            .with_hybrid(query=interest, alpha=alpha)
            .with_additional(["score"])
            .with_limit(limit)
            .do()
        )

        personalized_feeds[interest] = [
            {
                "title": item.get("title"),
                "content": item.get("content"),
                "score": item["_additional"]["score"],
            }
            for item in response["data"]["Get"][collection_name]
        ]

    return personalized_feeds

# Example usage: query the "RSSContent" collection once per interest, using
# the module-level Weaviate `client` created in the earlier cell, and print
# the results grouped by interest.
user_interests = ["AI", "Machine Learning"]
feeds = retrieve_personalized_rss_feeds(client, user_interests, "RSSContent")
print(feeds)

#### Evaluating Recommendations with LLM-Based Relevance Scoring
**Objective**

Assess the quality of recommendations by computing relevance scores: the cosine similarity between the embedding of a user interest and the embedding of each retrieved RSS item.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_relevance_scores(user_embedding, rss_embeddings):
    """
    Compute relevance scores for retrieved RSS content based on cosine similarity.

    All scores are computed in one matrix product. Inputs may be NumPy arrays
    or plain sequences of numbers (for example the lists returned by an
    embedding API); each embedding is flattened to one row.

    Args:
        user_embedding (np.ndarray): Embedding vector of the user's interest or topic.
        rss_embeddings (List[np.ndarray]): List of embedding vectors for RSS content.

    Returns:
        List[float]: Relevance scores for each RSS content, in input order.
        An empty input yields an empty list. An all-zero vector on either
        side scores 0.0 instead of NaN.

    Raises:
        ValueError: If the embeddings do not share the user embedding's length.
    """
    rss_embeddings = list(rss_embeddings)
    if len(rss_embeddings) == 0:
        return []

    user_vec = np.asarray(user_embedding, dtype=float).reshape(-1)
    # One row per RSS item, whatever the original shape of each embedding.
    rss_matrix = np.asarray(rss_embeddings, dtype=float).reshape(len(rss_embeddings), -1)

    if rss_matrix.shape[1] != user_vec.shape[0]:
        raise ValueError(
            f"Embedding size mismatch: user has {user_vec.shape[0]} dimensions, "
            f"RSS embeddings have {rss_matrix.shape[1]}"
        )

    user_norm = np.linalg.norm(user_vec)
    rss_norms = np.linalg.norm(rss_matrix, axis=1)

    # A zero vector has no direction; divide by 1 so its score is 0.0, not NaN.
    denominators = np.where(rss_norms == 0.0, 1.0, rss_norms) * (user_norm if user_norm else 1.0)

    scores = (rss_matrix @ user_vec) / denominators
    return scores.tolist()

def evaluate_recommendations(user_embedding, retrieved_content, k=5, relevance_threshold=0.8):
    """
    Evaluate the quality of recommendations.

    Each retrieved item is scored with ``calculate_relevance_scores`` and
    counted as relevant when its score is at least ``relevance_threshold``.

    Args:
        user_embedding (np.ndarray): Embedding vector of the user's interest or topic.
        retrieved_content (List[dict]): Retrieved RSS content with embeddings and
            metadata. Every item must carry an ``'embedding'`` key.
        k (int): Cut-off rank for Precision@K and Recall@K. Defaults to 5.
        relevance_threshold (float): Minimum score for an item to count as
            relevant. Defaults to 0.8.

    Returns:
        dict: Evaluation metrics including mean relevance score, precision@K, and
        recall@K. Precision divides by ``k`` even when fewer than ``k`` items
        were retrieved. Recall is measured against the relevant items within
        ``retrieved_content`` only, not against the whole collection. All
        three metrics are 0.0 when ``retrieved_content`` is empty.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # Avoid averaging an empty list, which would produce NaN and a warning.
    if not retrieved_content:
        return {
            "Mean Relevance Score": 0.0,
            "Precision@K": 0.0,
            "Recall@K": 0.0
        }

    # Extract RSS embeddings
    rss_embeddings = [item['embedding'] for item in retrieved_content]
    relevance_scores = calculate_relevance_scores(user_embedding, rss_embeddings)

    # Mean Relevance Score
    mean_score = np.mean(relevance_scores)

    # Flag each item as relevant or not, in retrieval order
    relevant_items = [score >= relevance_threshold for score in relevance_scores]
    relevant_in_top_k = sum(relevant_items[:k])
    total_relevant_items = sum(relevant_items)

    # Precision@K
    precision_at_k = relevant_in_top_k / k

    # Recall@K (0.0 when nothing is relevant, to avoid dividing by zero)
    recall_at_k = relevant_in_top_k / total_relevant_items if total_relevant_items > 0 else 0.0

    return {
        "Mean Relevance Score": mean_score,
        "Precision@K": precision_at_k,
        "Recall@K": recall_at_k
    }


In [None]:

# End-to-end run of the three stages defined above. Everything executes at
# module/cell level; the evaluation helpers are not invoked here.

# Step 1: Extract User Interests and Topics
# Rebinds the module-level `user_data` with the result for this sample profile.
linkedin_profile_text = "Experienced in AI, ML, and data science. Passionate about cloud computing."
user_data = extract_user_interests_and_topics(linkedin_profile_text)

# Step 2: Embed and Store RSS Feed Content
# `client` is the Weaviate client created in the earlier embedding cell.
rss_content = ["AI breakthroughs", "Quantum computing advancements"]
embed_and_store_rss_content(rss_content, client, "RSSContent")

# Step 3: Retrieve Personalized Feeds
# One query is issued per extracted interest; results are keyed by interest.
personalized_feeds = retrieve_personalized_rss_feeds(client, user_data["interests"], "RSSContent")
print(personalized_feeds)

