In [2]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.0.201-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting openapi-schema-pydantic<2.0,>=1.2
  Using cached openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
Collecting langchainplus-sdk>=0.0.9
  Using cached langchainplus_sdk-0.0.10-py3-none-any.whl (21 kB)
Collecting numexpr<3.0.0,>=2.8.4
  Using cached numexpr-2.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (381 kB)
Collecting pydantic<2,>=1
  Using cached pydantic-1.10.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Collecting dataclasses-json<0.6.0,>=0.5.7
  Using cached dataclasses_json-0.5.8-py3-none-any.whl (26 kB)
Collecting marshmallow-enum<2.0.0,>=1.5.1
  Using cached marshmallow_enum-1.5.1-py2.py3-none-any.whl (4.2 kB)
Collecting typing-inspect>=0.4.0
  Using cached typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Col

In [79]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def get_topk_similar_articles(embeddings, article_index, k, negativSimilartiy=False):
    """
    Get the k articles most similar (or most anti-similar) to a given article.

    Similarity is the cosine similarity between rows of the embeddings matrix.

    embeddings: 2-D array-like, one embedding vector (row) per article
    article_index: row index of the target article (negative values count from the end)
    k: maximum number of articles to return
    negativSimilartiy: if True, return the k articles with the LOWEST similarity
                       (most anti-similar first) instead of the highest
    return: list of (article index, similarity score) tuples, best match first.
            The target article itself is never part of the result, so at most
            len(embeddings) - 1 entries are returned.

    Raises ValueError if k is negative and IndexError if article_index is out of range.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # Nested lists are converted so they can be sliced by row; objects that
    # already expose a shape (arrays, sparse matrices) are passed through as-is.
    matrix = embeddings if hasattr(embeddings, "shape") else np.asarray(embeddings)

    # range() normalises negative indices and raises IndexError when out of bounds
    target = range(matrix.shape[0])[article_index]

    # Only the target's row of the similarity matrix is needed, so compare that
    # single row against all rows instead of building the full n x n matrix.
    article_scores = cosine_similarity(matrix[target:target + 1], matrix)[0]

    # argsort returns the indices of the sorted array (ascending); negating the
    # scores yields descending order. A stable sort keeps ties in index order.
    sort_keys = article_scores if negativSimilartiy else -article_scores
    sorted_indices = np.argsort(sort_keys, kind="stable")

    # Remove the target explicitly BEFORE truncating to k. Relying on the target
    # being ranked first breaks when another article has the same embedding.
    sorted_indices = sorted_indices[sorted_indices != target][:k]

    return [(index, article_scores[index]) for index in sorted_indices]

In [5]:
# Create fake embeddings for testing

from langchain.embeddings import FakeEmbeddings

# FakeEmbeddings returns random vectors; in langchain 0.0.x they presumably come
# from NumPy's global RNG (verify for other versions). Seeding it makes the
# similarity scores printed further down reproducible on a fresh kernel.
EMBEDDING_SEED = 42
np.random.seed(EMBEDDING_SEED)

embeddings = FakeEmbeddings(size=3)

# The texts are arbitrary placeholders; "foo" appears twice on purpose.
fake_documents = [
    "foo",
    "test",
    "asfd",
    "helloworld",
    "sdff",
    "sdfddf",
    "akljsdfkl asdölkf jasl kö",
    "foo",
]
embd = embeddings.embed_documents(fake_documents)

# Sanity check: one embedding vector per input document
len(embd)

8


In [81]:
# Demo: query the most similar and the most anti-similar articles for one target

target_article = 0
k = 3

# Run the same query twice: first ranked by highest similarity, then by
# lowest (anti-similar). Each mode gets its own headline.
for anti_similar, headline in (
    (False, f"Top {k} Articles for Index: {target_article}:"),
    (True, f"\nTop {k} anti-similar Articles for Index: {target_article}:"),
):
    top_articles = get_topk_similar_articles(
        embd, target_article, k, negativSimilartiy=anti_similar
    )

    print(headline)
    # One line per returned (index, score) pair, best match first
    for article_index, similarity in top_articles:
        print(f"Article Index: {article_index}, Similarity Score: {similarity}")

Top 3 Articles for Index: 0:
Article Index: 2, Similarity Score: 0.7735193714512036
Article Index: 3, Similarity Score: 0.6222479909491638
Article Index: 1, Similarity Score: 0.5624545994965346

Top 3 anti-similar Articles for Index: 0:
Article Index: 5, Similarity Score: -0.8600980184222545
Article Index: 6, Similarity Score: -0.2821049463712549
Article Index: 4, Similarity Score: -0.27165084998091704
