In [1]:
!pip install langchain

import os
import pandas as pd
from langchain.embeddings import FakeEmbeddings
from scipy import spatial

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import pickle


Collecting langchain
  Downloading langchain-0.0.229-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting openapi-schema-pydantic<2.0,>=1.2
  Using cached openapi_schema_pydantic-1.2.4-py3-none-any.whl (90 kB)
Collecting numexpr<3.0.0,>=2.8.4
  Using cached numexpr-2.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (381 kB)
Collecting pydantic<2,>=1
  Using cached pydantic-1.10.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Collecting langchainplus-sdk<0.0.21,>=0.0.20
  Using cached langchainplus_sdk-0.0.20-py3-none-any.whl (25 kB)
Collecting dataclasses-json<0.6.0,>=0.5.7
  Using cached dataclasses_json-0.5.9-py3-none-any.whl (26 kB)
Collecting typing-inspect>=0.4.0
  Using cached typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting marshmallow<4.0.0,>=3.3.0
  Using cached marshmallow-3.19.0-py3-none-any.whl (49 kB)
Collecting marsh

In [2]:
def get_topk_similar_articles(embeddings, article_index, k, anti_similarity=False):
    """
    Get the top k news articles with highest similarity for a given index
    
    embeddings: embeddings matrix
    article_index: target article
    k = get the top k articles
    anti_similarity: if anti similar values should be used
    return top articles for given index
    """

    # Calculate pairwise cosine similarity with sklearn library
    similarity_scores = cosine_similarity(embeddings)
    
    # get similarity scores for the given article index
    article_scores = similarity_scores[article_index]
        
    # get anti_similarity or positive ones
    # argsort returns the indices of the sorted array
    if anti_similarity:
        # [:k] to return only the top k article indices 
        sorted_indices = np.argsort(article_scores)[:k] 
    else:
        # [::-1] to order in decending order
        # 1:k+1 to exclude the target-article itself (similartiy = 1)
        sorted_indices = np.argsort(article_scores)[::-1][1:k+1] 
    
    # Get the top k (anti-) similar articles
    top_articles = [(index, article_scores[index]) for index in sorted_indices if index != article_index]
 
    return top_articles

In [3]:
def calculated_cosine_similarity(embeddings):
    ''' Calculate pairwise similarities and return them '''
    # Calculate pairwise cosine similarity with sklearn library
    similarity_scores = cosine_similarity(embeddings)
    return similarity_scores

def get_topk_similar_articles_precalculated(similarity_scores, article_index, k, anti_similarity=False):
    """
    Get the top k similar articles for a given article_index from the precomputed cos-similarties
    """
    
    # get similarity scores for the given article index
    article_scores = similarity_scores[article_index]
        
    # get anti_similarity or positive ones
    # argsort returns the indices of the sorted array
    if anti_similarity:
        # [:k] to return only the top k article indices 
        sorted_indices = np.argsort(article_scores)[:k] 
    else:
        # [::-1] to order in decending order
        # 1:k+1 to exclude the target-article itself (similartiy = 1)
        sorted_indices = np.argsort(article_scores)[::-1][1:k+1] 
    
    # Get the top k (anti-) similar articles
    top_articles = [(index, article_scores[index]) for index in sorted_indices if index != article_index]
 
    return top_articles

def get_similarty_for_two_articles(similarity_scores, article_index, compare_index):
    article_scores = similarity_scores[article_index]
    return article_scores[compare_index]

In [4]:
cleaned_articles = pd.read_csv("../data/cleaned_articles.csv", index_col=False).drop("Unnamed: 0", axis = 1)

study_articles = pd.read_csv("../data/rating_similarity.csv", index_col=False).drop("Unnamed: 0", axis = 1)

print(len(cleaned_articles))
print(len(study_articles))

5916
2955


In [5]:
cleaned_articles.loc[cleaned_articles['article_id'] == "FALTER_20151223BA00BC1175"]

Unnamed: 0,article_id,title,paragraphs,ressort,authors,date
14,FALTER_20151223BA00BC1175,DIE BLACK STREET BOYS,"['Harald Mahrer: Der Nachdenkliche', 'Der Älte...",Politik,"josef redl, barbara tóth",2015-12-23 00:00:00+00:00


In [6]:
'''
Get article_id's of compared articles for a specific article 
'''
def get_articles_for_base_article(base_article):
    list = study_articles.loc[study_articles['article_id'] == base_article]
    compared_article_ids = list.compared_article.values
    return compared_article_ids


# --------------------------------------------------------
def get_index_for_articleid(article_id):
    return cleaned_articles.loc[cleaned_articles['article_id'] == article_id].index.values[0]



# --------------------------------------------------------
def intersection(list_a, list_b):
    return [ e for e in list_a if e in list_b ]



# --------------------------------------------------------
# get list of based-articles
def get_base_articles():
        list = study_articles['article_id']
        return set(list)
#for item in base_articles:
#    falter_articles_ids = get_articles_for_base_article(item)

In [7]:

def intersection_calculation(embeddings_file, embeddings_file2=0, use_second_embeddings=True, num_articles=15):
    ''' Intersection calculation for embeddings with the labeled dataset '''
    # -> calculate embeddings
    # -> function to get index from article_id # get_index_for_articleid(article_id)
    # -> base_articles # get_base_articles()
    # -> compared_articles for a article()
    # -> similarities from embeddings
    # -> compare top 15 articles
    
    open_file = open(embeddings_file, "rb")
    embeddings = pickle.load(open_file)
    open_file.close()
    similarity_scores = calculated_cosine_similarity(embeddings)
    max_intersec = 0
    num_same_articles = 0
    
    if use_second_embeddings:
        base_articles = get_base_articles() # article_id of the articles that got compared

        for article in base_articles:
            labeled_dataset_articles = get_articles_for_base_article(article) # article ids for a base article
            recommended_articles_with_sim = get_topk_similar_articles_precalculated(similarity_scores, get_index_for_articleid(article), num_articles, False)

            indices_calculated = [item[0] for item in recommended_articles_with_sim]
            indices_labled_dataset = [get_index_for_articleid(item) for item in labeled_dataset_articles]
            intersec = intersection(indices_labled_dataset, indices_calculated)
            num_same_articles += len(intersec)
        
        max_intersec = (len(base_articles)*15)

    else:
        open_file = open(embeddings_file2, "rb")
        embeddings2 = pickle.load(open_file)
        open_file.close()
        similarity_scores2 = calculated_cosine_similarity(embeddings2)
        
        for i in range(5916):
            recommended_articles_with_sim = get_topk_similar_articles_precalculated(similarity_scores, i, num_articles, False)
            recommended_articles_with_sim2 = get_topk_similar_articles_precalculated(similarity_scores2, i, num_articles, False)
            indices_calculated = [item[0] for item in recommended_articles_with_sim]
            indices_calculated2 = [item[0] for item in recommended_articles_with_sim2]
            intersec = intersection(indices_calculated, indices_calculated2)
            num_same_articles += len(intersec)
            
        max_intersec = (5916*15)

    print('Number of articles that are in in the intersection: ', num_same_articles)
    coverage = "Intersection in percentage: {:.2f}%".format(100/max_intersec * num_same_articles)
    print(coverage)
    print("\n")

In [8]:
#197 Base articles => 197 * 15 = 2955 articles in theory 

print('OpenAI 80 Words:')
intersection_calculation("../data/openaiembed.pkl")
print('HuggingFace 80 Words:')
intersection_calculation("../data/huggingface_embeddings.pkl")
print('OpenAI 250 Words:')
intersection_calculation("../data/openaiembed_top250.pkl")
print('HuggingFace 250 Words:')
intersection_calculation("../data/huggingface_top250_embedding.pkl")

print('OpenAI 80 words vs OpenAI 250 words with 15 articles:')
intersection_calculation("../data/openaiembed_top250.pkl", "../data/openaiembed.pkl", False)

print('OpenAI 80 words vs OpenAI 250 words with 25 articles:')
intersection_calculation("../data/openaiembed_top250.pkl", "../data/openaiembed.pkl", False, 25)


OpenAI 80 Words:
Number of articles that are in in the intersection:  467
Intersection in percentage: 15.80%


HuggingFace 80 Words:
Number of articles that are in in the intersection:  251
Intersection in percentage: 8.49%


OpenAI 250 Words:
Number of articles that are in in the intersection:  619
Intersection in percentage: 20.95%


HuggingFace 250 Words:
Number of articles that are in in the intersection:  404
Intersection in percentage: 13.67%


OpenAI 80 words vs OpenAI 250 words with 15 articles:
Number of articles that are in in the intersection:  49964
Intersection in percentage: 56.30%


OpenAI 80 words vs OpenAI 250 words with 25 articles:
Number of articles that are in in the intersection:  85468
Intersection in percentage: 96.31%




In [9]:

def get_min_max_similarties(filename):
    open_file = open(filename, "rb")
    embeddings = pickle.load(open_file)
    open_file.close()

    similarity_scores = calculated_cosine_similarity(embeddings)

    # def get_topk_similar_articles_precalculated(similarity_scores, article_index, k, negativSimilartiy=False):


    min_sim = 1
    max_sim = 0
    for i in range(5916):
        list = get_topk_similar_articles_precalculated(similarity_scores, i, 1, True)
        sim = list[0][1]
        min_sim = min(min_sim, sim)
        list = get_topk_similar_articles_precalculated(similarity_scores, i, 2, False)
        sim = list[0][1]
        if sim < 1:
            max_sim = max(max_sim, sim)    

    print('Minimal cos-sim for all articles: ', min_sim)
    print('Maximal cos-sim for all articles: ', max_sim)


In [10]:
print("OpenAI Top80 Embeddings: ")
get_min_max_similarties("../data/openaiembed_top250.pkl")

print("\nOpenAI Top250 Embeddings: ")
get_min_max_similarties("../data/openaiembed.pkl")

print("\nHuggingFace Top80 Embeddings: ")
get_min_max_similarties("../data/huggingface_embeddings.pkl")

print("\nHuggingFace Top250 Embeddings: ")
get_min_max_similarties("../data/huggingface_top250_embedding.pkl")

OpenAI Top80 Embeddings: 
Minimal cos-sim for all articles:  0.7296835164144966
Maximal cos-sim for all articles:  0.9974947860618575

OpenAI Top250 Embeddings: 
Minimal cos-sim for all articles:  0.6979026654793344
Maximal cos-sim for all articles:  0.9999999999999998

HuggingFace Top80 Embeddings: 
Minimal cos-sim for all articles:  -0.11489994902211896
Maximal cos-sim for all articles:  0.9975477057357242

HuggingFace Top250 Embeddings: 
Minimal cos-sim for all articles:  0.26555460047357027
Maximal cos-sim for all articles:  0.999478314915158
