# Using cosine similarity to find relevant pull requests and posts

## Function to compute cosine similarity

In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(query:str, df: pd.DataFrame, text_column:str):
    """
    Compute cosine similarity between a query and a dataframe of text data.

    A TF-IDF vocabulary is fitted on the rows of ``df[text_column]`` (with
    English stop words removed), the query is projected into that same
    vocabulary, and the cosine similarity between the query vector and every
    row vector is returned.

    The input dataframe is not modified: the documents are built on a local
    Series rather than stored back as a helper column on ``df``.

    Parameters
    ----------
    query : str
        Free-text query to compare against every row.
    df : pd.DataFrame
        Dataframe holding the documents.
    text_column : str
        Name of the column in ``df`` containing the text. Missing values are
        treated as empty strings; other values are converted with ``str``.

    Returns
    -------
    Flattened (1-D) array of similarity scores, one per row of ``df``, in the
    same order as the rows of ``df``.
    """
    # Build the documents locally so the caller's dataframe does not silently
    # gain a 'combined_text' column as a side effect of scoring.
    documents = df[text_column].fillna('').astype(str)

    # Fit the vocabulary on the documents only; the query is transformed with
    # the fitted vocabulary so both live in the same feature space.
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(documents)
    query_vec = vectorizer.transform([query])

    # cosine_similarity yields one row (the single query) by n_documents
    # columns; flatten it to one score per document.
    return cosine_similarity(query_vec, tfidf_matrix).flatten()

## Load data

In [25]:
# Read the GH and SO CSV exports from the shared data directory.
df_github = pd.read_csv('../../data/GH_data_safetensor.csv')
df_so = pd.read_csv('../../data/SO_data_safetensor.csv')

# Score the 'json content' text of every GitHub row against the query and
# store the scores alongside the rows they belong to.
query = "model serialization safetensors"
cosine_similarities = compute_similarity(
    query=query,
    df=df_github,
    text_column='json content',
)
df_github['cosine_similarity'] = cosine_similarities


In [26]:

# Rank rows from most to least similar to the query, keep only the 'source'
# and 'json content' columns, and expose the latter as 'content'.
# The score column itself is dropped here: the exported file carries the
# ranking only through its row order.
df_github = (
    df_github
    .sort_values(by='cosine_similarity', ascending=False)
    .loc[:, ['source', 'json content']]
    .rename(columns={'json content': 'content'})
)

df_github.to_csv('../../data/GH_data_safetensor_similarity.csv', index=False)