# Using cosine similarity to find relevant pull requests and posts

## Function to compute cosine similarity

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(query:str, df: pd.DataFrame, text_column:str):
    """
    Compute cosine similarity between a query and a dataframe of text data
    """
    # Combine relevant text columns for vectorization
    df['combined_text'] = df[[text_column]].fillna('').apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

    # Vectorize the text data
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['combined_text'])
    query_vec = vectorizer.transform([query])

    # Calculate cosine similarity between the query and all documents
    return cosine_similarity(query_vec, tfidf_matrix).flatten()

## Load data

In [25]:
df_github = pd.read_csv('../../data/GH_data_safetensor.csv')
df_so = pd.read_csv('../../data/SO_data_safetensor.csv')

query = "model serialization safetensors"
cosine_similarities = compute_similarity(query, df_github, 'json content')
df_github['cosine_similarity'] = cosine_similarities


In [26]:

# sort by cosine similarity (descending)
df_github = df_github.sort_values(by='cosine_similarity', ascending=False)
# keep only the headers source, and json_content 
df_github = df_github[['source', 'json content']]
# rename json content to content
df_github = df_github.rename(columns={'json content': 'content'})

df_github.to_csv('../../data/GH_data_safetensor_similarity.csv', index=False)

In [13]:
import sqlite3
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Connect to the SQLite database
conn = sqlite3.connect('../../results/db_model_evolution.sqlite3')

# Create a cursor object
cursor = conn.cursor()

# Execute a SQL query
cursor.execute('SELECT id, source, content, is_true_positive, comments FROM model_evolution_post')

# fetch all the results in a data frame
rows = cursor.fetchall()
df = pd.DataFrame(rows, columns=['id','source', 'content', 'is_true_positive', 'comments'])


# Calculate cosine similarity between the query and all documents
cosine_similarities = compute_similarity(query, df, 'content')

# sort by cosine similarity (descending)
df['cosine_similarity'] = cosine_similarities
df = df.sort_values(by='cosine_similarity', ascending=False)

# make index start at 1
df.index = df.index + 1

# keep only the headers source, and content

df.to_csv('../../data/SO_data_safetensor_similarity.csv', index=False)
df

Unnamed: 0,id,source,content,is_true_positive,comments,combined_text,cosine_similarity
31,31,StackOverflow,"{""tags"": [""huggingface-transformers"", ""large-l...",1,,"{""tags"": [""huggingface-transformers"", ""large-l...",0.165931
15,15,StackOverflow,"{""tags"": [""huggingface-transformers"", ""hugging...",0,,"{""tags"": [""huggingface-transformers"", ""hugging...",0.134701
32,32,StackOverflow,"{""tags"": [""python"", ""huggingface-transformers""...",1,,"{""tags"": [""python"", ""huggingface-transformers""...",0.117234
18,18,StackOverflow,"{""tags"": [""pytorch"", ""nlp"", ""huggingface-trans...",1,,"{""tags"": [""pytorch"", ""nlp"", ""huggingface-trans...",0.108763
10,10,StackOverflow,"{""tags"": [""python"", ""large-language-model"", ""p...",1,,"{""tags"": [""python"", ""large-language-model"", ""p...",0.102464
...,...,...,...,...,...,...,...
132,132,StackOverflow,"{""tags"": [""tensorflow"", ""gcc"", ""infrastructure...",0,,"{""tags"": [""tensorflow"", ""gcc"", ""infrastructure...",0.000000
134,134,StackOverflow,"{""tags"": [""libtorch""], ""question_score"": 0, ""i...",0,,"{""tags"": [""libtorch""], ""question_score"": 0, ""i...",0.000000
136,136,StackOverflow,"{""tags"": [""c++"", ""tensorflow"", ""eigen""], ""ques...",0,,"{""tags"": [""c++"", ""tensorflow"", ""eigen""], ""ques...",0.000000
137,137,StackOverflow,"{""tags"": [""tensorflow"", ""gradient-descent""], ""...",0,,"{""tags"": [""tensorflow"", ""gradient-descent""], ""...",0.000000
