# Using cosine similarity to find relevant pull requests and posts

## Function to compute cosine similarity

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(query: str, df: pd.DataFrame, text_column: str):
    """
    Compute cosine similarity between a query and a dataframe of text data.

    A TF-IDF vocabulary (with English stop words removed) is fitted on the
    values of ``df[text_column]``. The query is then projected into that same
    vector space and compared against every row.

    Args:
        query: Free-text search string to compare against the documents.
        df: DataFrame holding the documents. It is only read; no column is
            added to or changed in the caller's frame.
        text_column: Name of the column in ``df`` that contains the text.

    Returns:
        A flat (1-D) array with one similarity score per row of ``df``, in
        the same order as the rows.
    """
    # Missing values become empty strings and every value is coerced to str so
    # the vectorizer only ever sees text. The documents live in a standalone
    # Series so the caller's DataFrame is not given a helper column.
    documents = df[text_column].fillna('').astype(str)

    # Fit the TF-IDF vocabulary on the documents, then reuse the fitted
    # vectorizer for the query so both share the same feature space.
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(documents)
    query_vec = vectorizer.transform([query])

    # cosine_similarity yields a (1, n_documents) matrix; flatten it to 1-D.
    return cosine_similarity(query_vec, tfidf_matrix).flatten()

## Load data

In [25]:
# Search phrase that every document is scored against.
query = "model serialization safetensors"

# Read the two CSV exports (the GH_* and SO_* files) into DataFrames.
df_github = pd.read_csv('../../data/GH_data_safetensor.csv')
df_so = pd.read_csv('../../data/SO_data_safetensor.csv')

# Score the 'json content' column of the GH frame against the query and
# store the per-row score next to the data it belongs to.
cosine_similarities = compute_similarity(query, df_github, 'json content')
df_github['cosine_similarity'] = cosine_similarities


In [26]:

# Rank rows from most to least similar, keep only the 'source' and
# 'json content' columns, and expose the latter under the name 'content'.
df_github = (
    df_github
    .sort_values(by='cosine_similarity', ascending=False)
    .loc[:, ['source', 'json content']]
    .rename(columns={'json content': 'content'})
)

# Persist the ranked table without the index column.
df_github.to_csv('../../data/GH_data_safetensor_similarity.csv', index=False)

In [ ]:
import sqlite3
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Open the SQLite results database and obtain a cursor on it.
# NOTE(review): neither the cursor nor the connection is closed in this cell.
conn = sqlite3.connect('../../results/db_model_evolution.sqlite3')
cursor = conn.cursor()

# Pull the post fields and materialise every result row at once.
rows = cursor.execute(
    'SELECT id, source, content, is_true_positive, comments FROM model_evolution_post'
).fetchall()

# Wrap the raw tuples in a DataFrame whose columns mirror the SELECT list.
df = pd.DataFrame(rows, columns=['id','source', 'content', 'is_true_positive', 'comments'])

# Score each post's 'content' against the query.
cosine_similarities = compute_similarity(query, df, 'content')

# Attach the scores and order the posts from most to least similar.
df = (
    df
    .assign(cosine_similarity=cosine_similarities)
    .sort_values(by='cosine_similarity', ascending=False)
)

# Shift the index labels from 0-based to 1-based (row order is unchanged).
df.index += 1

# Export is currently disabled:
# df.to_csv('../../data/SO_data_safetensor_similarity.csv', index=False)
df


In [17]:
# Compare each row's 'id' with its index label, element by element.
a = df['id'].eq(df.index)
# Collapse the per-row booleans into a single answer: True only if every
# row's id equals its index label.
a.all()

True

In [20]:
# Print one SQL UPDATE statement per row, pairing each post id with its
# similarity score. The statements are only printed, not executed.
for _id, sim in zip(df['id'], df['cosine_similarity']):
    print(f"UPDATE model_evolution_post SET cosine_similarity = {sim} WHERE id = {_id};")
    

UPDATE model_evolution_post SET cosine_similarity = 0.16593101008157318 WHERE id = 31;
UPDATE model_evolution_post SET cosine_similarity = 0.1347007825103304 WHERE id = 15;
UPDATE model_evolution_post SET cosine_similarity = 0.11723402218871434 WHERE id = 32;
UPDATE model_evolution_post SET cosine_similarity = 0.10876333032262149 WHERE id = 18;
UPDATE model_evolution_post SET cosine_similarity = 0.10246373163084865 WHERE id = 10;
UPDATE model_evolution_post SET cosine_similarity = 0.10158726223996317 WHERE id = 43;
UPDATE model_evolution_post SET cosine_similarity = 0.09654681252283923 WHERE id = 14;
UPDATE model_evolution_post SET cosine_similarity = 0.09582745981322377 WHERE id = 13;
UPDATE model_evolution_post SET cosine_similarity = 0.09325943841066588 WHERE id = 214;
UPDATE model_evolution_post SET cosine_similarity = 0.08692195256406066 WHERE id = 21;
UPDATE model_evolution_post SET cosine_similarity = 0.0855289867177106 WHERE id = 16;
UPDATE model_evolution_post SET cosine_simil