# Finding similar issues by semantic similarity

In [2]:
#!pip install sentence-transformers scikit-learn pandas

In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

## Loading GitHub issues for the transformer library

In [4]:
# Do not truncate long text columns when DataFrames are displayed
pd.set_option('display.max_colwidth', None)

# Location of the exported GitHub issues CSV
issues_csv_path = "/content/drive/MyDrive/BAAI # Python/gitissues.csv"  # update this path
df = pd.read_csv(issues_csv_path)

#df[['title', 'comments']].sample(10)

In [5]:
# Summarize the loaded issues: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2175 entries, 0 to 2174
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      2175 non-null   int64 
 1   html_url        2175 non-null   object
 2   title           2175 non-null   object
 3   comments        2175 non-null   object
 4   body            2174 non-null   object
 5   comment_length  2175 non-null   int64 
 6   text            2175 non-null   object
dtypes: int64(2), object(5)
memory usage: 119.1+ KB


## Embedding the issues

In [13]:
def embed_issues(df, column_to_embed='title', model_name='all-MiniLM-L6-v2'):
    """
    Embed one text column of the issues DataFrame with a SentenceTransformer.

    Missing values are replaced by an empty string and every remaining value
    is converted to ``str`` before encoding, so columns that hold NaN or
    non-string entries (e.g. numbers) are always handed to the model as text.

    Parameters:
    - df: DataFrame containing the GitHub issues.
    - column_to_embed: Column name to embed ('title', 'body', etc.).
    - model_name: Pre-trained model name from sentence-transformers.

    Returns:
    - embeddings: numpy array of sentence embeddings (one per input text).
    - model: the loaded SentenceTransformer model.

    Raises:
    - KeyError: if ``column_to_embed`` is not a column of ``df``.
    """
    # Resolve the column before loading the model so that a wrong column name
    # fails immediately instead of after the model has been loaded.
    texts = df[column_to_embed].fillna('').astype(str).tolist()

    model = SentenceTransformer(model_name)
    embeddings = model.encode(texts, convert_to_numpy=True)
    return embeddings, model

In [16]:
# Configuration
column = 'title'

# Step 1: Embed dataset
issue_embeddings, embed_model = embed_issues(df, column_to_embed=column)

In [15]:
# Shape of the embedding matrix: (number of issues, embedding dimension)
issue_embeddings.shape

(2175, 384)

In [9]:
# Inspect the embedding vector computed for the first issue
issue_embeddings[0]

array([ 1.25919003e-02,  2.79929694e-02,  2.98907328e-02, -2.82609425e-02,
        8.30339864e-02,  2.21100952e-02,  5.24233794e-03,  1.80427507e-02,
        2.48939320e-02,  4.96568047e-02, -5.40389717e-02,  3.42968479e-02,
        7.10182860e-02, -4.21273671e-02, -2.66767163e-02,  5.21195605e-02,
       -5.02375551e-02,  4.12489586e-02,  3.15378010e-02, -4.57764715e-02,
       -2.44398806e-02, -5.04018320e-03, -7.71648213e-02,  3.12126465e-02,
       -6.78871349e-02, -7.86000937e-02, -7.00689247e-03, -2.02241659e-04,
       -2.91259540e-03, -9.64545235e-02,  2.02910975e-02, -1.72545128e-02,
       -2.35775355e-02,  7.72400871e-02, -3.75692472e-02, -2.09935172e-03,
        1.67593379e-02, -2.95889992e-02, -5.15630236e-03, -1.38420546e-02,
       -1.93502288e-02,  8.89721364e-02,  2.37519555e-02, -6.59949854e-02,
        3.86768067e-03,  4.09940071e-02, -7.31461041e-04, -1.45085845e-02,
       -7.21288025e-02, -7.32265599e-03,  9.17877927e-02, -4.98494692e-02,
        4.10751291e-02,  

## Finding similar issues

In [17]:
def search_similar_issues(df, embeddings, model, problem_description, k=5, column_to_embed='title'):
    """
    Finds the top-k issues most similar to a free-text problem description.

    The query is encoded with ``model`` and compared against the precomputed
    ``embeddings`` by cosine similarity; rows of ``df`` are selected by
    position, so ``embeddings[i]`` must correspond to ``df.iloc[i]``.

    Parameters:
    - df: DataFrame with GitHub issues (must contain a 'title' column).
    - embeddings: Precomputed issue embeddings, one row per row of ``df``.
    - model: SentenceTransformer model used for embedding.
    - problem_description: Query string to compare against dataset.
    - k: Number of top similar issues to return. ``0`` returns an empty
      result; values larger than the dataset return every issue.
    - column_to_embed: Accepted for interface compatibility; it is not used
      here and the result always shows the 'title' column.

    Returns:
    - DataFrame with columns ['title', 'similarity'], ordered from most to
      least similar, containing at most ``k`` rows.

    Raises:
    - ValueError: if ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    query_embedding = model.encode([problem_description], convert_to_numpy=True)
    sim_scores = cosine_similarity(query_embedding, embeddings)[0]

    # Rank every issue from most to least similar, then keep the first k.
    # Taking [:k] after reversing (rather than [-k:] before reversing) is what
    # makes k=0 produce an empty result: [-0:] would select *all* elements.
    top_k_idx = np.argsort(sim_scores)[::-1][:k]

    result_df = df.iloc[top_k_idx].copy()
    result_df['similarity'] = sim_scores[top_k_idx]
    return result_df[['title', 'similarity']]

## Search for similar problems

In [18]:
# Number of matches to return
k = 10

# Free-text description of the problem to look up
query = "caching configuration not working"

# Step 2: Search similar issues
similar = search_similar_issues(
    df=df,
    embeddings=issue_embeddings,
    model=embed_model,
    problem_description=query,
    k=k,
    column_to_embed=column,
)

similar

Unnamed: 0,title,similarity
1424,Possible caching bug,0.672567
1757,Caching doesn't work for map (non-deterministic),0.620773
150,Missing cache file,0.613573
151,Missing cache file,0.613573
349,Cached dataset not loaded,0.57833
346,Cached dataset not loaded,0.57833
348,Cached dataset not loaded,0.57833
347,Cached dataset not loaded,0.57833
1507,Caching processed dataset at wrong folder,0.567681
1509,Caching processed dataset at wrong folder,0.567681


In [19]:
# Number of matches to return
k = 10

# Free-text description of the problem to look up
query = "Storage has failed"

# Step 2: Search similar issues
similar = search_similar_issues(
    df=df,
    embeddings=issue_embeddings,
    model=embed_model,
    problem_description=query,
    k=k,
    column_to_embed=column,
)

similar

Unnamed: 0,title,similarity
855,Failure to save with save_to_disk,0.479105
985,Not enough disk space (Needed: Unknown size) when caching on a cluster,0.420456
984,Not enough disk space (Needed: Unknown size) when caching on a cluster,0.420456
982,Not enough disk space (Needed: Unknown size) when caching on a cluster,0.420456
983,Not enough disk space (Needed: Unknown size) when caching on a cluster,0.420456
779,OSError: Memory mapping file failed: Cannot allocate memory,0.411883
782,OSError: Memory mapping file failed: Cannot allocate memory,0.411883
781,OSError: Memory mapping file failed: Cannot allocate memory,0.411883
778,OSError: Memory mapping file failed: Cannot allocate memory,0.411883
777,OSError: Memory mapping file failed: Cannot allocate memory,0.411883


## Use the following embedding model and rebuild

https://huggingface.co/BAAI/bge-large-en-v1.5/tree/main