# Finding Similar Issues by Semantics

In [2]:
#!pip install sentence-transformers scikit-learn pandas

In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

## Loading GitHub issues for the transformers library

In [4]:
# Do not truncate long text cells when DataFrames are displayed.
pd.options.display.max_colwidth = None

# Read the exported GitHub issues table.
# NOTE: this is an absolute Google Drive path; update it for your environment.
df = pd.read_csv("/content/drive/MyDrive/BAAI # Python/gitissues.csv")

#df[['title', 'comments']].sample(10)

In [5]:
# Summarize the loaded issues table: column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2175 entries, 0 to 2174
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0      2175 non-null   int64 
 1   html_url        2175 non-null   object
 2   title           2175 non-null   object
 3   comments        2175 non-null   object
 4   body            2174 non-null   object
 5   comment_length  2175 non-null   int64 
 6   text            2175 non-null   object
dtypes: int64(2), object(5)
memory usage: 119.1+ KB


## Embedding the issues

In [6]:
def embed_issues(df, column_to_embed='title', model_name='all-MiniLM-L6-v2'):
    """
    Encode one text column of the issues table into sentence embeddings.

    Missing values in the chosen column are replaced with empty strings
    before encoding, so every row of ``df`` yields exactly one embedding.

    Parameters:
    - df: DataFrame containing the GitHub issues.
    - column_to_embed: Name of the column whose text is encoded
      ('title', 'body', etc.).
    - model_name: Name of the pre-trained sentence-transformers model to load.

    Returns:
    - embeddings: numpy array with one embedding per row of ``df``.
    - model: the SentenceTransformer instance that produced the embeddings,
      so the same model can be reused to encode queries.
    """
    # The model is loaded first, before the column is read from the frame.
    encoder = SentenceTransformer(model_name)

    # NaN / None entries become '' so the encoder only ever sees strings.
    text_series = df[column_to_embed].fillna('')
    sentences = text_series.tolist()

    vectors = encoder.encode(sentences, convert_to_numpy=True)
    return vectors, encoder

In [None]:
# Configuration: which text column of the issues table gets embedded.
column = 'title'

# Step 1: encode that column for every issue; keep the model for encoding queries later.
issue_embeddings, embed_model = embed_issues(df, column)

In [None]:
# Dimensions of the embedding array returned by embed_issues.
issue_embeddings.shape

In [None]:
# Peek at the embedding computed for the first row of the issues table.
issue_embeddings[0]

## Finding similar issues

In [None]:
def search_similar_issues(df, embeddings, model, problem_description, k=5, column_to_embed='title'):
    """
    Finds the top-k issues most similar to a free-text problem description.

    The query is encoded with ``model`` and compared against the precomputed
    ``embeddings`` using cosine similarity. Row ``i`` of ``embeddings`` must
    correspond to row ``i`` (by position) of ``df``.

    Parameters:
    - df: DataFrame with GitHub issues; must contain a 'title' column.
    - embeddings: Precomputed issue embeddings, one per row of ``df``.
    - model: Model object used for embedding; must expose
      ``encode(list_of_str, convert_to_numpy=True)``.
    - problem_description: Query string to compare against the dataset.
    - k: Number of top similar issues to return. Values larger than the
      number of issues are clamped; values <= 0 yield an empty result.
    - column_to_embed: Column that was embedded. When it is not 'title' and
      exists in ``df``, it is included in the result for display context.

    Returns:
    - DataFrame with the top-k issues, ordered from most to least similar,
      containing 'title', optionally ``column_to_embed``, and 'similarity'.

    Raises:
    - ValueError: if ``embeddings`` and ``df`` do not have the same length.
    """
    n_issues = len(df)
    if len(embeddings) != n_issues:
        # Guards against stale state, e.g. df filtered after embedding:
        # positional lookup below would silently return the wrong rows.
        raise ValueError(
            f"embeddings has {len(embeddings)} rows but df has {n_issues}; "
            "re-run the embedding step so they are aligned"
        )

    # Clamp k into [0, n_issues]. Without this, k=0 makes the slice `[-0:]`
    # select *every* row, and a negative k slices from an arbitrary offset.
    k = max(0, min(int(k), n_issues))

    if k == 0:
        # Nothing to rank (k <= 0 or empty dataset); skip encoding entirely.
        top_k_idx = np.array([], dtype=int)
        top_scores = np.array([], dtype=float)
    else:
        query_embedding = model.encode([problem_description], convert_to_numpy=True)
        sim_scores = cosine_similarity(query_embedding, embeddings)[0]
        # argsort is ascending: take the last k entries and reverse them.
        top_k_idx = np.argsort(sim_scores)[-k:][::-1]
        top_scores = sim_scores[top_k_idx]

    result_df = df.iloc[top_k_idx].copy()
    result_df['similarity'] = top_scores

    display_cols = ['title']
    if column_to_embed not in ('title', 'similarity') and column_to_embed in df.columns:
        # Show the text that was actually embedded alongside the title.
        display_cols.append(column_to_embed)
    display_cols.append('similarity')
    return result_df[display_cols]

## Search for similar problems

In [None]:
# Search parameters: how many matches to return, and the problem to look up.
k = 10
query = "caching configuration not working"

# Step 2: rank the embedded issues against the query and keep the k best matches.
similar = search_similar_issues(
    df, issue_embeddings, embed_model, query, k=k, column_to_embed=column
)

similar

In [None]:
# Search parameters: how many matches to return, and the problem to look up.
k = 10
query = "Storage has failed"

# Step 2: rank the embedded issues against the query and keep the k best matches.
similar = search_similar_issues(
    df, issue_embeddings, embed_model, query, k=k, column_to_embed=column
)

similar

## Use the following embedding model and rebuild

https://huggingface.co/BAAI/bge-large-en-v1.5/tree/main