In [None]:
!pip install -U github-dependents-info
!pip install requests

Collecting github-dependents-info
  Downloading github_dependents_info-1.6.3-py3-none-any.whl (14 kB)
Collecting typer[all]<0.10,>=0.4 (from github-dependents-info)
  Downloading typer-0.9.4-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m485.3 kB/s[0m eta [36m0:00:00[0m
Collecting colorama<0.5.0,>=0.4.3 (from typer[all]<0.10,>=0.4->github-dependents-info)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: typer, colorama, github-dependents-info
  Attempting uninstall: typer
    Found existing installation: typer 0.12.3
    Uninstalling typer-0.12.3:
      Successfully uninstalled typer-0.12.3
Successfully installed colorama-0.4.6 github-dependents-info-1.6.3 typer-0.9.4


In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import spacy
import string
import gensim
import operator
import re
import requests
from github_dependents_info.gh_dependents_info import GithubDependentsInfo
import json

In [None]:
def fetch_github_issues(owner, repo, token=None, max_pages=100, timeout=30):
    """Fetch the issues (open and closed) of a GitHub repository, page by page.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name.
        token: Optional GitHub access token. When given it is sent as a
            Bearer token in the Authorization header. Pass it in from the
            environment or a prompt; never hardcode it in the notebook.
        max_pages: Upper bound on the number of result pages requested.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A list holding the decoded JSON entries of every fetched page, or
        None as soon as any request answers with a status other than 200
        (entries fetched up to that point are discarded).
    """
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    issues = []
    page = 1  # the first result page of the GitHub API is page 1
    while page <= max_pages:
        url = f"https://api.github.com/repos/{owner}/{repo}/issues?page={page}&state=all"
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch issues. Status code: {response.status_code}")
            return None
        issues_page = response.json()
        if len(issues_page) == 0:
            # An empty page means every issue has been read.
            break
        issues.extend(issues_page)
        page += 1
    return issues



In [None]:

# Target repository. Alternative target kept for reference:
# owner = "rust-lang"
# repo = "miri"
owner = "DynamoRIO"
repo = "dynamorio"

# fetch_github_issues returns None when a request fails (the recorded run
# got HTTP 403). Stop here with an actionable message instead of failing
# on len(None) or, later, on an empty DataFrame.
data = fetch_github_issues(owner, repo)
if data is None:
    raise RuntimeError(
        f"Could not fetch issues for {owner}/{repo}; see the status code printed above."
    )

print(len(data))

Failed to fetch issues. Status code: 403


TypeError: object of type 'NoneType' has no len()

In [None]:
# df_issues = pd.read_csv('./github-issues-10000.csv')
# Turn the fetched issue records into a DataFrame (alternative above: load
# them from a CSV file instead) and preview the first rows.
df_issues= pd.DataFrame(data)
df_issues.head()

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

# spaCy pipeline ('en_core_web_sm') used by the tokenizer below
spacy_nlp = spacy.load('en_core_web_sm')

# stop words and punctuation characters that are filtered out of the tokens
stop_words = STOP_WORDS
punctuations = string.punctuation

#function for data cleaning and processing
#This can be further enhanced by adding / removing reg-exps as desired.

def spacy_tokenizer(sentence):
    """Clean a piece of text with regular expressions and tokenize it with spaCy.

    Args:
        sentence: The raw text (a str) to clean and tokenize.

    Returns:
        A list of lower-cased, stripped lemmas with stop words, punctuation
        and tokens of two characters or fewer removed.
    """
    #remove unwanted lines that start with :'' markers. This must happen
    #before the single quotes are stripped, otherwise the '' these patterns
    #look for is already gone and they can never match.
    sentence = re.sub(r"\n: +''.*", '', sentence)
    sentence = re.sub(r"^:''.*", '', sentence)

    #remove distracting single quotes
    sentence = re.sub(r"'", '', sentence)

    #remove digits and words containing digits
    sentence = re.sub(r'\w*\d\w*', '', sentence)

    #replace extra spaces with single space
    sentence = re.sub(r' +', ' ', sentence)

    #remove unwanted lines starting with "!"
    sentence = re.sub(r'\n!.*', '', sentence)

    #replace the remaining new line characters with spaces
    sentence = re.sub(r'\n', ' ', sentence)

    #replace punctuation with spaces
    sentence = re.sub(r'[^\w\s]', ' ', sentence)

    #creating token object
    tokens = spacy_nlp(sentence)

    #lower, strip and lemmatize; tokens whose lemma is the "-PRON-"
    #placeholder keep their lower-cased surface form instead
    lemmas = [
        word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_
        for word in tokens
    ]

    #remove stopwords and punctuation, and keep only words longer than 2 characters
    return [
        word for word in lemmas
        if word not in stop_words and word not in punctuations and len(word) > 2
    ]

In [None]:
print ('Cleaning and Tokenizing...')
# df_issues['content'] = df_issues['title']
# Issue text = title + body; a body that is not a string counts as empty text.
body_text = df_issues['body'].apply(lambda body: body if isinstance(body, str) else "")
df_issues['content'] = df_issues['title'] + " " + body_text
df_issues['issues_tokenized'] = df_issues['content'].map(spacy_tokenizer)

df_issues.head()

In [None]:
# Token lists of each issue's combined title and body (despite the name,
# not only the title); used as the document collection in the cells below.
issue_title= df_issues['issues_tokenized']
issue_title[0:5]

In [None]:
from gensim import corpora

#creating term dictionary
dictionary = corpora.Dictionary(issue_title)


#list of a few more words which can be removed from the dictionary
#split() is required here: set() over the raw string yields single
#characters, none of which can match a token, so nothing would be filtered
stoplist = set('hello and issue commit hi the is a they are have in this on'.split())
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)

In [None]:
# [token, id] pairs for the dictionary entries with id <= 50, wrapped in an outer list
dict_tokens = [[
    [token, dictionary.token2id[token]]
    for token_id, token in dictionary.items()
    if token_id <= 50
]]

In [None]:
# Bag-of-words representation of every tokenized issue
corpus = [dictionary.doc2bow(tokens) for tokens in issue_title]

# (token, count) pairs of the first three issues, for inspection
word_frequencies = [
    [(dictionary[token_id], count) for token_id, count in bow]
    for bow in corpus[:3]
]

In [None]:
# TF-IDF weighting of the bag-of-words corpus
issue_tfidf_model = gensim.models.TfidfModel(corpus=corpus, id2word=dictionary)

# LSI model with 300 topics, trained on the TF-IDF weighted corpus
issue_lsi_model = gensim.models.LsiModel(
    corpus=issue_tfidf_model[corpus],
    id2word=dictionary,
    num_topics=300,
)

In [None]:
# Write the TF-IDF and the LSI transformed corpora to disk with MmCorpus;
# the next cell loads them back from these two files.
gensim.corpora.MmCorpus.serialize('issue_tfidf_model_mm', issue_tfidf_model[corpus])
gensim.corpora.MmCorpus.serialize('issue_lsi_model_mm',issue_lsi_model[issue_tfidf_model[corpus]])

In [None]:
# Load the corpora serialized in the previous cell back from disk.
issue_tfidf_corpus = gensim.corpora.MmCorpus('issue_tfidf_model_mm')
issue_lsi_corpus = gensim.corpora.MmCorpus('issue_lsi_model_mm')

In [None]:
from gensim.similarities import MatrixSimilarity

# Similarity index over the LSI corpus; queried by search_similar_issues.
issue_index = MatrixSimilarity(issue_lsi_corpus, num_features = issue_lsi_corpus.num_terms)

In [None]:
from operator import itemgetter

def search_similar_issues(search_term, num_best=100, min_relevance=20):
    """Return the issues most similar to a search term as a DataFrame.

    The term is tokenized, projected through the TF-IDF and LSI models and
    compared against the similarity index.

    Args:
        search_term: Free-text query.
        num_best: Maximum number of matches requested from the index.
        min_relevance: Minimum relevance (similarity * 100, rounded to two
            decimals) a match needs to be part of the result.

    Returns:
        A DataFrame with the columns 'Number', 'Relevance', 'Search Term',
        'Total Issues', 'Issue Title', 'Issue State' and 'Issue url', ordered
        by descending relevance. It has no rows when nothing reaches
        ``min_relevance``.
    """
    query_bow = dictionary.doc2bow(spacy_tokenizer(search_term))
    query_lsi = issue_lsi_model[issue_tfidf_model[query_bow]]

    issue_index.num_best = num_best
    matches = sorted(issue_index[query_lsi], key=itemgetter(1), reverse=True)

    total_issues = len(df_issues)
    issue_names = []
    for doc_pos, score in matches:
        relevance = round((score * 100), 2)
        # Check the threshold before appending so that no match below it
        # ends up in the result; matches are sorted, so the rest is lower too.
        if relevance < min_relevance:
            break
        # doc_pos is a position in the indexed corpus, hence positional lookup.
        issue_names.append(
            {
                'Number': df_issues['number'].iloc[doc_pos],
                'Relevance': relevance,
                'Issue Title': df_issues['title'].iloc[doc_pos],
                "Search Term": search_term,
                'Issue url': df_issues['html_url'].iloc[doc_pos],
                "Issue State": df_issues['state'].iloc[doc_pos],
                "Total Issues": total_issues,
            }
        )

    return pd.DataFrame(issue_names, columns=['Number','Relevance', "Search Term" ,"Total Issues",'Issue Title', "Issue State" ,'Issue url'])

In [None]:
def custom_agg(values):
    """Aggregate a group: a lone value is returned as is, otherwise the
    first two values are joined into one string.

    Only the first two values are used; any further ones are ignored.
    """
    first = values.iloc[0]
    if len(values) <= 1:
        return first
    return f"{first}, \n {values.iloc[1]}"


In [None]:
# pd.set_option('max_colwidth', None)
search_words = ["non-determinism", "nondeterministic", "non-deterministic", "nondeterminism", "flaky", "flakiness"]
word_index = 0  # NOTE(review): not used in this cell; kept in case another cell reads it
result_columns = ['Number','Relevance','Issue Title', "Issue State" ,'Issue url', "Search Term", "Total Issues", "Relevant Issues"]

# Run every search, then concatenate once instead of growing a DataFrame
# inside the loop.
per_term_results = []
for search_word in search_words:
    result = search_similar_issues(search_word)
    result_number = len(result)
    # The per-term hit count is stored on the first row of each result set.
    result.loc[0, "Relevant Issues"] = f'{search_word}: {result_number}'
    per_term_results.append(result)

all_result = pd.concat(per_term_results).reindex(columns=result_columns)
all_result['Search Term'] = all_result['Search Term'].fillna('').astype(str)

# One row per issue number. The search terms are sorted before joining so the
# combined string, and the sort below that uses it, is the same on every run.
df_combined = all_result.groupby('Number').agg({
    'Search Term': lambda x: ', '.join(sorted(set(x))),
    'Issue Title': 'first',
    'Relevance': custom_agg,
    'Issue url': 'first',
    'Issue State': 'first',
    'Total Issues': 'first',
    'Relevant Issues': custom_agg,
}).reset_index()
df_combined = df_combined.sort_values(by=['Search Term','Relevance'], ascending=[True, False]).reset_index(drop=True)

print(df_combined)
df_combined.to_csv(f'{repo}-combined.csv')
all_result.to_csv(f'{repo}.csv')


In [None]:
# Section break. The cells below use cosine similarity between word embeddings to find words similar to a search term; those words can then be used to find more issues.

In [None]:
import gensim.downloader as api

# Load the 'word2vec-google-news-300' word vectors through gensim's downloader API.
wv = api.load('word2vec-google-news-300')




In [None]:
import nltk
from gensim.models import KeyedVectors


# f = open('capitals.txt', 'r').read()
# set_words = set(nltk.word_tokenize(f))
# select_words = words = ['king', 'queen', 'oil', 'gas', 'happy', 'sad', 'city', 'town', 'village', 'country', 'continent', 'petroleum', 'joyful']
# for w in select_words:
#     set_words.add(w)

def get_word_embeddings(embeddings, words=None):
    """Collect the vectors of selected words from an embedding model.

    Args:
        embeddings: Embedding model that supports ``embeddings[word]`` and
            exposes its vocabulary as ``index_to_key`` (or, failing that,
            as ``vocab``).
        words: Iterable of the words to keep. When omitted, the
            notebook-level ``set_words`` is used, which must then be defined.

    Returns:
        A dict mapping each selected word found in the vocabulary to its vector.
    """
    if words is None:
        # Backward-compatible fallback to the name the original code relied on.
        words = set_words
    wanted = set(words)

    # Prefer index_to_key, the attribute the rest of this notebook uses.
    vocabulary = getattr(embeddings, "index_to_key", None)
    if vocabulary is None:
        vocabulary = embeddings.vocab

    return {word: embeddings[word] for word in vocabulary if word in wanted}


In [None]:
def cosine_similarity(A, B):
    '''
    Cosine of the angle between two word vectors.

    Input:
        A: a numpy array which corresponds to a word vector
        B: a numpy array which corresponds to a word vector
    Output:
        the dot product of A and B divided by the product of their norms.
    '''
    return np.dot(A, B) / (np.linalg.norm(A) * np.linalg.norm(B))

In [None]:

def euclidean(A, B):
    """
    Euclidean distance between two word vectors.

    Input:
        A: a numpy array which corresponds to a word vector
        B: a numpy array which corresponds to a word vector
    Output:
        the norm of the difference B - A.
    """
    difference = B - A
    return np.linalg.norm(difference)

In [None]:

def get_similar(current_word, embeddings, cosine_similarity=cosine_similarity, threshold=0.55):
    """Find the vocabulary words whose vectors are similar to a given word.

    Args:
        current_word: Word to look up.
        embeddings: Embedding model that exposes its vocabulary as
            ``index_to_key`` and supports ``embeddings[word]``.
        cosine_similarity: Function scoring two vectors.
        threshold: A word is kept when its score is strictly greater than
            this value.

    Returns:
        An empty list when ``current_word`` is not in the vocabulary.
        Otherwise a list that starts with ``current_word``, followed by every
        other word scoring above ``threshold``, in vocabulary order.
    """
    # Exact membership test. A substring test against every key would accept
    # a word that only occurs inside a longer key and then fail on the lookup.
    if current_word not in embeddings.index_to_key:
        return []

    word_embedding = embeddings[current_word]
    similar_words = [current_word]
    for word in embeddings.index_to_key:
        if word != current_word and cosine_similarity(embeddings[word], word_embedding) > threshold:
            similar_words.append(word)

    return similar_words

In [None]:
search_text="determine"
%time similar_words= get_similar(search_text, wv)
print(f"similar words: {similar_words}")
all_issues = pd.DataFrame(columns=['Relevance','issue Title', 'issue url'])
if(len(similar_words)>1):
  for text in similar_words:
    new_issues=search_similar_issues(text)
    if len(new_issues)>0:
      all_issues=pd.concat([all_issues,new_issues], ignore_index=True)
else:
  all_issues=search_similar_issues(search_text)
duplicated_sorted_issues = all_issues.sort_values(by='Relevance',ascending=False)
sorted_issues= duplicated_sorted_issues.drop_duplicates()
print(sorted_issues.to_string())

# for word in wv:
# print(wv.vocab)

CPU times: user 57.2 s, sys: 1.01 s, total: 58.2 s
Wall time: 1min 5s
similar words: ['determine', 'decide', 'determined', 'unclear', 'assess', 'evaluate', 'examine', 'determining', 'analyze', 'assessing', 'calculate', 'determines', 'pinpoint', 'ascertain', 'discern', 'Determining', 'Determine', 'ascertaining', 'determing', 'todetermine']
    Relevance issue Title issue url   Number  Search Term  Total Issues                                                                                                                                                                                                                                                                                                                     Issue Title Issue State                                          Issue url
49      92.46         NaN       NaN  84164.0      analyze        1500.0                                                                                                                                     