In [8]:
# Install the necessary packages
!pip install ratelimit
!pip install openai

import csv
import os
import re

import openai
import pandas as pd
import requests
from ratelimit import limits, sleep_and_retry

# Rate-limit configuration, passed to the @limits decorator on enforce_rate_limit().
CALLS_ALLOWED = 2  # Maximum number of calls allowed within one period
RATE_LIMIT_PERIOD = 60  # Length of one rate-limit period, in seconds

@sleep_and_retry
@limits(calls=CALLS_ALLOWED, period=RATE_LIMIT_PERIOD)
def enforce_rate_limit():
    """Gate callers so they stay within the configured API rate limit.

    The body is intentionally empty and the function returns ``None``: the
    throttling is done entirely by the decorators, which are configured with
    CALLS_ALLOWED calls per RATE_LIMIT_PERIOD seconds. Call this immediately
    before each rate-limited API request.
    """

def get_popular_python_repositories():
    """Fetch the most-starred, non-archived Python repositories matching "yield".

    Sends one request to the GitHub repository search API, sorted by stars in
    descending order (similar filters to the bugsplainers repo).

    The GitHub access token is read from the ``GITHUB_TOKEN`` environment
    variable rather than being embedded in the source. When the variable is
    unset or blank, no ``Authorization`` header is sent.

    Returns:
        The list stored under ``"items"`` in the JSON response, or an empty
        list when the request fails or the status code is not 200.
    """
    # The search API caps per_page at 100, so request exactly the maximum
    # instead of an out-of-range value.
    url = "https://api.github.com/search/repositories?q=yield+language:python+archived:false&sort=stars&order=desc&per_page=100"

    headers = {
        'Accept': 'application/vnd.github+json',
        'X-GitHub-Api-Version': '2022-11-28',
    }
    token = os.environ.get("GITHUB_TOKEN", "").strip()
    if token:
        # Exactly one space between the scheme and the token.
        headers['Authorization'] = f"Bearer {token}"

    try:
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP errors: print, return [].
        print("Error:", exc)
        return []

    if response.status_code != 200:
        print("Error:", response.status_code)
        return []

    return response.json().get("items", [])

# get commits list for each repo
def get_repository_commits(repo_name, owner):
    """Fetch commits of a repository from the GitHub API.

    A single request is made to the repository's ``/commits`` endpoint; no
    pagination parameters are sent, so only what that one response contains
    is returned.

    The GitHub access token is read from the ``GITHUB_TOKEN`` environment
    variable rather than being embedded in the source. When the variable is
    unset or blank, no ``Authorization`` header is sent.

    Args:
        repo_name: Name of the repository (without the owner prefix).
        owner: Login of the account that owns the repository.

    Returns:
        The decoded JSON body of the response on HTTP 200, or an empty list
        when the request fails or the status code is not 200.
    """
    commits_url = f"https://api.github.com/repos/{owner}/{repo_name}/commits"

    headers = {
        'User-Agent': 'request',
        'Accept': 'application/vnd.github+json',
        'X-GitHub-Api-Version': '2022-11-28',
    }
    token = os.environ.get("GITHUB_TOKEN", "").strip()
    if token:
        # Exactly one space between the scheme and the token.
        headers['Authorization'] = f"Bearer {token}"

    try:
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(commits_url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP errors: print, return [].
        print("Error:", exc)
        return []

    if response.status_code != 200:
        print("Error:", response.status_code)
        return []

    return response.json()

# Filter commits using the keyword list.
# The predicate is written as a lambda function, as requested.

def filter_commits_by_keywords(commits, keywords):
    """Return the commits whose message mentions a keyword and also 'yield'.

    Matching is done against the lower-cased commit message, so keywords are
    expected to be lower-case; a keyword containing upper-case characters
    never matches.

    Args:
        commits: Iterable of dicts exposing the text at
            ``commit['commit']['message']``.
        keywords: Iterable of substrings; at least one must occur in the
            message for the commit to be kept.

    Returns:
        A new list with the matching commits, in their original order.
    """
    # Lower-cased message text of one commit entry.
    lowered_message = lambda entry: entry['commit']['message'].lower()

    selected = []
    for entry in commits:
        # The keyword test runs first; without a hit the commit is skipped.
        if not any(word in lowered_message(entry) for word in keywords):
            continue
        # 'yield' is mandatory on top of the keyword hit.
        if 'yield' in lowered_message(entry):
            selected.append(entry)
    return selected

def _interpret_response(response_text):
    """Reduce a free-text model reply to 'Yes', 'No', 'Maybe' or 'Uncertain'."""
    lowered = response_text.lower()
    # Match whole words only. Plain substring tests misfire on ordinary
    # words: 'no' is contained in "not", "know" and "cannot", and 'yes' in
    # "yesterday". The priority order (yes, then no, then maybe) is kept.
    for verdict in ('yes', 'no', 'maybe'):
        if re.search(rf'\b{verdict}\b', lowered):
            return verdict.capitalize()
    return 'Uncertain'


def process_commits_and_generate_questions(rows):
    """Ask the OpenAI chat API about each commit and record the answers.

    For every row, the commit message is sent to the ``gpt-3.5-turbo`` model
    with a question about performance or memory optimization. The raw reply
    is stored under ``'GPT3 Response'`` and a coarse classification of it
    under ``'Interpreted Response'``.

    NOTE(review): ``openai.ChatCompletion`` appears to be the interface of the
    pre-1.0 ``openai`` package — confirm against the installed version.

    Args:
        rows: List of dicts, each containing a ``'Commit Message'`` entry.
            The dicts are updated in place.

    Returns:
        The same ``rows`` object that was passed in.
    """
    for row in rows:
        commit_message = row['Commit Message']

        enforce_rate_limit()  # Wait here if the API call budget is used up
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": f"Could you determine if the commit involves performance or memory optimization? -{commit_message}"}
            ]
        )
        row['GPT3 Response'] = completion.choices[0].message['content']

        # Definitive filtering based on the reply: Yes, No, Maybe or Uncertain.
        row['Interpreted Response'] = _interpret_response(row['GPT3 Response'])

    return rows

def main():
    """Collect keyword-matching commits, classify them and write a CSV.

    Steps:
      1. Configure the OpenAI client from the ``OPENAI_API_KEY`` environment
         variable (the key is deliberately not stored in the source).
      2. Fetch the popular repositories and the commits of each one.
      3. Keep commits that pass the length and keyword filters.
      4. Ask the model about every kept commit.
      5. Write all rows to ``output.csv``.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is unset or blank. This is checked
            first so the run fails before any API request is made.
    """
    api_key = os.environ.get("OPENAI_API_KEY", "").strip()
    if not api_key:
        raise RuntimeError(
            "Set the OPENAI_API_KEY environment variable before running."
        )
    openai.api_key = api_key

    # Obtain most popular repositories
    repositories = get_popular_python_repositories()

    # Keyword list; 'yield' is not listed here because
    # filter_commits_by_keywords() requires it separately.
    keywords = ['perf', 'speed', 'accelerate', 'fast', 'slow', 'latency', 'unnecessary', 'contention', 'optimize', 'efficient', 'resource', 'memory', 'improve' , 'bandwidth' , 'network', 'overhead', 'crash', 'throughput', 'scale']

    # Upper bound on the commit-message length (exclusive).
    max_message_length = 100000000000

    all_rows = []

    for repo in repositories:
        repo_name = repo['name']
        owner = repo['owner']['login']
        commits = get_repository_commits(repo_name, owner)

        # Filter commits based on length first, then on keywords.
        filtered_commit_length = [
            commit for commit in commits
            if len(commit['commit']['message']) < max_message_length
        ]
        filtered_commits = filter_commits_by_keywords(filtered_commit_length, keywords)

        # An empty result simply contributes no rows.
        all_rows.extend(
            {
                'Repository': f"{owner}/{repo_name}",
                'Commit Message': commit['commit']['message'],
                'Commit URL': commit['html_url'],
                'GPT3 Response': '',
                'Interpreted Response': '',
            }
            for commit in filtered_commits
        )

    all_rows_with_responses = process_commits_and_generate_questions(all_rows)

    # Write all output to CSV. The explicit UTF-8 encoding keeps non-ASCII
    # commit messages from failing on platforms with a narrower default.
    with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Repository', 'Commit Message', 'Commit URL', 'GPT3 Response', 'Interpreted Response']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_rows_with_responses)

# Run main() only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


