In [None]:
from collections import defaultdict
import concurrent.futures
from tqdm import tqdm
import pandas as pd
import requests
import hashlib
import json
import time

In [None]:
from getpass import getpass
# Read the GitHub API token interactively (input is not echoed) instead of
# hard-coding it; the value is sent below as "Authorization: token <value>".
access_token = getpass()

In [None]:
def get_forked_repositories(owner, repository, token=access_token):
    """Return every fork of ``owner/repository`` as a list of GitHub fork objects.

    The forks endpoint is paginated, so this follows the ``Link: rel="next"``
    header until the last page instead of stopping after the first page.

    Args:
        owner: Account or organisation that owns the upstream repository.
        repository: Name of the upstream repository.
        token: GitHub token sent in the ``Authorization`` header. The default
            is bound to the module-level ``access_token`` at definition time.

    Returns:
        A list of decoded JSON fork objects. If a page cannot be fetched the
        failure is printed and the forks collected so far are returned (an
        empty list when the very first page fails).
    """
    forks_url = f"https://api.github.com/repos/{owner}/{repository}/forks"

    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json"
    }

    forks = []
    next_url = forks_url
    # Only the first request needs explicit query parameters; the "next" URLs
    # handed back by GitHub already carry their own query string.
    params = {"per_page": 100}

    while next_url:
        # A timeout keeps a stalled connection from blocking the notebook forever.
        response = requests.get(next_url, headers=headers, params=params, timeout=30)

        if response.status_code != 200:
            print(f"Failed to fetch forks for {repository}: {response.status_code}")
            break

        forks.extend(response.json())
        next_url = response.links.get("next", {}).get("url")
        params = None

    return forks

def get_repos():
    """Collect the forks of all course template repositories and save them to CSV.

    For every language in ``java``, ``go``, ``python``, ``scala`` and every
    index 1..5 the forks of
    ``central-university-dev/2024-{i}-{lang}-backend-academy-2024-{lang}-template``
    are fetched with ``get_forked_repositories``.

    Returns:
        A DataFrame with the columns ``original_repository`` and
        ``forked_repository``; the same table is written to
        ``data/repos_with_cq_gate.csv``.
    """
    # Local import: only needed here, to make sure the output folder exists.
    import os

    owner = 'central-university-dev'
    langs = ["java", "go", "python", "scala"]
    columns = ['original_repository', 'forked_repository']
    output_path = 'data/repos_with_cq_gate.csv'

    forks_data = []
    for lang in langs:
        for i in range(1, 5 + 1):
            repository = f"2024-{i}-{lang}-backend-academy-2024-{lang}-template"
            forks_data.extend(
                {
                    'original_repository': repository,
                    'forked_repository': fork['full_name'],
                }
                for fork in get_forked_repositories(owner, repository)
            )

    # Explicit columns keep the CSV header present even when no fork was found,
    # so the file can still be read back with pd.read_csv.
    df = pd.DataFrame(forks_data, columns=columns)

    # to_csv does not create missing directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)

    return df

In [None]:
class GitHubRepoParser:
    """Collect pull-request review comments from a list of GitHub repositories.

    The repositories are read from a CSV file that must provide the columns
    ``forked_repository`` and ``original_repository``. For every review comment
    the parser records the commented diff hunk, line information, the comment
    text and, when it can be found, the complete file patch containing the hunk.
    Results are written to ``repos_batch_<n>.csv`` files.
    """

    API_ROOT = "https://api.github.com"
    # Seconds before a single HTTP request is abandoned.
    REQUEST_TIMEOUT = 30
    # Fallback wait (seconds) when a 403/429 carries no rate-limit hints.
    DEFAULT_RETRY_DELAY = 900
    # Upper bound on retries so a permanent 403 cannot stall a worker forever.
    MAX_RETRIES = 5

    def __init__(self, csv_file, extensions_json):
        """Load the repository list and the extension-to-language mapping.

        Args:
            csv_file: Path of the CSV file listing the repositories.
            extensions_json: Path of a JSON file mapping file extensions
                (looked up with a leading dot, e.g. ``".py"``) to languages.
        """
        self.csv_file = csv_file
        self.extensions_json = extensions_json
        self.language_mapping = self.load_language_mapping()
        self.repos = self.load_repos()
        # The token comes from the module-level ``access_token``.
        self.access_token = access_token
        self.num_req = 0

    def load_language_mapping(self):
        """Return the decoded content of ``self.extensions_json``."""
        with open(self.extensions_json, 'r', encoding='utf-8') as f:
            return json.load(f)

    def load_repos(self):
        """Return ``self.csv_file`` loaded as a DataFrame."""
        return pd.read_csv(self.csv_file)

    def generate_unique_id(self, diff_hunk):
        """Return the MD5 hex digest of ``diff_hunk`` (an identifier, not a security hash)."""
        return hashlib.md5(diff_hunk.encode('utf-8')).hexdigest()

    def _api_headers(self, accept="application/vnd.github.v3+json"):
        """Build the request headers shared by all API calls."""
        return {
            "Authorization": f"token {self.access_token}",
            "Accept": accept
        }

    def _retry_delay(self, response):
        """Return how many seconds to wait before retrying a rejected request.

        Preference order: the ``Retry-After`` header, then the
        ``X-RateLimit-Reset`` timestamp when ``X-RateLimit-Remaining`` is 0,
        and finally ``DEFAULT_RETRY_DELAY``.
        """
        headers = response.headers

        retry_after = headers.get("Retry-After")
        if retry_after is not None:
            try:
                return max(int(retry_after), 1)
            except ValueError:
                # Not a plain number of seconds; fall through to other hints.
                pass

        if headers.get("X-RateLimit-Remaining") == "0":
            try:
                reset_at = int(headers.get("X-RateLimit-Reset", ""))
            except ValueError:
                pass
            else:
                # One extra second so the retry lands after the reset.
                return max(reset_at - int(time.time()), 0) + 1

        return self.DEFAULT_RETRY_DELAY

    def get_with_retry(self, url, headers, max_retries=None):
        """GET ``url``, waiting and retrying while the API answers 403 or 429.

        Args:
            url: URL to request.
            headers: Headers to send with the request.
            max_retries: Maximum number of retries; defaults to ``MAX_RETRIES``.

        Returns:
            The first response that is not a 403/429, or the last 403/429
            response once the retry budget is used up, so that callers report
            the failure instead of waiting indefinitely.
        """
        if max_retries is None:
            max_retries = self.MAX_RETRIES

        attempt = 0
        while True:
            response = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
            if response.status_code not in (403, 429) or attempt >= max_retries:
                return response

            attempt += 1
            delay = self._retry_delay(response)
            print(f"{response.status_code} error encountered. Waiting {delay} seconds "
                  f"before retrying ({attempt}/{max_retries}): {url}")
            time.sleep(delay)

    def get_all_pages(self, url, headers):
        """Return the concatenated items of a paginated list endpoint.

        Follows the ``next`` link of every response. Stops at the first
        non-200 response and returns what has been collected up to then.
        """
        all_items = []
        while url:
            response = self.get_with_retry(url, headers)

            if response.status_code != 200:
                print(f"Failed to fetch page: {response.status_code} {url}")
                break

            all_items.extend(response.json())
            url = response.links.get('next', {}).get('url')
        return all_items

    def get_commits_for_pr(self, repo, pull_number):
        """Return the list of commit objects of pull request ``pull_number``."""
        commits_url = f"{self.API_ROOT}/repos/{repo}/pulls/{pull_number}/commits"
        return self.get_all_pages(commits_url, self._api_headers())

    def get_diff_between_commits(self, repo, base, head):
        """Return the compare result for ``base...head``, or ``None`` on failure."""
        compare_url = f"{self.API_ROOT}/repos/{repo}/compare/{base}...{head}"

        response = self.get_with_retry(compare_url, self._api_headers())
        if response.status_code == 200:
            return response.json()

        print(f"Failed to compare commits: {base}...{head} with status {response.status_code}")
        return None

    def get_full_file_diffs_in_pr(self, repo, pull_number):
        """Collect per-file patches for every ordered pair of commits in a PR.

        Every commit is compared with each commit listed after it, which
        issues one compare request per pair.

        Returns:
            A list of ``(head_sha, {filename: patch})`` tuples, one per pair.
            Files without a ``patch`` entry are skipped; a renamed file is
            stored under both its new and its previous name. The list is empty
            when the pull request has fewer than two commits.
        """
        commits = self.get_commits_for_pr(repo, pull_number)
        commit_diffs = []

        if len(commits) < 2:
            print(f"No enough commits to process for PR {pull_number}")
            return commit_diffs

        for index, base_commit in enumerate(commits):
            for head_commit in commits[index + 1:]:
                head_commit_sha = head_commit['sha']
                diff_data = self.get_diff_between_commits(repo, base_commit['sha'], head_commit_sha)

                file_diffs = {}
                changed_files = diff_data.get('files', []) if diff_data else []
                for file_info in changed_files:
                    patch = file_info.get('patch')
                    if patch is None:
                        continue

                    file_diffs[file_info['filename']] = patch
                    previous_name = file_info.get('previous_filename')
                    if file_info.get('status') == 'renamed' and previous_name:
                        file_diffs[previous_name] = patch

                commit_diffs.append((head_commit_sha, file_diffs))

        return commit_diffs

    def _detect_language(self, path):
        """Map the text after the last dot of ``path`` to a language, else ``"Unk"``."""
        if '.' not in path:
            return "Unk"
        extension = path.rsplit('.', 1)[1]
        return self.language_mapping.get(f".{extension}", "Unk")

    def get_pull_request_review_comments(self, repo, pull_number, original_repo):
        """Return the review comments of one pull request as a DataFrame.

        Args:
            repo: Repository in ``owner/name`` form.
            pull_number: Number of the pull request.
            original_repo: Value stored in the ``original_repo_name`` column.

        Returns:
            A DataFrame with one row per review comment, or ``None`` when the
            pull request has no review comments. Missing line numbers are
            stored as ``-1``; ``full_diff`` is an empty string when no
            collected patch contains the comment's diff hunk.
        """
        pr_comments_url = f"{self.API_ROOT}/repos/{repo}/pulls/{pull_number}/comments"
        headers = self._api_headers("application/vnd.github.v3.full+json")

        comments = self.get_all_pages(pr_comments_url, headers)
        if not comments:
            return None

        commit_diffs = self.get_full_file_diffs_in_pr(repo, pull_number)

        rows = []
        for comment in comments:
            hunk = comment["diff_hunk"]

            # Keep the last collected patch that contains the hunk verbatim.
            full_diff = ""
            for _head_sha, file_diffs in commit_diffs:
                for patch in file_diffs.values():
                    if hunk in patch:
                        full_diff = patch

            rows.append({
                'diff': hunk,
                'diff_id': self.generate_unique_id(hunk),
                'start_line': comment.get("start_line") or -1,
                'end_line': comment.get("line") or -1,
                'original_start_line': comment.get("original_start_line") or -1,
                'original_end_line': comment.get("original_line") or -1,
                'full_diff': full_diff,
                'full_diff_id': self.generate_unique_id(full_diff),
                'message': comment["body"],
                'file': comment["path"],
                'comment_url': comment["html_url"],
                'language': self._detect_language(comment["path"]),
                'author_association': comment["author_association"],
                'repo_name': repo,
                'original_repo_name': original_repo,
            })

        return pd.DataFrame(rows)

    def get_pull_info(self, repo):
        """Return all pull requests listed for ``repo``, or ``None`` on failure.

        The endpoint's default filters are used. Pages after the first are
        followed through the ``next`` link so that repositories with many pull
        requests are not truncated.
        """
        prs_url = f"{self.API_ROOT}/repos/{repo}/pulls"
        headers = self._api_headers()

        response = self.get_with_retry(prs_url, headers)
        if response.status_code != 200:
            print(f"Failed to fetch pull requests: {response.status_code}")
            return None

        pulls = response.json()
        next_url = response.links.get('next', {}).get('url')
        if next_url:
            pulls.extend(self.get_all_pages(next_url, headers))
        return pulls

    def get_commit_info(self, repo, commit_sha):
        """Return the commit object for ``commit_sha``, or ``None`` on failure."""
        url = f"{self.API_ROOT}/repos/{repo}/commits/{commit_sha}"
        response = self.get_with_retry(url, self._api_headers())
        if response.status_code == 200:
            return response.json()

        print(f"Failed to get commit info: {response.status_code}")
        return None

    def process_repo(self, row):
        """Return the review-comment DataFrames of every pull request of one repo.

        Args:
            row: Mapping with the keys ``forked_repository`` and
                ``original_repository``.

        Returns:
            A list of DataFrames (one per pull request that has review
            comments). Any error is printed and yields an empty list, so one
            failing repository does not abort the surrounding batch.
        """
        try:
            repo = row['forked_repository']
            original_repo = row['original_repository']

            # get_pull_info returns None when the listing request fails.
            pulls_info = self.get_pull_info(repo) or []
            pull_comments_list = []

            for pull_info in pulls_info:
                pull_comments = self.get_pull_request_review_comments(repo, pull_info['number'], original_repo)
                if pull_comments is not None:
                    pull_comments_list.append(pull_comments)

            return pull_comments_list
        except Exception as e:
            # Worker boundary: report and continue with the other repositories.
            print(f"Error processing repo: {e}")
            return []

    def parse_and_save_repos_in_batches(self, batch_size=30):
        """Process all repositories in batches and write one CSV per batch.

        Each batch is processed by a pool of 10 threads and saved as
        ``repos_batch_<n>.csv``. A batch that produced no review comments is
        reported and skipped instead of being written.

        Args:
            batch_size: Number of repositories per batch; must be positive.

        Raises:
            ValueError: If ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        # Ceiling division without floating point.
        num_batches = -(-len(self.repos) // batch_size)

        for batch in range(num_batches):
            batch_repos = self.repos.iloc[batch * batch_size : (batch + 1) * batch_size]

            result = []
            with concurrent.futures.ThreadPoolExecutor(10) as executor:
                futures = [executor.submit(self.process_repo, row) for _, row in batch_repos.iterrows()]

                for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures),
                                   desc=f"Processing batch {batch+1}/{num_batches}"):
                    result.extend(future.result())

            if not result:
                # pd.concat raises on an empty list.
                print(f"Batch {batch+1} produced no review comments; nothing saved.")
                continue

            combined = pd.concat(result, ignore_index=True)
            combined.to_csv(f"repos_batch_{batch+1}.csv", index=False)

            print(f"Batch {batch+1} processed and saved.")