In [32]:
from getpass import getpass

ACCESS_TOKEN = getpass("GitHub Access token")

GitHub Access token ········


In [45]:
import requests
import pandas as pd
from datetime import datetime, timedelta

# Set up GitHub API
GITHUB_API_URL = "https://api.github.com"
ORG_NAME = "serlo"
REPOS_ENDPOINT = f"{GITHUB_API_URL}/orgs/{ORG_NAME}/repos"

# Create headers with access token
headers = {
    "Authorization": f"Bearer {ACCESS_TOKEN}"
}

# Function to get all repositories of the organization
def get_repos():
    repos = []
    page = 1
    while True:
        response = requests.get(REPOS_ENDPOINT, headers=headers, params={"page": page, "per_page": 100})
        if response.status_code != 200:
            print(f"Failed to fetch repositories: {response.status_code}")
            print(response.text)
            break
        response_data = response.json()
        if not response_data:
            break
        repos.extend(response_data)
        page += 1
    return repos

# Function to get all pull requests for a given repository
def get_pull_requests(repo_name):
    PULLS_ENDPOINT = f"{GITHUB_API_URL}/repos/{ORG_NAME}/{repo_name}/pulls"
    prs = []
    page = 1
    while True:
        response = requests.get(PULLS_ENDPOINT, headers=headers, params={"state": "all", "page": page, "per_page": 100})
        if response.status_code != 200:
            print(f"Failed to fetch pull requests for {repo_name}: {response.status_code}")
            break
        response_data = response.json()
        if not response_data:
            break
        prs.extend(response_data)
        page += 1
    return prs

# Function to get detailed information about a PR
def get_pr_details(repo_name, pr_number):
    PR_URL = f"{GITHUB_API_URL}/repos/{ORG_NAME}/{repo_name}/pulls/{pr_number}"
    response = requests.get(PR_URL, headers=headers)
    if response.status_code != 200:
        print(f"Failed to fetch PR details for PR #{pr_number} in {repo_name}: {response.status_code}")
        return None
    pr_data = response.json()
    
    # Extracting required details
    commits_count = pr_data['commits']
    additions = pr_data['additions']
    deletions = pr_data['deletions']
    
    # Get commits details
    commits_url = pr_data['commits_url']
    commits_response = requests.get(commits_url, headers=headers)
    if commits_response.status_code != 200:
        print(f"Failed to fetch commits for PR #{pr_number} in {repo_name}: {commits_response.status_code}")
        return None
    commits_data = commits_response.json()
    
    first_commit_date = commits_data[0]['commit']['author']['date'] if commits_data else None
    last_commit_date = commits_data[-1]['commit']['author']['date'] if commits_data else None

    return {
        "title": pr_data['title'],
        "author": pr_data['user']['login'],
        "url": pr_data["url"],
        "repo_name": repo_name,
        "created_at": pr_data['created_at'],
        "commits_count": commits_count,
        "additions": additions,
        "deletions": deletions,
        "total_changes": additions + deletions,
        "first_commit_date": first_commit_date,
        "last_commit_date": last_commit_date
    }

# Calculate the date 9 months ago
nine_months_ago = datetime.now() - timedelta(days=9*30)
since_date = nine_months_ago.isoformat()

all_pr_details = []

repos = get_repos()
print(f"Found {len(repos)} repositories in the organization {ORG_NAME}.")

for repo in repos:
    repo_name = repo['name']
    all_prs = get_pull_requests(repo_name)
    print(f"Found {len(all_prs)} pull requests in the repository {repo_name}.")

    prs = [pr for pr in all_prs if pr["created_at"] >= since_date]
    print(f"Found {len(prs)} pull requests in the repository {repo_name} after {since_date}.")
    
    for pr in prs:
        pr_details = get_pr_details(repo_name, pr['number'])
        if pr_details:
            all_pr_details.append(pr_details)

# Create a pandas DataFrame
df = pd.DataFrame(all_pr_details)

# Save DataFrame to a CSV file
df.to_csv(f"/tmp/{ORG_NAME}_pr_details.csv", index=False)

df.head()

Found 109 repositories in the organization serlo.
Found 108 pull requests in the repository athene2-legacy.
Found 0 pull requests in the repository athene2-legacy after 2023-09-20T22:26:31.628412.
Found 0 pull requests in the repository athene2-scribbles.
Found 0 pull requests in the repository athene2-scribbles after 2023-09-20T22:26:31.628412.
Found 11 pull requests in the repository athene2-guide.
Found 0 pull requests in the repository athene2-guide after 2023-09-20T22:26:31.628412.
Found 19 pull requests in the repository athene2-editor.
Found 0 pull requests in the repository athene2-editor after 2023-09-20T22:26:31.628412.
Found 0 pull requests in the repository athene2-class-resolver.
Found 0 pull requests in the repository athene2-class-resolver after 2023-09-20T22:26:31.628412.
Found 1 pull requests in the repository athene2-versioning.
Found 0 pull requests in the repository athene2-versioning after 2023-09-20T22:26:31.628412.
Found 0 pull requests in the repository athene2-

Unnamed: 0,title,author,url,repo_name,created_at,commits_count,additions,deletions,total_changes,first_commit_date,last_commit_date
0,docs: add an obsolence notice to README.md,AndreasHuber,https://api.github.com/repos/serlo/stats.serlo...,stats.serlo.org,2024-01-19T09:46:54Z,1,3,0,3,2024-01-19T09:46:02Z,2024-01-19T09:46:02Z
1,Add deprecation note,hugotiburtino,https://api.github.com/repos/serlo/infrastruct...,infrastructure-images,2024-01-27T04:11:16Z,1,4,0,4,2024-01-27T04:11:08Z,2024-01-27T04:11:08Z
2,fix: Remove reference to user_field from dbdump,hugotiburtino,https://api.github.com/repos/serlo/infrastruct...,infrastructure-images,2024-01-08T03:20:57Z,1,1,3,4,2024-01-08T03:19:05Z,2024-01-08T03:19:05Z
3,fix(dbdump): remove deleted tables from tables...,AndreasHuber,https://api.github.com/repos/serlo/infrastruct...,infrastructure-images,2024-01-04T12:03:15Z,4,11,6,17,2024-01-04T11:59:21Z,2024-01-08T03:00:11Z
4,update db-migrations to 0.9.0,AndreasHuber,https://api.github.com/repos/serlo/infrastruct...,infrastructure-env-production,2024-01-23T15:49:27Z,1,1,1,2,2024-01-23T15:48:15Z,2024-01-23T15:48:15Z


In [47]:
import time

all_pr_details2 = [pr for pr in all_pr_details if pr["repo_name"] != "local-dev-env"]

repos = get_repos()
print(f"Found {len(repos)} repositories in the organization {ORG_NAME}.")

for repo in repos:
    if any(pr["repo_name"] == repo["name"] for pr in all_pr_details2):
        continue
    if repo["updated_at"] <= since_date:
        continue
    
    repo_name = repo['name']
    all_prs = get_pull_requests(repo_name)
    print(f"Found {len(all_prs)} pull requests in the repository {repo_name}.")

    prs = [pr for pr in all_prs if pr["created_at"] >= since_date]
    print(f"Found {len(prs)} pull requests in the repository {repo_name} after {since_date}.")
    
    for pr in prs:
        pr_details = get_pr_details(repo_name, pr['number'])
        if pr_details:
            all_pr_details2.append(pr_details)

    time.sleep(30)

# Create a pandas DataFrame
df = pd.DataFrame(all_pr_details2)

# Save DataFrame to a CSV file
df.to_csv(f"/tmp/{ORG_NAME}_pr_details2.csv", index=False)

df.head()

Found 109 repositories in the organization serlo.
Found 415 pull requests in the repository athene2.
Found 0 pull requests in the repository athene2 after 2023-09-20T22:26:31.628412.
Found 25 pull requests in the repository PlantBuddies.
Found 0 pull requests in the repository PlantBuddies after 2023-09-20T22:26:31.628412.
Found 155 pull requests in the repository serlo-abc.
Found 0 pull requests in the repository serlo-abc after 2023-09-20T22:26:31.628412.
Found 0 pull requests in the repository mediawiki-parser.
Found 0 pull requests in the repository mediawiki-parser after 2023-09-20T22:26:31.628412.
Found 26 pull requests in the repository overview.serlo.org.
Found 0 pull requests in the repository overview.serlo.org after 2023-09-20T22:26:31.628412.
Found 0 pull requests in the repository vm-configuration.
Found 0 pull requests in the repository vm-configuration after 2023-09-20T22:26:31.628412.
Found 371 pull requests in the repository serlo.org-legacy.
Found 0 pull requests in t

Unnamed: 0,title,author,url,repo_name,created_at,commits_count,additions,deletions,total_changes,first_commit_date,last_commit_date
0,docs: add an obsolence notice to README.md,AndreasHuber,https://api.github.com/repos/serlo/stats.serlo...,stats.serlo.org,2024-01-19T09:46:54Z,1,3,0,3,2024-01-19T09:46:02Z,2024-01-19T09:46:02Z
1,Add deprecation note,hugotiburtino,https://api.github.com/repos/serlo/infrastruct...,infrastructure-images,2024-01-27T04:11:16Z,1,4,0,4,2024-01-27T04:11:08Z,2024-01-27T04:11:08Z
2,fix: Remove reference to user_field from dbdump,hugotiburtino,https://api.github.com/repos/serlo/infrastruct...,infrastructure-images,2024-01-08T03:20:57Z,1,1,3,4,2024-01-08T03:19:05Z,2024-01-08T03:19:05Z
3,fix(dbdump): remove deleted tables from tables...,AndreasHuber,https://api.github.com/repos/serlo/infrastruct...,infrastructure-images,2024-01-04T12:03:15Z,4,11,6,17,2024-01-04T11:59:21Z,2024-01-08T03:00:11Z
4,update db-migrations to 0.9.0,AndreasHuber,https://api.github.com/repos/serlo/infrastruct...,infrastructure-env-production,2024-01-23T15:49:27Z,1,1,1,2,2024-01-23T15:48:15Z,2024-01-23T15:48:15Z
