In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw export once, then split it into issues and pull requests
# according to the value of the 'type' column.
df = pd.read_csv('github_rust_issues.csv', low_memory=False)

issues_df = df.loc[df['type'].eq('issue')]
prs_df = df.loc[df['type'].eq('pull_request')]

# Sanity check: report the size of each subset and the raw type distribution.
print(f"Issues total records, {len(issues_df)}")
print(f"Pull requests total records, {len(prs_df)}")
print(f"Value counts for type column {df['type'].value_counts()}")

Issues total records, 55917
Pull requests total records, 55744
Value counts for type column type
issue           55917
pull_request    55744
Name: count, dtype: int64


In [3]:
# Count the PRs that carry the "skip news" label and the PRs that do not.
# The mask is built once and reused; rows with missing labels count as
# "no label", and the match is a case-insensitive substring search.
has_skip_news = prs_df["labels"].fillna("").str.contains("skip news", case=False, na=False)
skip_count = prs_df[has_skip_news]        # rows (not a number) with the label
non_skip_count = prs_df[~has_skip_news]   # rows (not a number) without the label

total_prs = len(prs_df)
# Guard the division so an empty frame reports 0.0% instead of raising
# ZeroDivisionError.
skip_percentage = (len(skip_count) / total_prs) * 100 if total_prs else 0.0
non_skip_percentage = (len(non_skip_count) / total_prs) * 100 if total_prs else 0.0

print(f"PRs WITH skip news label: {len(skip_count)} ({skip_percentage:.1f}%)")
print(f"PRs WITHOUT skip news label: {len(non_skip_count)} ({non_skip_percentage:.1f}%)")
print(f"Total PRs: {total_prs}")


PRs WITH skip news label: 0 (0.0%)
PRs WITHOUT skip news label: 55744 (100.0%)
Total PRs: 55744


In [4]:
# Count the issues that carry the "skip news" label and the issues that do not.
# The mask is built once and reused; rows with missing labels count as
# "no label", and the match is a case-insensitive substring search.
has_skip_news = issues_df["labels"].fillna("").str.contains("skip news", case=False, na=False)
skip_count = issues_df[has_skip_news]        # rows (not a number) with the label
non_skip_count = issues_df[~has_skip_news]   # rows (not a number) without the label

# NOTE: despite its name, `total_prs` holds the number of *issues* in this cell;
# the name is kept unchanged so any later use of it keeps working.
total_prs = len(issues_df)
# Guard the division so an empty frame reports 0.0% instead of raising
# ZeroDivisionError.
skip_percentage = (len(skip_count) / total_prs) * 100 if total_prs else 0.0
non_skip_percentage = (len(non_skip_count) / total_prs) * 100 if total_prs else 0.0

print(f"Issues WITH skip news label: {len(skip_count)} ({skip_percentage:.1f}%)")
print(f"Issues WITHOUT skip news label: {len(non_skip_count)} ({non_skip_percentage:.1f}%)")
print(f"Total Issues: {total_prs}")


Issues WITH skip news label: 0 (0.0%)
Issues WITHOUT skip news label: 55917 (100.0%)
Total Issues: 55917


In [5]:
# Distribution of pull requests across the values of the 'pr_status' column.
pr_status_counts = prs_df['pr_status'].value_counts()
print(f"Value counts for PRs: {pr_status_counts}")

Value counts for PRs: pr_status
merged    41514
open      14230
Name: count, dtype: int64


In [6]:
# Helper functions: monthly created/closed statistics and trend plots for issues or pull requests
def analyze_github_data(df, data_type="Issues"):
    """Compute monthly created/closed counts and the mean resolution time.

    Parameters
    ----------
    df : pandas.DataFrame
        Records with at least the columns 'id', 'state', 'created_at' and
        'closed_at'. The caller's frame is not modified.
    data_type : str, default "Issues"
        Currently unused by this function; kept so existing calls that pass
        it continue to work.

    Returns
    -------
    monthly_stats : pandas.DataFrame
        Indexed by the month of creation ('created_month'), with columns
        'created' (number of non-null ids created that month) and 'closed'
        (how many of those rows have state 'CLOSED', compared
        case-insensitively).
    avg_resolution_time : float
        Mean number of days between 'created_at' and 'closed_at' over rows
        where both parse as dates; NaN when no such row exists.
    """
    # Assign whole columns (not `df.loc[:, col] = ...`): the `.loc` form writes
    # in place and can leave an object-dtype column holding Timestamps, which
    # then breaks the `.dt` accessor. `assign` also returns a new frame, so the
    # caller's data is left untouched.
    df = df.assign(
        created_at=pd.to_datetime(df['created_at'], errors='coerce'),
        closed_at=pd.to_datetime(df['closed_at'], errors='coerce'),
    )

    # Rows whose creation date is missing or unparseable cannot be bucketed by
    # month; drop them. The explicit copy avoids SettingWithCopyWarning below.
    df = df[df['created_at'].notna()].copy()

    df['created_month'] = df['created_at'].dt.to_period('M')

    # Subtracting NaT yields NaT, which becomes NaN here, so rows that were
    # never closed are skipped by mean() and the column stays float.
    df['resolution_days'] = (
        df['closed_at'] - df['created_at']
    ).dt.total_seconds() / (24 * 60 * 60)

    # Case-insensitive so both 'CLOSED' and 'closed' spellings are counted.
    df['is_closed'] = df['state'].astype(str).str.upper().eq('CLOSED')

    monthly_stats = df.groupby('created_month').agg(
        created=('id', 'count'),
        closed=('is_closed', 'sum'),
    )

    avg_resolution_time = df['resolution_days'].mean()

    return monthly_stats, avg_resolution_time

def plot_issue_trends(monthly_stats, avg_resolution_time, data_type="Issues"):
    """Draw a two-panel figure of monthly created/closed counts and their ratio.

    Parameters
    ----------
    monthly_stats : pandas.DataFrame
        Frame with 'created' and 'closed' columns; its index is used as the
        x-axis (labelled 'Month').
    avg_resolution_time : float
        Average resolution time in days, shown in the lower panel's title.
        A missing value (NaN / pd.NA / None) is rendered as 'n/a'.
    data_type : str, default "Issues"
        Noun used in the upper panel's title and y-axis label, so the same
        function can label pull-request plots correctly. The default
        reproduces the original 'GitHub Issues Trends' wording.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding both panels; the caller decides when to show it.
    """
    # Note: this changes seaborn's global style for subsequent plots as well.
    sns.set_style("whitegrid")

    # Two stacked panels, the upper one twice as tall as the lower one.
    # `gridspec_kw` is used instead of the `height_ratios` keyword so the call
    # also works on matplotlib versions that predate that shortcut.
    fig, (ax1, ax2) = plt.subplots(
        2, 1, figsize=(12, 10), gridspec_kw={'height_ratios': [2, 1]}
    )

    # Upper panel: created vs. closed counts per month.
    monthly_stats['created'].plot(ax=ax1, label='Created', marker='o')
    monthly_stats['closed'].plot(ax=ax1, label='Closed', marker='o')

    ax1.set_title(f'GitHub {data_type} Trends')
    ax1.set_xlabel('Month')
    ax1.set_ylabel(f'Number of {data_type}')
    ax1.legend()

    # Lower panel: closed/created ratio with a dashed reference line at 1.
    ratio = monthly_stats['closed'] / monthly_stats['created']
    ratio.plot(ax=ax2, label='Closed/Created Ratio', marker='o')
    ax2.axhline(y=1, color='r', linestyle='--', label='1:1 Ratio')

    # A missing average cannot always be formatted with ':.1f' (pd.NA and None
    # raise), so fall back to a plain placeholder.
    if pd.isna(avg_resolution_time):
        avg_label = 'n/a'
    else:
        avg_label = f'{avg_resolution_time:.1f} days'
    ax2.set_title(f'Closed/Created Ratio (Avg Resolution Time: {avg_label})')
    ax2.set_xlabel('Month')
    ax2.set_ylabel('Ratio')
    ax2.legend()

    plt.tight_layout()
    return fig

    

In [7]:
# Issue trends: compute the monthly statistics, then render the two-panel figure.
monthly_stats, avg_resolution_time = analyze_github_data(issues_df)
fig = plot_issue_trends(
    monthly_stats=monthly_stats,
    avg_resolution_time=avg_resolution_time,
)
plt.show()


NameError: name 'analyze_github_issues' is not defined

In [None]:
# Pull-request trends. The helper defined in this notebook is named
# `analyze_github_data`; the previous call to `analyze_github_issues` referred
# to a function that does not exist and raised NameError.
# Note: this rebinds `monthly_stats` / `avg_resolution_time` to the PR results.
monthly_stats, avg_resolution_time = analyze_github_data(prs_df)
fig = plot_issue_trends(monthly_stats, avg_resolution_time)
plt.show()
