In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('github_issues.csv', low_memory=False)
df.describe()

issues_df = df[df['type'] == 'issue']
prs_df = df[df['type'] == 'pull_request']

print(f"Issues total records, {len(issues_df)}")
print(f"Pull requests total records, {len(prs_df)}")
print(f"Value counts for type column {df['type'].value_counts()}")

Issues total records, 70543
Pull requests total records, 29300
Value counts for type column type
issue           70543
pull_request    29300
Name: count, dtype: int64


In [3]:
# The number of PRs, that does require skip-news label and do not require skip-news label
skip_count = prs_df[prs_df["labels"].fillna("").str.contains("skip news", case=False, na=False)]
non_skip_count = prs_df[~prs_df["labels"].fillna("").str.contains("skip news", case=False, na=False)]

total_prs = len(prs_df)
skip_percentage = (len(skip_count) / total_prs) * 100
non_skip_percentage = (len(non_skip_count) / total_prs) * 100

print(f"PRs WITH skip news label: {len(skip_count)} ({skip_percentage:.1f}%)")
print(f"PRs WITHOUT skip news label: {len(non_skip_count)} ({non_skip_percentage:.1f}%)")
print(f"Total PRs: {total_prs}")


PRs WITH skip news label: 14967 (51.1%)
PRs WITHOUT skip news label: 14333 (48.9%)
Total PRs: 29300


In [4]:
# The number of Issues, that does require skip-news label and do not require skip-news label
skip_count = issues_df[issues_df["labels"].fillna("").str.contains("skip news", case=False, na=False)]
non_skip_count = issues_df[~issues_df["labels"].fillna("").str.contains("skip news", case=False, na=False)]

total_prs = len(issues_df)
skip_percentage = (len(skip_count) / total_prs) * 100
non_skip_percentage = (len(non_skip_count) / total_prs) * 100

print(f"Issues WITH skip news label: {len(skip_count)} ({skip_percentage:.1f}%)")
print(f"Issues WITHOUT skip news label: {len(non_skip_count)} ({non_skip_percentage:.1f}%)")
print(f"Total Issues: {total_prs}")


Issues WITH skip news label: 4 (0.0%)
Issues WITHOUT skip news label: 70539 (100.0%)
Total Issues: 70543


In [5]:
# Value counts for PRs
print(f"Value counts for PRs: {prs_df['pr_status'].value_counts()}")

Value counts for PRs: pr_status
merged    24017
open       5283
Name: count, dtype: int64


In [6]:
# Value counts for Issues
def analyze_github_issues(df):
    # Create a copy to avoid SettingWithCopyWarning
    df = df.copy()
    
    # Convert date columns to datetime
    df.loc[:, 'created_at'] = pd.to_datetime(df['created_at'])
    df.loc[:, 'closed_at'] = pd.to_datetime(df['closed_at'])
    
    # Create month columns
    # df.loc[:, 'created_month'] = df['created_at'].fillna("12-12-1800").dt.to_period('M')
    # df.loc[:, 'closed_month'] = df['closed_at'].dt.to_period('M')
    
    # Calculate resolution time
    # df.loc[:, 'resolution_days'] = (df['closed_at'] - df['created_at']).dt.total_seconds() / (24 * 60 * 60)
    
    # Calculate monthly statistics
    # monthly_stats = df.groupby('created_month').agg({
    #     'id': 'count',
    #     'state': lambda x: (x == 'CLOSED').sum()
    # }).rename(columns={
    #     'id': 'created',
    #     'state': 'closed'
    # })
    
    # Calculate average resolution time
    avg_resolution_time = df[df['resolution_days'].notna()]['resolution_days'].mean()
    
    return monthly_stats, 12

def plot_issue_trends(monthly_stats, avg_resolution_time):
    # Set the style
    sns.set_style("whitegrid")
    
    # Create figure and axis objects with a single subplot
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), height_ratios=[2, 1])
    
    # Plot created and closed issues
    monthly_stats['created'].plot(ax=ax1, label='Created', marker='o')
    monthly_stats['closed'].plot(ax=ax1, label='Closed', marker='o')
    
    ax1.set_title('GitHub Issues Trends')
    ax1.set_xlabel('Month')
    ax1.set_ylabel('Number of Issues')
    ax1.legend()
    
    # Plot ratio of closed to created issues
    ratio = monthly_stats['closed'] / monthly_stats['created']
    ratio.plot(ax=ax2, label='Closed/Created Ratio', marker='o')
    ax2.axhline(y=1, color='r', linestyle='--', label='1:1 Ratio')
    
    ax2.set_title(f'Closed/Created Ratio (Avg Resolution Time: {avg_resolution_time:.1f} days)')
    ax2.set_xlabel('Month')
    ax2.set_ylabel('Ratio')
    ax2.legend()
    
    plt.tight_layout()
    return fig

    

In [7]:
monthly_stats, avg_resolution_time = analyze_github_issues(issues_df)
fig = plot_issue_trends(monthly_stats, avg_resolution_time)
plt.show()


KeyError: 'resolution_days'

In [None]:
monthly_stats, avg_resolution_time = analyze_github_issues(prs_df)
fig = plot_issue_trends(monthly_stats, avg_resolution_time)
plt.show()


In [None]:
def create_analysis_visualizations(issues_data):
    # Convert to DataFrame
    df = pd.DataFrame(issues_data)
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['closed_at'] = pd.to_datetime(df['closed_at'])
    df['created_month'] = df['created_at'].dt.to_period('M')
    df['created_weekday'] = df['created_at'].dt.day_name()
    df['resolution_days'] = (df['closed_at'] - df['created_at']).dt.total_seconds() / (24 * 60 * 60)

    # Create figure with subplots
    fig = plt.figure(figsize=(20, 15))
    gs = fig.add_gridspec(3, 2, hspace=0.3)

    # 1. Issue Activity Heatmap
    ax1 = fig.add_subplot(gs[0, 0])
    monthly_activity = df.groupby(['created_month']).size().reset_index(name='count')
    monthly_activity['month_num'] = monthly_activity.index
    heatmap_data = monthly_activity.pivot_table(
        values='count', 
        index=monthly_activity['month_num'] // 12,
        columns=monthly_activity['month_num'] % 12 + 1,
    ).fillna(0)
    
    sns.heatmap(heatmap_data, cmap='YlOrRd', ax=ax1)
    ax1.set_title('Issue Activity Heatmap (Years × Months)')
    ax1.set_xlabel('Month')
    ax1.set_ylabel('Year')

    # 2. Resolution Time Distribution
    ax2 = fig.add_subplot(gs[0, 1])
    closed_issues = df[df['closed_at'].notna()]
    sns.histplot(data=closed_issues, x='resolution_days', bins=30, ax=ax2)
    ax2.set_title('Distribution of Issue Resolution Times')
    ax2.set_xlabel('Days to Resolution')
    ax2.set_ylabel('Number of Issues')

    # 3. Open vs Closed Issues Over Time
    ax3 = fig.add_subplot(gs[1, 0])
    df['cumulative_created'] = range(1, len(df) + 1)
    closed_cumulative = df[df['closed_at'].notna()].sort_values('closed_at')
    closed_cumulative['cumulative_closed'] = range(1, len(closed_cumulative) + 1)
    
    plt.plot(df['created_at'], df['cumulative_created'], label='Created')
    if not closed_cumulative.empty:
        plt.plot(closed_cumulative['closed_at'], closed_cumulative['cumulative_closed'], label='Closed')
    ax3.set_title('Cumulative Issues Over Time')
    ax3.set_xlabel('Date')
    ax3.set_ylabel('Number of Issues')
    ax3.legend()

    # 4. Weekly Pattern
    ax4 = fig.add_subplot(gs[1, 1])
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekday_counts = df['created_weekday'].value_counts().reindex(weekday_order)
    sns.barplot(x=weekday_counts.index, y=weekday_counts.values, ax=ax4)
    ax4.set_title('Issues Created by Day of Week')
    ax4.set_xticklabels(weekday_order, rotation=45)
    ax4.set_ylabel('Number of Issues')

    # 5. Label Distribution
    ax5 = fig.add_subplot(gs[2, 0])
    labels = [label for sublist in df['labels'] for label in sublist]
    label_counts = pd.Series(labels).value_counts()
    sns.barplot(x=label_counts.values, y=label_counts.index, ax=ax5)
    ax5.set_title('Distribution of Issue Labels')
    ax5.set_xlabel('Number of Issues')

    # 6. Monthly Velocity
    ax6 = fig.add_subplot(gs[2, 1])
    monthly_created = df.groupby('created_month').size()
    monthly_closed = df[df['closed_at'].notna()].groupby(df['closed_at'].dt.to_period('M')).size()
    monthly_velocity = pd.DataFrame({
        'Created': monthly_created,
        'Closed': monthly_closed
    }).fillna(0)
    
    monthly_velocity.plot(kind='bar', ax=ax6)
    ax6.set_title('Monthly Issue Velocity')
    ax6.set_xlabel('Month')
    ax6.set_ylabel('Number of Issues')
    ax6.tick_params(axis='x', rotation=45)
    ax6.legend()

    plt.tight_layout()
    return fig

In [None]:
fig = create_analysis_visualizations(issues_df)
plt.show()
