In [None]:
# Setup and configuration
import analysis
import pandas as pd
import matplotlib.pyplot as plt
import os

# Bootstrap the notebook: initialize_notebook() hands back a mapping of helper
# callables, which are bound to module-level names used by the later cells.
helpers = analysis.initialize_notebook()
(
    pp,
    summarize_dataset,
    analyze_resource_utilization,
    plot_resource_utilization_scatter,
    plot_resource_utilization_distribution,
    suggest_resource_optimizations,
) = (
    helpers[helper_name]
    for helper_name in (
        'pp',
        'summarize_dataset',
        'analyze_resource_utilization',
        'plot_resource_utilization_scatter',
        'plot_resource_utilization_distribution',
        'suggest_resource_optimizations',
    )
)

# Run parameters: each one falls back to a default when its environment
# variable is unset, so they can also be edited here directly.
filepath = os.environ.get("FILEPATH", "/tmp/merged.csv")
project_name = os.environ.get("PROJECT_NAME", "my-project")
credit_cost = float(os.environ.get("CREDIT_COST", "0.0006"))

# Utilization cut-offs handed to the resource analysis in a later cell
CPU_THRESHOLD = 40  # percent; passed as cpu_threshold
RAM_THRESHOLD = 40  # percent; passed as ram_threshold
MIN_JOBS = 5        # passed as min_jobs (minimum number of runs)

# Echo the effective configuration so the run is self-describing
config_report = [
    "Configuration:",
    f"  File: {filepath}",
    f"  Project: {project_name}",
    f"  Credit Cost: {credit_cost}",
    f"  CPU Threshold: {CPU_THRESHOLD}%",
    f"  RAM Threshold: {RAM_THRESHOLD}%",
    f"  Minimum Jobs: {MIN_JOBS}",
]
print("\n".join(config_report))

: 

In [None]:
# Load the data through the analysis library; it returns the full frame plus a
# mapping of derived frames keyed by name.
df, project_dfs = analysis.load_circleci_data(
    filepath=filepath, project_name=project_name, credit_cost=credit_cost
)

# The two views the rest of the notebook works with
all_jobs, ps_jobs = project_dfs['all_jobs'], project_dfs['ps_jobs']

print("Dataset Summary:")
for _frame, _label in ((all_jobs, "All jobs"), (ps_jobs, "Project-specific jobs")):
    print(summarize_dataset(_frame, _label))

# Verify that the columns required by the resource analysis are present
resource_columns = ['MEDIAN_CPU_UTILIZATION_PCT', 'MEDIAN_RAM_UTILIZATION_PCT', 'RESOURCE_CLASS']
missing_columns = [name for name in resource_columns if name not in ps_jobs.columns]

if not missing_columns:
    # Count rows where both utilization figures are populated
    has_cpu = ps_jobs['MEDIAN_CPU_UTILIZATION_PCT'].notna()
    has_ram = ps_jobs['MEDIAN_RAM_UTILIZATION_PCT'].notna()
    resource_data_count = ps_jobs[has_cpu & has_ram].shape[0]
    print(f"✅ Found {resource_data_count:,} jobs with resource utilization data")
else:
    print(f"⚠️  Missing columns for resource analysis: {missing_columns}")
    print("Please ensure your data includes CPU/RAM utilization and resource class information")

In [None]:
# Perform comprehensive resource utilization analysis.
#
# A falsy result (for example None) is normalised to an empty dict: the cells
# below probe the result with `'key' in resource_analysis`, which would raise
# TypeError on None instead of skipping gracefully.
resource_analysis = analyze_resource_utilization(
    ps_jobs,
    cpu_threshold=CPU_THRESHOLD,
    ram_threshold=RAM_THRESHOLD,
    min_jobs=MIN_JOBS
) or {}

if not resource_analysis:
    print("❌ Unable to perform resource analysis - no data available")
else:
    print("✅ Resource analysis completed successfully")

    # Row counts of the per-job stats table and of each "underutilized" subset
    total_jobs = len(resource_analysis['job_resource_stats'])
    underutilized_cpu_count = len(resource_analysis['underutilized_cpu'])
    underutilized_ram_count = len(resource_analysis['underutilized_ram'])
    underutilized_both_count = len(resource_analysis['underutilized_both'])

    print("\n📊 Resource Utilization Summary:")
    print(f"  Total job types analyzed: {total_jobs}")
    print(f"  Jobs with low CPU utilization (<{CPU_THRESHOLD}%): {underutilized_cpu_count}")
    print(f"  Jobs with low RAM utilization (<{RAM_THRESHOLD}%): {underutilized_ram_count}")
    print(f"  Jobs with low CPU AND RAM utilization: {underutilized_both_count}")

In [None]:
# Show the jobs flagged as low on both CPU and RAM, plus their combined cost
has_low_both = (
    'underutilized_both' in resource_analysis
    and not resource_analysis['underutilized_both'].empty
)

if not has_low_both:
    print(f"✅ No jobs found that underutilize both CPU and RAM below {CPU_THRESHOLD}%/{RAM_THRESHOLD}%")
else:
    underutilized_both = resource_analysis['underutilized_both']

    # Columns to display, in presentation order
    both_columns = [
        'JOB_NAME',
        'RESOURCE_CLASS',
        'MEDIAN_CPU_UTILIZATION_PCT_mean',
        'MEDIAN_RAM_UTILIZATION_PCT_mean',
        'MEDIAN_CPU_UTILIZATION_PCT_count',
        'COST_sum',
        'COST_mean',
        'JOB_RUN_SECONDS_mean',
    ]
    pp(
        underutilized_both[both_columns].head(20),
        f"Jobs with <{CPU_THRESHOLD}% CPU AND <{RAM_THRESHOLD}% RAM utilization",
    )

    # Sum COST_sum over every flagged job (not only the 20 rows displayed)
    total_waste_cost = underutilized_both['COST_sum'].sum()
    print(f"\n💰 Total cost of underutilized jobs: ${total_waste_cost:.2f}")
    print("🎯 Potential optimization opportunity if resource classes were reduced")

In [None]:
# Show the jobs flagged as low on CPU utilization
has_low_cpu = (
    'underutilized_cpu' in resource_analysis
    and not resource_analysis['underutilized_cpu'].empty
)

if not has_low_cpu:
    print(f"✅ No jobs found with CPU utilization below {CPU_THRESHOLD}%")
else:
    underutilized_cpu = resource_analysis['underutilized_cpu']

    # Columns to display, in presentation order
    cpu_columns = [
        'JOB_NAME',
        'RESOURCE_CLASS',
        'MEDIAN_CPU_UTILIZATION_PCT_mean',
        'MEDIAN_RAM_UTILIZATION_PCT_mean',
        'MEDIAN_CPU_UTILIZATION_PCT_count',
        'COST_sum',
    ]
    pp(
        underutilized_cpu[cpu_columns].head(15),
        f"Jobs with <{CPU_THRESHOLD}% CPU utilization",
    )

In [None]:
# Show the jobs flagged as low on RAM utilization
has_low_ram = (
    'underutilized_ram' in resource_analysis
    and not resource_analysis['underutilized_ram'].empty
)

if not has_low_ram:
    print(f"✅ No jobs found with RAM utilization below {RAM_THRESHOLD}%")
else:
    underutilized_ram = resource_analysis['underutilized_ram']

    # Columns to display, in presentation order
    ram_columns = [
        'JOB_NAME',
        'RESOURCE_CLASS',
        'MEDIAN_CPU_UTILIZATION_PCT_mean',
        'MEDIAN_RAM_UTILIZATION_PCT_mean',
        'MEDIAN_CPU_UTILIZATION_PCT_count',
        'COST_sum',
    ]
    pp(
        underutilized_ram[ram_columns].head(15),
        f"Jobs with <{RAM_THRESHOLD}% RAM utilization",
    )

In [None]:
# Summarise utilization per resource class, most expensive class first
if 'resource_class_stats' in resource_analysis:
    resource_class_stats = resource_analysis['resource_class_stats']

    # Columns to display, in presentation order
    class_columns = [
        'RESOURCE_CLASS',
        'UNIQUE_JOBS',
        'MEDIAN_CPU_UTILIZATION_PCT_mean',
        'MEDIAN_RAM_UTILIZATION_PCT_mean',
        'MEDIAN_CPU_UTILIZATION_PCT_std',
        'MEDIAN_RAM_UTILIZATION_PCT_std',
        'COST_sum',
    ]
    class_summary = resource_class_stats[class_columns].sort_values('COST_sum', ascending=False)
    pp(class_summary, "Resource Class Utilization Summary")

    # Reader-facing legend for the table above
    interpretation_notes = (
        "\n📈 Interpretation:",
        "- UNIQUE_JOBS: Number of different job types using this resource class",
        "- *_mean: Average utilization across all jobs using this resource class",
        "- *_std: Standard deviation (higher = more variation in utilization)",
        "- COST_sum: Total cost for all jobs using this resource class",
    )
    print("\n".join(interpretation_notes))

In [None]:
# Scatter of CPU vs RAM utilization, drawn from the per-job statistics table
if 'job_resource_stats' in resource_analysis:
    per_job_stats = resource_analysis['job_resource_stats']
    plot_resource_utilization_scatter(
        per_job_stats,
        title="CPU vs RAM Utilization by Job and Resource Class",
    )

    print("💡 Jobs in the bottom-left quadrant (low CPU AND low RAM) are prime candidates for resource class optimization")

In [None]:
# Distribution of the CPU utilization column from the raw resource data
if 'raw_resource_data' in resource_analysis:
    raw_data = resource_analysis['raw_resource_data']
    cpu_utilization = raw_data['MEDIAN_CPU_UTILIZATION_PCT']
    plot_resource_utilization_distribution(cpu_utilization, resource_type="CPU")

In [None]:
# Plot RAM utilization distribution.
#
# The frame is looked up from resource_analysis inside this cell rather than
# relying on a `raw_data` variable assigned by a different cell, so the cell
# does not raise NameError when executed on its own or out of order.
if 'raw_resource_data' in resource_analysis:
    raw_data = resource_analysis['raw_resource_data']
    plot_resource_utilization_distribution(
        raw_data['MEDIAN_RAM_UTILIZATION_PCT'],
        resource_type="RAM"
    )

In [None]:
# Print recommendations for the jobs flagged as low on both CPU and RAM
if 'underutilized_both' in resource_analysis:
    print("🎯 OPTIMIZATION RECOMMENDATIONS")
    print("=" * 50)

    low_both = resource_analysis['underutilized_both']
    suggest_resource_optimizations(
        low_both,
        cost_savings_threshold=10,  # Focus on jobs costing at least $10
    )

    # Aggregate figures are only reported when at least one job was flagged
    if not low_both.empty:
        total_underutilized_cost = low_both['COST_sum'].sum()
        avg_cpu_util = low_both['MEDIAN_CPU_UTILIZATION_PCT_mean'].mean()
        avg_ram_util = low_both['MEDIAN_RAM_UTILIZATION_PCT_mean'].mean()

        insight_lines = [
            "📊 SUMMARY INSIGHTS:",
            f"   • Average CPU utilization of underutilized jobs: {avg_cpu_util:.1f}%",
            f"   • Average RAM utilization of underutilized jobs: {avg_ram_util:.1f}%",
            f"   • Total cost of underutilized jobs: ${total_underutilized_cost:.2f}",
            "   • Potential savings from right-sizing: 20-40% of total cost",
        ]
        print("\n".join(insight_lines))

In [None]:
# Drill into jobs whose name matches a pattern.
# Edit JOB_NAME_PATTERN to change which jobs are shown; it is passed to
# Series.str.contains with case=False.
JOB_NAME_PATTERN = "test"  # Change this to analyze specific job patterns

if 'job_resource_stats' in resource_analysis:
    job_stats = resource_analysis['job_resource_stats']

    # Boolean mask of rows whose JOB_NAME matches; missing names count as no match
    name_matches = job_stats['JOB_NAME'].str.contains(JOB_NAME_PATTERN, case=False, na=False)
    matching_jobs = job_stats[name_matches]

    if matching_jobs.empty:
        print(f"No jobs found matching pattern: '{JOB_NAME_PATTERN}'")
        # Hint at usable patterns: the first run of letters in each job name,
        # with the most frequent ones listed first
        leading_words = job_stats['JOB_NAME'].str.extract(r'([a-zA-Z]+)')[0]
        print("Available job patterns:", leading_words.value_counts().head())
    else:
        # Columns to display, in presentation order
        pattern_columns = [
            'JOB_NAME',
            'RESOURCE_CLASS',
            'MEDIAN_CPU_UTILIZATION_PCT_mean',
            'MEDIAN_RAM_UTILIZATION_PCT_mean',
            'MAX_CPU_UTILIZATION_PCT_mean',
            'MAX_RAM_UTILIZATION_PCT_mean',
            'COST_sum',
            'JOB_RUN_SECONDS_mean',
        ]
        pp(
            matching_jobs[pattern_columns].sort_values('COST_sum', ascending=False),
            f"Jobs matching pattern: '{JOB_NAME_PATTERN}'",
        )