In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import os

def analyze_gaps(df, column='pm2_5'):
    """
    Analyzes gaps in the specified DataFrame column and returns the distribution of their lengths.

    A gap is a run of consecutive rows, in row order, whose value in `column`
    is NaN. Lengths are counted in rows.

    Parameters:
    - df: DataFrame with data
    - column: name of the column to analyze (default 'pm2_5')

    Returns:
    - pd.Series indexed by gap length (ascending) with the number of gaps of
      that length. Empty, with an integer index, when there are no gaps.
    """
    # Work on a positional index so the grouping below cannot be confused by
    # duplicate or unordered index labels in the caller's frame.
    is_missing = df[column].isna().reset_index(drop=True)

    # A gap starts on every missing row whose predecessor is not missing.
    # astype(bool) keeps `~` a logical NOT even if shift() widened the dtype.
    previous_missing = is_missing.shift(fill_value=False).astype(bool)
    gap_id = (is_missing & ~previous_missing).cumsum()

    # Every missing row carries the id of its gap; the group size is the gap length.
    gap_sizes = is_missing[is_missing].groupby(gap_id[is_missing]).size()

    # An explicit dtype keeps the (possibly empty) result integer-indexed.
    gap_lengths = pd.Series(gap_sizes.tolist(), dtype='int64').value_counts().sort_index()
    return gap_lengths

def plot_gap_distribution(gap_lengths, output_dir='plots'):
    """
    Plots a bar chart of the gap length distribution and saves it.

    The image is written to '<output_dir>/gap_distribution.png' at 600 dpi.

    Parameters:
    - gap_lengths: pd.Series with the distribution of gap lengths
      (index: gap length, values: number of gaps)
    - output_dir: directory to save the plot (created if it doesn't exist)
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists().
    os.makedirs(output_dir, exist_ok=True)

    # Draw on an explicit figure/axes instead of pyplot's implicit "current" ones.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        gap_lengths.plot(kind='bar', color='skyblue', ax=ax)
        ax.set_xlabel('Gap length (hours)')
        ax.set_ylabel('Number')
        ax.grid(True, axis='y', linestyle='--', alpha=0.7)
        fig.tight_layout()
        fig.savefig(os.path.join(output_dir, 'gap_distribution.png'), dpi=600)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

def analyze_pm2_5_gaps(csv_file='data.csv', column='pm2_5', output_dir='plots'):
    """
    Main script for analyzing gaps in pm2_5 from a CSV file.

    Loads the CSV, prints the gap-length distribution as a table, writes it to
    '<output_dir>/gap_lengths.csv' and saves a bar chart to
    '<output_dir>/gap_distribution.png'. Problems (file not found, unreadable
    file, missing column, no gaps) are reported with a printed message and the
    function returns early without writing anything.

    Parameters:
    - csv_file: path to the CSV file with data
    - column: name of the column to analyze (default 'pm2_5')
    - output_dir: directory to save the results (created if it doesn't exist)

    Returns:
    - None
    """
    # 1. Load data
    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: file '{csv_file}' not found.")
        return
    except Exception as e:
        # Top-level script boundary: report any other read/parse failure and stop.
        print(f"Error loading data: {e}")
        return
    print(f"Data successfully loaded from '{csv_file}'. Shape: {df.shape}")

    # Check for the existence of the column
    if column not in df.columns:
        print(f"Error: column '{column}' is missing in the data.")
        return

    # 2. Gap analysis
    gap_lengths = analyze_gaps(df, column)
    if gap_lengths.empty:
        print("No gaps found in the data.")
        return

    # 3. Print results as a table
    print("\nDistribution of Gap Lengths:")
    print("Gap Length | Count")
    print("-----------|-------")
    for length, count in gap_lengths.items():
        # Widths match the header text so the '|' separators line up.
        print(f"{length:>10} | {count:>5}")

    print(f"\nTotal number of gaps: {gap_lengths.sum()}")
    print(f"Unique gap lengths: {len(gap_lengths)}")

    # 4. Save results to a file
    os.makedirs(output_dir, exist_ok=True)
    table_path = os.path.join(output_dir, 'gap_lengths.csv')
    gap_lengths.to_csv(table_path, header=['Count'], index_label='Gap Length')
    print(f"Results saved to '{table_path}'.")

    # 5. Plot and save the histogram
    plot_gap_distribution(gap_lengths, output_dir)
    plot_path = os.path.join(output_dir, 'gap_distribution.png')
    print(f"Histogram saved to '{plot_path}'.")

def _consecutive_missing_runs(missing_mask):
    """Return the lengths of all runs of consecutive True values, in row order."""
    flags = missing_mask.reset_index(drop=True)
    # A run starts on every True row whose predecessor is not True.
    # astype(bool) keeps `~` a logical NOT even if shift() widened the dtype.
    previous = flags.shift(fill_value=False).astype(bool)
    run_ids = (flags & ~previous).cumsum()
    return flags[flags].groupby(run_ids[flags]).size().tolist()


def _monthly_completeness(dates, missing_mask):
    """Return {'YYYY-MM': completeness in %}, months in order of first appearance."""
    month_labels = dates.dt.strftime('%Y-%m')
    # Rows without a valid date get a NaN label and are dropped by groupby.
    present = (~missing_mask).groupby(month_labels, sort=False)
    return (present.sum() / present.size() * 100).to_dict()


def calculate_dataset_specifications(csv_file='df_data_prepared.csv', column='pm2_5',
                                     short_gap_max=12, medium_gap_max=48):
    """
    Calculates comprehensive statistics about the dataset and its gaps.

    A gap is a run of consecutive rows whose value in `column` is NaN; gap
    lengths are counted in rows and reported as hours, i.e. the data is
    assumed to have one row per hour.

    Parameters:
    - csv_file: path to the CSV file with data
    - column: name of the column to analyze (default 'pm2_5')
    - short_gap_max: longest gap still classified as "short" (default 12)
    - medium_gap_max: longest gap still classified as "medium" (default 48)

    Returns:
    - Dictionary with dataset specifications, or None when the file cannot be
      loaded or `column` is not present. When the file has no usable 'date'
      column, 'start_date'/'end_date' are None, 'total_days'/'total_hours'
      are 0, 'monthly_stats' is empty and the lowest/highest months are
      ('Unknown', 0).
    """
    # Load data
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        # Top-level script boundary: report any read/parse failure and stop.
        print(f"Error loading data: {e}")
        return None
    print(f"Data loaded from '{csv_file}'. Shape: {df.shape}")

    if column not in df.columns:
        print(f"Error: column '{column}' is missing in the data.")
        return None

    # Date information is optional: only use it when the column exists and
    # holds at least one valid timestamp.
    dates = pd.to_datetime(df['date']) if 'date' in df.columns else None
    has_dates = dates is not None and bool(dates.notna().any())

    # Basic dataset information
    if has_dates:
        start_date = dates.min()
        end_date = dates.max()
        total_days = (end_date - start_date).days + 1
        total_hours = total_days * 24
    else:
        start_date = end_date = None
        total_days = total_hours = 0

    # Missing values analysis
    missing_mask = df[column].isna()
    missing_count = missing_mask.sum()
    total_count = len(df)
    if total_count:
        missing_percentage = (missing_count / total_count) * 100
        completeness_percentage = 100 - missing_percentage
    else:
        # An empty dataset has nothing to be complete or missing.
        missing_percentage = completeness_percentage = 0.0

    # Find gaps (consecutive NaN values) and classify them by length
    gaps = _consecutive_missing_runs(missing_mask)
    total_gaps = len(gaps)
    if total_gaps:
        short_count = sum(g <= short_gap_max for g in gaps)
        medium_count = sum(short_gap_max < g <= medium_gap_max for g in gaps)
        long_count = total_gaps - short_count - medium_count

        short_percentage = (short_count / total_gaps) * 100
        medium_percentage = (medium_count / total_gaps) * 100
        long_percentage = (long_count / total_gaps) * 100
        max_gap_length = max(gaps)
    else:
        short_percentage = medium_percentage = long_percentage = 0
        max_gap_length = 0

    # Calculate monthly completeness
    monthly_stats = _monthly_completeness(dates, missing_mask) if has_dates else {}
    if monthly_stats:
        lowest_month = min(monthly_stats.items(), key=lambda item: item[1])
        highest_month = max(monthly_stats.items(), key=lambda item: item[1])
    else:
        lowest_month = highest_month = ('Unknown', 0)

    # Compile results
    specifications = {
        'start_date': start_date,
        'end_date': end_date,
        'total_days': total_days,
        'total_hours': total_hours,
        'total_timestamps': total_count,
        'missing_values': missing_count,
        'missing_percentage': missing_percentage,
        'completeness_percentage': completeness_percentage,
        'total_gaps': total_gaps,
        'short_gaps_percentage': short_percentage,
        'medium_gaps_percentage': medium_percentage,
        'long_gaps_percentage': long_percentage,
        'max_gap_length': max_gap_length,
        'monthly_stats': monthly_stats,
        'lowest_completeness_month': lowest_month,
        'highest_completeness_month': highest_month
    }

    # Print a summary
    if has_dates:
        period = (f"{start_date.strftime('%Y-%m-%d')} to "
                  f"{end_date.strftime('%Y-%m-%d')} ({total_days} days)")
    else:
        period = "Unknown (no usable 'date' column)"

    print("\n=== DATASET SPECIFICATIONS ===")
    print(f"Time period: {period}")
    print(f"Total hourly timestamps: {total_count}")
    print(f"Missing values: {missing_count} ({missing_percentage:.1f}%)")
    print(f"Overall completeness: {completeness_percentage:.1f}%")
    print(f"Total number of gaps: {total_gaps}")
    print(f"Gap distribution: {short_percentage:.1f}% short (≤{short_gap_max}h), "
          f"{medium_percentage:.1f}% medium ({short_gap_max + 1}-{medium_gap_max}h), "
          f"{long_percentage:.1f}% long (>{medium_gap_max}h)")
    print(f"Longest gap: {max_gap_length} hours")
    print(f"Month with lowest completeness: {lowest_month[0]} ({lowest_month[1]:.1f}%)")
    print(f"Month with highest completeness: {highest_month[0]} ({highest_month[1]:.1f}%)")

    return specifications

if __name__ == "__main__":
    # Step 1: gap-length table, CSV export and bar chart for the prepared dataset.
    analyze_pm2_5_gaps('df_data_prepared.csv', 'pm2_5', 'output_diagrams')

    # Step 2: overall dataset/gap statistics for the same file; the resulting
    # dictionary is kept in `specs`.
    specs = calculate_dataset_specifications('df_data_prepared.csv', 'pm2_5')


Data successfully loaded from 'df_data_prepared.csv'. Shape: (5791, 12)

Distribution of Gap Lengths:
Gap Length | Count
-----------|-------
          3 |         35
          4 |         23
          5 |         24
          6 |         13
          7 |          9
          8 |          6
          9 |          9
         10 |          9
         11 |          4
         12 |          3
         13 |          3
         14 |          4
         15 |          4
         17 |          1
         19 |          1
         20 |          1
         21 |          2
         22 |          1
         25 |          1
         35 |          1
         48 |          1
         58 |          1
         67 |          1
         90 |          1
        191 |          1

Total number of gaps: 159
Unique gap lengths: 25
Results saved to 'output_diagrams/gap_lengths.csv'.
Histogram saved to 'output_diagrams/gap_distribution.png'.
Data loaded from 'df_data_prepared.csv'. Shape: (5791, 12)

=== DATASET S