Title: Introduction to Data Aggregation
<br>
Objective: Understand the basic concepts of data aggregation and practice simple aggregation methods.

Task 1: Calculating Sum
<br>
Task: Use a numerical dataset containing columns like 'sales', 'profit', and 'quantity'.
Calculate the total sales for the dataset.
<br>
Steps:<br>
1. Load the dataset using pandas.<br>
2. Apply the .sum() function on the 'sales' column.<br>
3. Verify the result by manually summing a portion of the 'sales' values.

In [None]:
# Task 1: Calculating Sum
# This script calculates the total sales from a dataset

import pandas as pd
import numpy as np

def calculate_total_sales(file_path):
    """
    Calculate the total sales from a dataset.

    Loads the CSV file, prints a short overview of it, sums the 'sales'
    column, and then re-adds the first (up to) 10 values by hand as a
    sanity check on the aggregated figure.

    Parameters:
    file_path (str): Path to the dataset file

    Returns:
    float: Total sales value, or None if the file was not found, the
    'sales' column is missing, or any other error occurred (the error is
    printed rather than raised).
    """
    try:
        # Step 1: Load the dataset using pandas
        print("Loading dataset from:", file_path)
        df = pd.read_csv(file_path)

        # Print dataset info
        row_count, column_count = df.shape
        print("\nDataset Information:")
        print(f"Number of rows: {row_count}")
        print(f"Number of columns: {column_count}")
        print("\nColumn names:", df.columns.tolist())

        # Print first few rows
        print("\nFirst 5 rows of the dataset:")
        print(df.head())

        # Check if 'sales' column exists
        if 'sales' not in df.columns:
            raise ValueError("The dataset does not contain a 'sales' column")

        sales = df['sales']

        # Step 2: Apply the .sum() function on the 'sales' column
        total_sales = sales.sum()
        print(f"\nTotal sales calculated using .sum(): ${total_sales:.2f}")

        # Step 3: Verify the result by manually summing a portion of the
        # 'sales' values. head(10) returns fewer rows when the dataset is
        # shorter, so no explicit min() against the length is needed.
        sample = sales.head(10)
        sample_size = len(sample)
        manual_sum = 0

        print(f"\nVerifying calculation with first {sample_size} rows:")
        for row_number, value in enumerate(sample, start=1):
            print(f"Row {row_number}: ${value:.2f}")
            manual_sum += value

        print(f"\nManual sum of first {sample_size} rows: ${manual_sum:.2f}")

        # A zero total (all-zero or empty data) would make the ratio
        # undefined; report that instead of dividing by zero, so the
        # computed total is still returned.
        if total_sales != 0:
            print(f"Percentage of total: {(manual_sum / total_sales) * 100:.2f}%")
        else:
            print("Percentage of total: n/a (total sales is zero)")

        return total_sales

    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return None
    except Exception as e:
        # Best-effort tutorial script: report the problem and signal
        # failure with None rather than propagating the exception.
        print(f"An error occurred: {str(e)}")
        return None

if __name__ == "__main__":
    import os

    # Point this at your own dataset if you have one
    file_path = "data/sales_data.csv"

    # Probe the file by opening it; a missing file triggers generation of
    # a demonstration dataset at the same location.
    try:
        open(file_path, 'r').close()
    except FileNotFoundError:
        print(f"File {file_path} not found. Creating a sample dataset for demonstration.")

        # Make sure the target folder is there before writing
        target_dir = os.path.dirname(file_path)
        os.makedirs(target_dir, exist_ok=True)

        # Fixed seed so the generated numbers are the same on every run
        np.random.seed(42)
        num_rows = 100
        row_numbers = range(1, num_rows + 1)

        # Random draws happen in this order: sales, profit, quantity
        sales_values = np.random.uniform(10, 1000, num_rows).round(2)
        profit_values = np.random.uniform(-100, 500, num_rows).round(2)
        quantity_values = np.random.randint(1, 50, num_rows)

        sample_df = pd.DataFrame({
            'product_id': [f'P{n:03d}' for n in row_numbers],
            'product_name': [f'Product {n}' for n in row_numbers],
            'sales': sales_values,
            'profit': profit_values,
            'quantity': quantity_values,
        })
        sample_df.to_csv(file_path, index=False)
        print(f"Sample dataset created at {file_path}")

    # Run the aggregation and report the outcome if it succeeded
    total_sales = calculate_total_sales(file_path)
    if total_sales is not None:
        print(f"\nTask Completed: Total sales for the dataset is ${total_sales:.2f}")

Task 2: Calculating Mean<br>

Task: Calculate the average quantity sold.<br>
Steps:<br>
4. Load the dataset.<br>
5. Use the .mean() function on the 'quantity' column to find the average.<br>
6. Double-check by calculating the mean manually on a small selection.

In [None]:
import pandas as pd
import os

# Step 4: Load the dataset
# Collect the CSV files in the current working directory once.
# os.listdir() returns entries in arbitrary order, so the names are sorted
# to make the choice of the "first" file reproducible between runs.
csv_files = sorted(f for f in os.listdir() if f.endswith('.csv'))

# List available files
print("Available files:")
for file in csv_files:
    print(f"- {file}")

# Use the first CSV file found or specify the filename
if csv_files:
    filename = csv_files[0]
    print(f"\nUsing file: {filename}")
    df = pd.read_csv(filename)

    if 'quantity' not in df.columns:
        # Report the problem clearly instead of failing with a bare KeyError
        print(f"\nThe file {filename} does not contain a 'quantity' column.")
        print("Column names:", df.columns.tolist())
    else:
        # Step 5: Use the .mean() function on the 'quantity' column to find the average
        average_quantity = df['quantity'].mean()
        print(f"Average quantity (using mean() function): {average_quantity}")

        # Step 6: Double-check by calculating the mean manually on a small selection
        sample = df['quantity'].head()  # Get first 5 rows (fewer if the file is shorter)
        if len(sample) > 0:
            manual_average = sum(sample) / len(sample)
            print("\nManual verification:")
            print(f"Sample values: {list(sample)}")
            print(f"Sum of values: {sum(sample)}")
            print(f"Count of values: {len(sample)}")
            print(f"Manual average: {manual_average}")
        else:
            # A header-only file has no values to average; skip the
            # manual check rather than dividing by zero.
            print("\nManual verification skipped: the dataset has no rows.")
else:
    print("\nNo CSV files found in the current directory.")
    print("Please place a CSV file with a 'quantity' column in the current directory.")