In [10]:
import os
import pandas as pd
import logging

In [11]:
# Send log records of level INFO and above to 'data_processing.log', each line
# formatted as "<timestamp> - <level name> - <message>".
# Note: basicConfig does nothing if the root logger already has handlers.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='data_processing.log',
)

In [12]:
# Define folder paths (relative paths, resolved against the current working directory)
raw_data_folder = 'raw_data'
baseline_data_folder = 'baseline_data'
output_folder = 'processed_data'

# Ensure the output folder exists (no error if it is already there)
os.makedirs(output_folder, exist_ok=True)

# List all raw data files.
# os.listdir returns entries in arbitrary order, so sort the names to make the
# processing order (and the resulting log) reproducible between runs.
raw_files = sorted(f for f in os.listdir(raw_data_folder) if f.endswith('.csv'))

In [13]:
# Process each raw data file: subtract the column-wise mean of its matching
# baseline file and write the adjusted table to the output folder.
# Files that cannot be processed are logged and skipped; the loop never aborts.
for raw_file in raw_files:
    try:
        # A raw file "<name>.csv" is paired with "<name>_baseline.csv"
        base_name = os.path.splitext(raw_file)[0]
        baseline_file = f"{base_name}_baseline.csv"

        # Paths for raw and baseline files
        raw_file_path = os.path.join(raw_data_folder, raw_file)
        baseline_file_path = os.path.join(baseline_data_folder, baseline_file)

        # Check if the baseline file exists
        if not os.path.exists(baseline_file_path):
            logging.warning(f"Baseline file missing for {raw_file}. Skipping.")
            continue

        # Load raw and baseline data.
        # A file with no content at all makes read_csv raise EmptyDataError
        # before the `.empty` check below can run, so treat it the same way.
        try:
            raw_data = pd.read_csv(raw_file_path)
            baseline_data = pd.read_csv(baseline_file_path)
        except pd.errors.EmptyDataError:
            logging.warning(f"File is empty: {raw_file} or {baseline_file}. Skipping.")
            continue

        # Ensure data integrity
        if raw_data.empty or baseline_data.empty:
            logging.warning(f"File is empty: {raw_file} or {baseline_file}. Skipping.")
            continue
        # Compare the labels as plain lists: an element-wise `==` between two
        # Index objects of different lengths raises ValueError instead of
        # returning False, which would bypass this warning.
        if list(raw_data.columns) != list(baseline_data.columns):
            logging.warning(f"Column mismatch between {raw_file} and {baseline_file}. Skipping.")
            continue

        # Compute column-wise mean of baseline data
        baseline_mean = baseline_data.mean()

        # Subtract baseline mean from raw data (the Series is aligned on column labels)
        adjusted_data = raw_data - baseline_mean

        # Save the adjusted data to the output folder under the raw file's name
        output_file_path = os.path.join(output_folder, raw_file)
        adjusted_data.to_csv(output_file_path, index=False)
        logging.info(f"Processed and saved: {output_file_path}")

    except Exception as e:
        # Log any unexpected error together with its traceback, then move on
        # to the next file.
        logging.exception(f"Error processing {raw_file}: {e}")

print("All files processed. Check 'data_processing.log' for details.")

All files processed. Check 'data_processing.log' for details.


In [14]:
# Column-wise mean of `baseline_data`, i.e. whichever baseline table the
# processing loop in the previous cell loaded last. This cell relies on that
# leftover loop variable and fails with NameError if no baseline was loaded.
baseline_data.mean(axis=0)

MQ2       35.212245
MQ3      228.822449
MQ4       24.602041
MQ5       16.284694
MQ6       25.991837
MQ7       99.993878
MQ8      132.007143
MQ9       87.785714
MQ135     21.271429
MQ137    173.456122
dtype: float64