In [1]:
# Import necessary libraries
import logging
import pandas as pd
import numpy as np

In [2]:
# Step 1: Configure Logging
# Set up logging configuration to capture errors in a log file and console
logging.basicConfig(
    level=logging.INFO,  # Set to INFO; adjust to ERROR or WARNING as needed
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("preprocessing_errors.log"),  # Log file
        logging.StreamHandler()  # Console output
    ]
)

In [3]:

# Function to log and raise errors
def log_error(message, raise_exception=False):
    logging.error(message)
    if raise_exception:
        raise ValueError(message)

In [4]:
# Step 2: Sample Data Loading and Preprocessing
def load_data():
    # Load sample data (for demonstration purposes, we create a small DataFrame)
    data = {
        'A': [1, 2, np.nan, 4],
        'B': ['x', 'y', 'z', None],
        'C': [10, 20, 30, 40]
    }
    df = pd.DataFrame(data)
    logging.info("Data loaded successfully.")
    return df

def preprocess_data(df):
    try:
        # Check for missing values and log them
        if df.isnull().values.any():
            missing_info = df.isnull().sum().to_dict()
            log_error(f"Missing values found: {missing_info}")

        # Attempt to convert all columns to integers, logging any errors
        for col in df.columns:
            if df[col].dtype == 'object':
                continue  # Skip non-numeric columns
            try:
                df[col] = df[col].astype(int)
            except ValueError as e:
                log_error(f"Conversion error in column '{col}': {e}")

        # Simulate another preprocessing step
        # Example: Check if column A values are within a specific range
        if not ((df['A'] >= 1) & (df['A'] <= 5)).all():
            log_error("Values in column 'A' are out of the expected range (1-5)")

        logging.info("Data preprocessing completed successfully.")
    except Exception as e:
        logging.critical(f"An unexpected error occurred during preprocessing: {e}")
        raise e


In [5]:

# Step 3: Execute the Loading and Preprocessing
try:
    df = load_data()  # Load data
    preprocess_data(df)  # Run preprocessing with error logging
except Exception as e:
    logging.critical(f"Pipeline halted due to an error: {e}")

# Display the resulting DataFrame
df

2024-11-04 10:25:10,288 - INFO - Data loaded successfully.
2024-11-04 10:25:10,300 - ERROR - Missing values found: {'A': 1, 'B': 1, 'C': 0}
2024-11-04 10:25:10,304 - ERROR - Conversion error in column 'A': Cannot convert non-finite values (NA or inf) to integer
2024-11-04 10:25:10,310 - ERROR - Values in column 'A' are out of the expected range (1-5)
2024-11-04 10:25:10,313 - INFO - Data preprocessing completed successfully.


Unnamed: 0,A,B,C
0,1.0,x,10
1,2.0,y,20
2,,z,30
3,4.0,,40
