In [229]:
# Cell 5: Clean & Validate the Transaction Amount Column
# Explanation:
# - This block ensures the 'transaction_amount' column is numeric.
# - If there are invalid entries (e.g., text, symbols), they are converted to NaN.
# - We fill missing or invalid values with the **median**, which is more robust than mean.

def clean_transaction_data(df: pd.DataFrame):
    # Convert to numeric, forcing errors to NaN
    df['transaction_amount'] = pd.to_numeric(df['transaction_amount'], errors='coerce')

    # Log the number of problematic entries
    missing_count = df['transaction_amount'].isnull().sum()
    if missing_count > 0:
        median_val = df['transaction_amount'].median()
        logging.warning(f"{missing_count} invalid or missing amounts. Filling with median: {median_val}")
        df['transaction_amount'].fillna(median_val, inplace=True)
    else:
        logging.info("No missing values in 'transaction_amount'.")

    return df

# Apply cleaning
df_cleaned = clean_transaction_data(df)

# Describe the cleaned data
df_cleaned.describe()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['transaction_amount'].fillna(median_val, inplace=True)


Unnamed: 0.1,Unnamed: 0,Transaction_ID,Company_ID,Product_ID,Quantity,Product_Price,Total_Cost,transaction_amount,transaction_date
count,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,9000.0,0.0,10000
mean,4994.049111,5005.177556,50.583556,10.446778,10.528778,134652.802537,1424246.0,,2025-07-14 00:00:00
min,0.0,1.0,1.0,1.0,0.0,75613.362923,84000.0,,2025-07-14 00:00:00
25%,2501.75,2491.75,25.0,5.0,6.0,100686.369472,672000.0,,2025-07-14 00:00:00
50%,4997.5,5005.0,50.0,10.0,11.0,131297.783516,1344000.0,,2025-07-14 00:00:00
75%,7477.25,7527.25,76.0,15.0,16.0,162221.485713,1965600.0,,2025-07-14 00:00:00
max,9999.0,9999.0,100.0,20.0,21.0,246279.050335,4480000.0,,2025-07-14 00:00:00
std,2885.331476,2894.971964,28.90087,5.768341,5.808062,39058.106978,908627.4,,
