In [None]:
import pandas as pd

from constants import DATAFRAMES

# Data Cleaning

The goal is to fix or handle data quality issues identified in the validation stage.

NOTE: From the data cleaning stage onwards, I will work only with `clients.csv`, `trader.csv` and `trans.csv`. The `a2p_ref.csv` and `partner_codes.csv` files are better suited to partner attrition analysis, which is not part of this assignment.

In [None]:
# Baseline: show how many records each loaded frame holds before any cleaning.
display('Initial record counts:')
for name, df in DATAFRAMES.items():
    n_records = len(df)
    display(f'{name}: {n_records}')

## 1. Filter Invalid Transaction Amounts

In [None]:
# Preview the transactions that the cleaning step will drop.
# The cleaning step keeps only rows where `amount > 0`; a missing amount
# compares False against both `> 0` and `<= 0`, so a plain `amount <= 0`
# filter would under-report what actually gets removed. Flag missing
# amounts explicitly so this preview is the exact complement of the rows kept.
trans_amount = DATAFRAMES['df_trans'].amount
invalid_amount_mask = trans_amount.le(0) | trans_amount.isna()
invalid_amounts = DATAFRAMES['df_trans'][invalid_amount_mask]
display(f"Found {len(invalid_amounts)} transactions with amount <= 0 or missing")
display(invalid_amounts.head())

In [None]:
# Keep only transactions with a strictly positive amount; `.copy()` gives an
# independent frame so later edits do not touch the original in DATAFRAMES.
trans_all = DATAFRAMES['df_trans']
positive_amount_mask = trans_all.amount > 0
df_trans_clean = trans_all.loc[positive_amount_mask].copy()

n_trans_removed = len(trans_all) - len(df_trans_clean)
display(f"Removed {n_trans_removed} transactions with invalid amounts")
display(f"Remaining transactions: {len(df_trans_clean)}")

## 2. Filter Invalid Registration Types

In [None]:
# Frequency of each value in the clients `type` column, to spot unexpected ones.
display("Registration types found:")
registration_type_counts = DATAFRAMES['df_clients'].type.value_counts()
display(registration_type_counts)

In [None]:
# Registration types treated as valid; any other value (including a missing
# one, which `isin` reports as not present) is counted as invalid.
valid_types = ['Full', 'Light', 'Demo']

clients_all = DATAFRAMES['df_clients']
has_valid_type = clients_all.type.isin(valid_types)
invalid_types = clients_all[~has_valid_type]
display(f"Found {len(invalid_types)} clients with invalid registration types")

In [None]:
# Keep only clients whose `type` is in `valid_types`; `.copy()` gives an
# independent frame so later edits do not touch the original in DATAFRAMES.
clients_source = DATAFRAMES['df_clients']
valid_type_mask = clients_source.type.isin(valid_types)
df_clients_clean = clients_source.loc[valid_type_mask].copy()

n_clients_removed = len(clients_source) - len(df_clients_clean)
display(f"Removed {n_clients_removed} clients with invalid registration types")
display(f"Remaining clients: {len(df_clients_clean)}")