In [None]:
import pandas as pd

from constants import DATAFRAMES

# Data Cleaning

The goal is to fix or handle data quality issues identified in the validation stage.

NOTE: From the data cleaning stage onwards, I will work only with `clients.csv`, `trader.csv` and `trans.csv`. The `a2p_ref.csv` and `partner_codes.csv` files are better suited to partner attrition analysis, which is not part of the assignment.

In [None]:
# Baseline row count for every entry in DATAFRAMES, shown before any cleaning
# so the "Removed ..." figures further down can be read against it.
# The header is a plain string: it has no placeholders, so no f-prefix is needed.
display('Initial record counts:')
for name, df in DATAFRAMES.items():
    display(f'{name}: {len(df)}')

## 1. Filter Invalid Transaction Amounts

In [None]:
# Pull out the transactions whose amount is zero or negative so they can be
# inspected before they are dropped in the next cell.
invalid_amounts = DATAFRAMES['df_trans'].loc[lambda trans: trans['amount'] <= 0]
display(f"Found {len(invalid_amounts)} transactions with amount <= 0")
# Preview only the first rows; the full subset may be large.
display(invalid_amounts.head())

In [None]:
# Keep strictly positive amounts only. The result is copied so later edits do
# not touch the raw frame held in DATAFRAMES. Rows whose amount is missing fail
# the `> 0` comparison and are therefore dropped here as well.
df_trans_clean = DATAFRAMES['df_trans'].loc[lambda trans: trans['amount'] > 0].copy()
display(f"Removed {len(DATAFRAMES['df_trans']) - len(df_trans_clean)} transactions with invalid amounts")
display(f"Remaining transactions: {len(df_trans_clean)}")

## 2. Filter Invalid Registration Types

In [None]:
# Frequency of each value in the clients `type` column, used to spot values
# outside the expected set of registration types.
display("Registration types found:")
display(DATAFRAMES['df_clients']['type'].value_counts())

In [None]:
# Registration types treated as valid; any other value (including a missing
# one, which `isin` does not match) is counted as invalid.
valid_types = ['Full', 'Light', 'Demo']
invalid_types = DATAFRAMES['df_clients'].loc[
    lambda clients: ~clients['type'].isin(valid_types)
]
display(f"Found {len(invalid_types)} clients with invalid registration types")

In [None]:
# Keep only clients whose registration type is in `valid_types` (defined in the
# previous cell); copy so the raw frame in DATAFRAMES is left untouched.
df_clients_clean = DATAFRAMES['df_clients'].loc[
    lambda clients: clients['type'].isin(valid_types)
].copy()
display(f"Removed {len(DATAFRAMES['df_clients']) - len(df_clients_clean)} clients with invalid registration types")
display(f"Remaining clients: {len(df_clients_clean)}")

## 3. Remove Orphaned Records - Referential Integrity

### 3.1 Remove traders with non-existent clients

In [None]:
# Trader rows whose client id is absent from the cleaned clients table. This
# also catches traders of clients dropped earlier for an invalid registration type.
orphaned_traders = DATAFRAMES['df_trader'][~DATAFRAMES['df_trader'].client.isin(df_clients_clean.client)]
display(f"Found {len(orphaned_traders)} trader records with non-existent clients")
# max(..., 1) keeps the share defined (0.00%) when the trader table is empty,
# instead of raising ZeroDivisionError; for a non-empty table the value is unchanged.
display(f"Percentage: {len(orphaned_traders) / max(len(DATAFRAMES['df_trader']), 1) * 100:.2f}%")

In [None]:
# Keep only traders whose client exists in the cleaned clients table; copy so
# later in-place assignments do not write back into the raw frame.
df_trader_clean = DATAFRAMES['df_trader'].loc[
    lambda traders: traders['client'].isin(df_clients_clean['client'])
].copy()
display(f"Removed {len(DATAFRAMES['df_trader']) - len(df_trader_clean)} orphaned trader records")
display(f"Remaining traders: {len(df_trader_clean)}")

### 3.2 Remove transactions with non-existent logins

In [None]:
# Transactions whose login does not appear in the cleaned trader table.
orphaned_trans = df_trans_clean[~df_trans_clean.login.isin(df_trader_clean.login)]
display(f"Found {len(orphaned_trans)} transactions with non-existent logins")
# max(..., 1) keeps the share defined (0.00%) if no transactions survived the
# amount filter, instead of raising ZeroDivisionError; otherwise the value is unchanged.
display(f"Percentage: {len(orphaned_trans) / max(len(df_trans_clean), 1) * 100:.2f}%")

In [None]:
# Narrow the cleaned transactions to those whose login exists in the cleaned
# trader table. The removed count is taken from `orphaned_trans`, computed in
# the previous cell.
df_trans_clean = df_trans_clean.loc[
    lambda trans: trans['login'].isin(df_trader_clean['login'])
].copy()
display(f"Removed {len(orphaned_trans)} orphaned transaction records")
display(f"Remaining transactions: {len(df_trans_clean)}")

### 3.3 Verify first_deposit_id references

In [None]:
# Traders that carry a first_deposit_id which no longer matches any
# transaction_id in the cleaned transactions. Rows with a missing
# first_deposit_id are not counted as invalid.
invalid_first_deposits = df_trader_clean.loc[
    lambda traders: traders['first_deposit_id'].notna()
    & ~traders['first_deposit_id'].isin(df_trans_clean['transaction_id'])
]
display(f"Found {len(invalid_first_deposits)} traders with invalid first_deposit_id references")


In [None]:
# Null out first_deposit_id values that point at a transaction which is no
# longer present in the cleaned transactions.
#
# The mask is rebuilt here as a boolean Series aligned to df_trader_clean.
# Comparing the `.values` arrays of the (shorter) invalid subset against the
# full trader frame with `==` is a positional comparison of differently sized
# arrays: it raises on recent NumPy versions and never matches rows by key.
invalid_first_deposit_mask = (
    df_trader_clean.first_deposit_id.notna()
    & ~df_trader_clean.first_deposit_id.isin(df_trans_clean.transaction_id)
)
invalid_first_deposit_count = int(invalid_first_deposit_mask.sum())

if invalid_first_deposit_count > 0:
    display("Setting invalid first_deposit_id to null")
    # .loc with a boolean mask updates the frame in place and avoids chained assignment.
    df_trader_clean.loc[invalid_first_deposit_mask, 'first_deposit_id'] = None
    display(f"Set {invalid_first_deposit_count} first_deposit_id values to null")