In [2]:
import pandas as pd

from constants import DATAFRAMES

# Data Cleaning

The goal is to fix or handle data quality issues identified in the validation stage.

NOTE: From the data cleaning stage onwards, I will work only with `clients.csv`, `trader.csv` and `trans.csv`. `a2p_ref.csv` and `partner_codes.csv` are better suited to partner attrition analysis, which is not part of this assignment.

In [3]:
# Baseline row count of every loaded frame, shown before any cleaning step runs.
display('Initial record counts:')
for frame_name, frame in DATAFRAMES.items():
    row_count = len(frame)
    display(f'{frame_name}: {row_count}')

'Initial record counts:'

'df_trans: 7855'

'df_trader: 7804'

'df_clients: 5327'

'df_partner_codes: 1592'

'df_a2p_ref: 7733'

## 1. Filter Invalid Transaction Amounts

In [4]:
# Inspect transactions whose amount is zero or negative before filtering them out.
trans_raw = DATAFRAMES['df_trans']
non_positive_amount = trans_raw['amount'].le(0)
invalid_amounts = trans_raw.loc[non_positive_amount]
display(f"Found {len(invalid_amounts)} transactions with amount <= 0")
display(invalid_amounts.head())

'Found 1 transactions with amount <= 0'

Unnamed: 0,login,created_at,transaction_id,amount,currency
1523,6085899,2025-01-31 11:25:00,17046055,-10000.0,JPY


In [5]:
# Keep strictly positive amounts only. The comparison is False for NaN, so rows
# with a missing amount are dropped by this filter as well.
trans_raw = DATAFRAMES['df_trans']
df_trans_clean = trans_raw.loc[trans_raw['amount'].gt(0)].copy()
n_removed_amounts = len(trans_raw) - len(df_trans_clean)
display(f"Removed {n_removed_amounts} transactions with invalid amounts")
display(f"Remaining transactions: {len(df_trans_clean)}")

'Removed 1 transactions with invalid amounts'

'Remaining transactions: 7854'

## 2. Filter Invalid Registration Types

In [6]:
# Frequency of each registration type in the raw clients table.
display("Registration types found:")
type_counts = DATAFRAMES['df_clients']['type'].value_counts()
display(type_counts)

'Registration types found:'

type
Full     4530
Demo      431
Light     364
0           2
Name: count, dtype: int64

In [7]:
# Registration types accepted by the cleaning step; anything else is treated as invalid.
valid_types = ['Full', 'Light', 'Demo']
clients_raw = DATAFRAMES['df_clients']
has_valid_type = clients_raw['type'].isin(valid_types)
invalid_types = clients_raw.loc[~has_valid_type]
display(f"Found {len(invalid_types)} clients with invalid registration types")

'Found 2 clients with invalid registration types'

In [8]:
# Build the cleaned clients frame: only rows whose type is one of `valid_types`.
clients_raw = DATAFRAMES['df_clients']
df_clients_clean = clients_raw.loc[clients_raw['type'].isin(valid_types)].copy()
n_removed_clients = len(clients_raw) - len(df_clients_clean)
display(f"Removed {n_removed_clients} clients with invalid registration types")
display(f"Remaining clients: {len(df_clients_clean)}")

'Removed 2 clients with invalid registration types'

'Remaining clients: 5325'

## 3. Remove Orphaned Records - Referential Integrity

### 3.1 Remove traders with non-existent clients

In [9]:
# Trader rows whose `client` value is absent from the cleaned clients table.
trader_raw = DATAFRAMES['df_trader']
client_exists = trader_raw['client'].isin(df_clients_clean['client'])
orphaned_traders = trader_raw.loc[~client_exists]
orphaned_trader_pct = len(orphaned_traders) / len(trader_raw) * 100
display(f"Found {len(orphaned_traders)} trader records with non-existent clients")
display(f"Percentage: {orphaned_trader_pct:.2f}%")

'Found 40 trader records with non-existent clients'

'Percentage: 0.51%'

In [10]:
# Build the cleaned trader frame: keep only rows that reference a surviving client.
trader_raw = DATAFRAMES['df_trader']
client_exists = trader_raw['client'].isin(df_clients_clean['client'])
df_trader_clean = trader_raw.loc[client_exists].copy()
n_removed_traders = len(trader_raw) - len(df_trader_clean)
display(f"Removed {n_removed_traders} orphaned trader records")
display(f"Remaining traders: {len(df_trader_clean)}")

'Removed 40 orphaned trader records'

'Remaining traders: 7764'

### 3.2 Remove transactions with non-existent logins

In [11]:
# Transactions whose `login` has no matching row in the cleaned trader table.
login_exists = df_trans_clean['login'].isin(df_trader_clean['login'])
orphaned_trans = df_trans_clean.loc[~login_exists]
orphaned_trans_pct = len(orphaned_trans) / len(df_trans_clean) * 100
display(f"Found {len(orphaned_trans)} transactions with non-existent logins")
display(f"Percentage: {orphaned_trans_pct:.2f}%")

'Found 91 transactions with non-existent logins'

'Percentage: 1.16%'

In [12]:
# Drop transactions whose login is not present in the cleaned trader table.
# The removed count is derived from this cell's own before/after lengths rather
# than from a variable computed in another cell, so the message stays truthful
# when the cell is re-run or executed without the preceding inspection cell.
n_trans_before = len(df_trans_clean)
df_trans_clean = df_trans_clean[df_trans_clean.login.isin(df_trader_clean.login)].copy()
n_trans_removed = n_trans_before - len(df_trans_clean)
display(f"Removed {n_trans_removed} orphaned transaction records")
display(f"Remaining transactions: {len(df_trans_clean)}")

'Removed 91 orphaned transaction records'

'Remaining transactions: 7763'

### 3.3 Verify first_deposit_id references

In [13]:
# A first_deposit_id is invalid when it is non-null yet matches no transaction_id
# among the cleaned transactions.
has_deposit_ref = df_trader_clean['first_deposit_id'].notna()
deposit_ref_resolves = df_trader_clean['first_deposit_id'].isin(df_trans_clean['transaction_id'])
invalid_first_deposits = df_trader_clean.loc[has_deposit_ref & ~deposit_ref_resolves]
display(f"Found {len(invalid_first_deposits)} traders with invalid first_deposit_id references")


'Found 1 traders with invalid first_deposit_id references'

In [14]:
# Null out first_deposit_id on the traders flagged as having a dangling reference.
if len(invalid_first_deposits) > 0:
    display("Setting invalid first_deposit_id to null")
    # invalid_first_deposits was selected from df_trader_clean by boolean mask, so
    # its index labels identify exactly the offending rows. Selecting by label
    # works for any number of flagged rows (comparing the two frames' `.values`
    # arrays only works when a single row is flagged) and also covers rows whose
    # account_id is missing, which an equality comparison can never match.
    df_trader_clean.loc[invalid_first_deposits.index, 'first_deposit_id'] = None
    display(f"Set {len(invalid_first_deposits)} first_deposit_id values to null")

'Setting invalid first_deposit_id to null'

'Set 1 first_deposit_id values to null'

## 4. Data Type Conversions

In [15]:
# Cast the trader ID columns to pandas' nullable integer dtype so that missing
# IDs are kept as <NA> while present IDs are stored as integers.
id_columns = ['account_id', 'first_deposit_id']

# Convert the cleaned trader frame built by the earlier sections; without this
# only the raw frame is converted and the cleaned one keeps its original dtypes.
# `astype` returns a new frame, so re-running this cell is safe.
df_trader_clean = df_trader_clean.astype({id_col: 'Int64' for id_col in id_columns})

# The raw frame is still converted in place, so code that reads
# DATAFRAMES['df_trader'] after this cell keeps seeing Int64 IDs.
for col in id_columns:
    DATAFRAMES['df_trader'][col] = DATAFRAMES['df_trader'][col].astype('Int64')

In [17]:
# Check the column dtypes after conversion. DataFrame.info() prints its summary
# itself and returns None, so it is called as a plain statement; wrapping it in
# display() would render an extra "None" underneath the summary.
DATAFRAMES['df_trader'].info()
display(DATAFRAMES['df_trader'].head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7804 entries, 0 to 7803
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   login             7804 non-null   int64
 1   client            7804 non-null   int64
 2   account_id        7611 non-null   Int64
 3   first_deposit_id  7740 non-null   Int64
dtypes: Int64(2), int64(2)
memory usage: 259.2 KB


None

Unnamed: 0,login,client,account_id,first_deposit_id
0,3639642,25831,570086139,37032272
1,3648174,75264,702559253,37112579
2,3650697,89278,717548133,35428843
3,3651651,52437,733441170,35186270
4,3658908,32725,820931437,36992110
