In [1]:
import pandas as pd

from functions import check_expected_columns, check_dataframe_info, verify_uniqueness_constraints
from constants import DATAFRAMES, EXPECTED_COLUMNS 

# Data validation

The goal is to ensure data integrity and catch issues early.

In [2]:
# Show a labelled preview (first rows) of every table registered in DATAFRAMES.
for table_name in DATAFRAMES:
    display(f'Preview of {table_name}:')
    display(DATAFRAMES[table_name].head())

'Preview of df_trans:'

Unnamed: 0,login,created_at,transaction_id,amount,currency
0,27061833,2025-02-27 16:24:00,13203755,10000.0,JPY
1,6087675,2025-02-28 16:04:00,17434895,100000.0,JPY
2,5471727,2025-02-28 16:16:00,14728768,30000.0,JPY
3,6634899,2025-02-28 16:29:00,6104013,30000.0,JPY
4,6629259,2025-02-27 21:31:00,6095829,50000.0,JPY


'Preview of df_trader:'

Unnamed: 0,login,client,account_id,first_deposit_id
0,3639642,25831,570086139.0,37032272.0
1,3648174,75264,702559253.0,37112579.0
2,3650697,89278,717548133.0,35428843.0
3,3651651,52437,733441170.0,35186270.0
4,3658908,32725,820931437.0,36992110.0


'Preview of df_clients:'

Unnamed: 0,client,client_id,_created_on,type
0,72553,309881572,2015-08-17 07:33:01,Full
1,19159,310010888,2015-08-17 13:01:16,Full
2,52039,310010894,2015-08-17 13:01:17,Full
3,91637,310010897,2015-08-17 13:01:17,Full
4,31745,310010905,2015-08-17 13:01:17,Full


'Preview of df_partner_codes:'

Unnamed: 0,partner_code_id,code,_created_on
0,1353870337,19624,2020-02-27 06:23:07
1,2030097110,20057,2022-03-08 14:22:59
2,3088246029,20823,2025-05-19 07:49:20
3,3160790024,20864,2025-09-09 08:44:37
4,3201231431,20879,2025-11-17 08:10:55


'Preview of df_a2p_ref:'

Unnamed: 0,account_id,ref_id,partner_code_id
0,570086139,167481607,346849860.0
1,570086139,167481608,
2,570086139,167481609,
3,702559253,167497004,308750404.0
4,702559253,167497005,


### Check for expected columns

In [3]:
# Run the expected-columns check for every table, using the column list registered
# for that table in EXPECTED_COLUMNS, and show what the helper returns.
for table_name, table in DATAFRAMES.items():
    required_columns = EXPECTED_COLUMNS[table_name]
    columns_check_result = check_expected_columns(table, table_name, required_columns)
    display(columns_check_result)

'Are all expected columns in df_trans?'

True

'Are all expected columns in df_trader?'

True

'Are all expected columns in df_clients?'

True

'Are all expected columns in df_partner_codes?'

False

'Are all expected columns in df_a2p_ref?'

True

### Verify data types

In [4]:
# Print the dtype / non-null summary of every table.
# The helper is called for its printed output only: its recorded return value is None
# for every table, so passing it to display() just appended a stray "None" to each summary.
for name, df in DATAFRAMES.items():
    check_dataframe_info(df, name)

'Data types of df_trans:'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7855 entries, 0 to 7854
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   login           7855 non-null   int64         
 1   created_at      7855 non-null   datetime64[ns]
 2   transaction_id  7855 non-null   int64         
 3   amount          7855 non-null   float64       
 4   currency        7855 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 307.0+ KB


None

'Data types of df_trader:'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7804 entries, 0 to 7803
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   login             7804 non-null   int64  
 1   client            7804 non-null   int64  
 2   account_id        7611 non-null   float64
 3   first_deposit_id  7740 non-null   float64
dtypes: float64(2), int64(2)
memory usage: 244.0 KB


None

'Data types of df_clients:'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5327 entries, 0 to 5326
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   client       5327 non-null   int64         
 1   client_id    5327 non-null   int64         
 2   _created_on  5327 non-null   datetime64[ns]
 3   type         5327 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 166.6+ KB


None

'Data types of df_partner_codes:'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1592 entries, 0 to 1591
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   partner_code_id  1592 non-null   int64         
 1   code             1592 non-null   int64         
 2   _created_on      1592 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(2)
memory usage: 37.4 KB


None

'Data types of df_a2p_ref:'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7733 entries, 0 to 7732
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   account_id       7733 non-null   int64  
 1   ref_id           7733 non-null   int64  
 2   partner_code_id  7611 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 181.4 KB


None

### Verify uniqueness constraints

In [5]:
# Run the uniqueness check for every table against the first entry registered for it
# in EXPECTED_COLUMNS (presumably the table's key column - confirm in constants.py).
for table_name, table in DATAFRAMES.items():
    uniqueness_target = EXPECTED_COLUMNS[table_name][0]
    display(verify_uniqueness_constraints(table, table_name, uniqueness_target))

'Are all critical columns in df_trans unique?'

True

'Are all critical columns in df_trader unique?'

True

'Are all critical columns in df_clients unique?'

True

'Are all critical columns in df_partner_codes unique?'

False

'Are all critical columns in df_a2p_ref unique?'

True

### Referential integrity checks

In [6]:
# Referential-integrity checks: every foreign-key value that is present must exist in the
# table it points to. Missing (NaN) foreign keys are dropped before comparing, because NaN
# is never a member of the referenced key set and would force the result to False no matter
# how the real references look; null counts are reported by the data-type check instead.

# ensures every trading account belongs to a real client
display('Is every client in the trader table in the clients table?')
display(set(DATAFRAMES['df_trader'].client).issubset(set(DATAFRAMES['df_clients'].client)))

# ensures every transaction belongs to a real trading account
display('Is every login in the trans table in the trader table?')
display(set(DATAFRAMES['df_trans'].login).issubset(set(DATAFRAMES['df_trader'].login)))

# ensures the "first deposit" reference points to an actual transaction
display('Is every first_deposit_id in the trader table in the trans table?')
display(set(DATAFRAMES['df_trader'].first_deposit_id.dropna()).issubset(set(DATAFRAMES['df_trans'].transaction_id)))

# ensures partner codes are linked to real accounts
display('Is every account_id in the a2p_ref table in the trader table?')
display(set(DATAFRAMES['df_a2p_ref'].account_id).issubset(set(DATAFRAMES['df_trader'].account_id.dropna())))

# ensures partner references are valid
# partner_code_id is nullable in df_a2p_ref, so nulls are excluded like first_deposit_id above
display('Is every partner_code_id in the a2p_ref table in the partner_codes table?')
display(set(DATAFRAMES['df_a2p_ref'].partner_code_id.dropna()).issubset(set(DATAFRAMES['df_partner_codes'].partner_code_id)))

'Is every client in the trader table in the clients table?'

False

'Is every login in the trans table in the trader table?'

False

'Is every first_deposit_id in the trader table in the trans table?'

True

'Is every account_id in the a2p_ref table in the trader table?'

True

'Is every partner_code_id in the a2p_ref table in the partner_codes table?'

False

### Check logical constraints

In [8]:
# Logical constraints on individual columns.

# Every transaction amount must be strictly positive.
display('Are all amounts higher than 0?')
display(all(DATAFRAMES['df_trans'].amount > 0))

# Every distinct registration type must be one of the allowed values.
# This has to be a subset test: `set(...) in [...]` asks whether the whole set is an
# element of the list of strings, which is False for any data.
display('Are all registration types valid?')
display(set(DATAFRAMES['df_clients'].type.unique()).issubset({'Full', 'Light', 'Demo'}))

'Are all amounts higher than 0?'

False

'Are all registration types valid?'

False

# Conclusion

- not all `code` values are unique, while `partner_code_id` values are - repeated registrations of partners?
- some critical fields (FKs) contain nulls: `account_id` and `first_deposit_id` in `df_trader`, `partner_code_id` in `df_a2p_ref`
- convert currencies
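- in `df_trader`, convert `account_id` and `first_deposit_id` to a nullable integer dtype (`Int64`) - plain `int` cannot hold the missing values
- in `df_a2p_ref`, convert `partner_code_id` to a nullable integer dtype (`Int64`) for the same reason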
- parse dates, passing an explicit format where necessary
- there are trading accounts that reference clients that do not exist in the clients table - check how many and if not significant, drop them
- there are logins in the transaction table that reference traders that do not exist - check how many and if not significant, drop them
- some `partner_code_id` values in `df_a2p_ref` have no match in the partner codes table (missing values count as unmatched, so separate nulls from genuinely unknown IDs first) - drop them; they are not essential for FTD calculation, only needed for attribution
- the amount is not always greater than 0 - remove invalid amounts
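- the registration-type check returned `False` - confirm it by testing each distinct `type` value against the allowed list (an `in` test of a whole set against a list is always `False`), then remove rows with invalid registration types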