In [None]:
import pandas as pd

from functions import check_expected_columns, check_dataframe_info, verify_uniqueness_constraints
from constants import DATAFRAMES, EXPECTED_COLUMNS 

# Data validation

The goal is to ensure data integrity and catch issues early.

In [None]:
# Show a labelled preview of every table so the reader can eyeball its contents
# before the automated checks run.
for table_name, frame in DATAFRAMES.items():
    display(f'Preview of {table_name}:', frame.head())

### Check for expected columns

In [None]:
# Run the column check for each table against its entry in EXPECTED_COLUMNS
# and render whatever the helper returns.
for table_name, frame in DATAFRAMES.items():
    expected = EXPECTED_COLUMNS[table_name]
    display(check_expected_columns(frame, table_name, expected))

### Verify data types

In [None]:
# Render the info report produced by the project helper for each table.
for table_name, frame in DATAFRAMES.items():
    info_report = check_dataframe_info(frame, table_name)
    display(info_report)

### Verify uniqueness constraints

In [None]:
# The uniqueness check is run on the first entry of each table's expected
# column list.
# NOTE(review): presumably index 0 is the table's primary key - confirm in constants.
for table_name, frame in DATAFRAMES.items():
    key_column = EXPECTED_COLUMNS[table_name][0]
    display(verify_uniqueness_constraints(frame, table_name, key_column))

### Referential integrity checks

In [None]:
def references_resolve(child_keys, parent_keys):
    """Return True when every non-null value of ``child_keys`` occurs in ``parent_keys``.

    Null references are dropped from the child side before comparing. NaN never
    compares equal to itself, so a single null foreign key would otherwise force
    the result to False and make a *missing* reference indistinguishable from a
    *dangling* one. Completeness of the FK columns has to be checked separately.

    Parameters
    ----------
    child_keys : pandas.Series
        Referencing (foreign-key) column.
    parent_keys : pandas.Series
        Referenced (key) column.

    Returns
    -------
    bool
    """
    return set(child_keys.dropna()).issubset(set(parent_keys))


# Each entry: (question shown to the reader,
#              referencing table, referencing column,
#              referenced table, referenced column)
REFERENCE_CHECKS = [
    # ensures every trading account belongs to a real client
    ('Is every client in the trader table in the clients table?',
     'df_trader', 'client', 'df_clients', 'client'),
    # ensures every transaction belongs to a real trading account
    ('Is every login in the trans table in the trader table?',
     'df_trans', 'login', 'df_trader', 'login'),
    # ensures the "first deposit" reference points to an actual transaction
    ('Is every first_deposit_id in the trader table in the trans table?',
     'df_trader', 'first_deposit_id', 'df_trans', 'transaction_id'),
    # ensures partner codes are linked to real accounts
    ('Is every account_id in the a2p_ref table in the trader table?',
     'df_a2p_ref', 'account_id', 'df_trader', 'account_id'),
    # ensures partner references are valid
    ('Is every partner_code_id in the a2p_ref table in the partner_codes table?',
     'df_a2p_ref', 'partner_code_id', 'df_partner_codes', 'partner_code_id'),
]

for question, child_table, child_column, parent_table, parent_column in REFERENCE_CHECKS:
    display(question)
    display(
        references_resolve(
            DATAFRAMES[child_table][child_column],
            DATAFRAMES[parent_table][parent_column],
        )
    )

### Check logical constraints

In [None]:
# Registration types a client row is allowed to carry.
VALID_REGISTRATION_TYPES = {'Full', 'Light', 'Demo'}

display('Are all amounts higher than 0?')
# Vectorized comparison; a null amount compares as False and therefore fails the check.
display(bool((DATAFRAMES['df_trader']['amount'] > 0).all()))

display('Are all registration types valid?')
# Subset test: the previous `set(...) in [...]` compared the whole set against each
# string and was therefore always False, whatever the data contained.
# A null type is not in the valid set, so it is reported as invalid.
# Bracket access is used because `type` is a fragile attribute name on a DataFrame.
display(set(DATAFRAMES['df_clients']['type'].unique()).issubset(VALID_REGISTRATION_TYPES))

# Conclusion

- not all `code` values are unique while `partner_code_id` is - repeated registrations of partners?
- some critical fields (FKs) contain null values
- convert currencies
- in `df_trader`, convert `account_id` and `first_deposit_id` to `int`
- in `df_a2p_ref`, convert `partner_code_id` to `int`
- parse dates and pass an explicit format where necessary
- there are trading accounts that reference clients that do not exist in the clients table - check how many and if not significant, drop them
- there are logins in the transaction table that reference traders that do not exist - check how many and if not significant, drop them
- there are account-to-partner partner ID references that do not have a valid value in the partner codes table - drop them, not essential for FTD calculation, only needed for attribution
- the amount is not always greater than 0 - remove invalid amounts
- the registration types are not always in the valid values - remove invalid registration types