In [None]:
import pandas as pd

# Data validation

The goal is to ensure data integrity and catch issues early.

In [None]:
# Load the five raw extracts. Every file is semicolon-separated; the
# timestamp columns are parsed on read using the ISO8601 date format.
df_trans = pd.read_csv(
    '../../data/raw/trans.csv',
    sep=';',
    parse_dates=['created_at'],
    date_format='ISO8601',
)
df_trader = pd.read_csv(
    '../../data/raw/trader.csv',
    sep=';',
)
df_clients = pd.read_csv(
    '../../data/raw/clients.csv',
    sep=';',
    parse_dates=['_created_on'],
    date_format='ISO8601',
)
df_partner_codes = pd.read_csv(
    '../../data/raw/partner_codes.csv',
    sep=';',
    parse_dates=['_created_on'],
    date_format='ISO8601',
)
df_a2p_ref = pd.read_csv(
    '../../data/raw/a2p_ref.csv',
    sep=';',
)

In [None]:
# Show a label followed by the first rows of each table. display() renders
# its arguments one after another, in the order given.
display("Preview of df_trans:", df_trans.head())
display("Preview of df_trader:", df_trader.head())
display("Preview of df_clients:", df_clients.head())
display("Preview of df_partner_codes:", df_partner_codes.head())
display("Preview of df_a2p_ref:", df_a2p_ref.head())

### Check for expected columns

In [None]:
# Compare each table's column set with the expected schema. This is an exact
# match: a missing column and an unexpected extra column both yield False.
display(
    'Are all expected columns in trans.csv? '
    + str(set(df_trans.columns) == {"login", "created_at", "transaction_id", "amount", "currency"})
)
display(
    'Are all expected columns in trader.csv? '
    + str(set(df_trader.columns) == {"login", "client", "account_id", "first_deposit_id"})
)
display(
    'Are all expected columns in clients.csv? '
    + str(set(df_clients.columns) == {"client", "client_id", "_created_on", "type"})
)
display(
    'Are all expected columns in partner_codes.csv? '
    + str(set(df_partner_codes.columns) == {"partner_code_id", "_created_on", "code"})
)
display(
    'Are all expected columns in a2p_ref.csv? '
    + str(set(df_a2p_ref.columns) == {"ref_id", "account_id", "partner_code_id"})
)

### Verify data types

In [None]:
# Print the dtype / non-null summary of every table.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in display() would render a stray "None"
# after each summary.
display("Data types of df_trans:")
df_trans.info()
display("Data types of df_trader:")
df_trader.info()
display("Data types of df_clients:")
df_clients.info()
display("Data types of df_partner_codes:")
df_partner_codes.info()
display("Data types of df_a2p_ref:")
df_a2p_ref.info()

### Verify uniqueness constraints

In [None]:
# Report, per candidate key column, whether its values are unique
# (Series.is_unique).
display(f'Are all transaction_id values unique? {df_trans["transaction_id"].is_unique}')
display(f'Are all login values unique in trader? {df_trader["login"].is_unique}')
display(f'Are all client values unique in clients? {df_clients["client"].is_unique}')
display(f'Are all partner_code_id values unique? {df_partner_codes["partner_code_id"].is_unique}')
display(f'Are all code values unique in partner_codes? {df_partner_codes["code"].is_unique}')
display(f'Are all ref_id values unique in a2p_ref? {df_a2p_ref["ref_id"].is_unique}')

### Referential integrity checks

In [None]:
# Each check asks whether the set of foreign-key values in the child table is
# a subset (<=) of the key values in the parent table.

# trader.client -> clients.client: every trading account belongs to a real client
display('Is every client in the trader table in the clients table?')
display(set(df_trader['client']) <= set(df_clients['client']))

# trans.login -> trader.login: every transaction belongs to a real trading account
display('Is every login in the trans table in the trader table?')
display(set(df_trans['login']) <= set(df_trader['login']))

# trader.first_deposit_id -> trans.transaction_id: the "first deposit" reference
# points to an actual transaction (missing first_deposit_id values are dropped first)
display('Is every first_deposit_id in the trader table in the trans table?')
display(set(df_trader['first_deposit_id'].dropna()) <= set(df_trans['transaction_id']))

# a2p_ref.account_id -> trader.account_id: partner codes are linked to real accounts
# (missing values are dropped on the trader side only)
display('Is every account_id in the a2p_ref table in the trader table?')
display(set(df_a2p_ref['account_id']) <= set(df_trader['account_id'].dropna()))

# a2p_ref.partner_code_id -> partner_codes.partner_code_id: partner references are valid
display('Is every partner_code_id in the a2p_ref table in the partner_codes table?')
display(set(df_a2p_ref['partner_code_id']) <= set(df_partner_codes['partner_code_id']))

### Check logical constraints

In [None]:
# Value-level sanity checks on individual columns.

# Every transaction amount must be strictly positive. A missing amount
# compares as False and therefore fails the check. bool() keeps the displayed
# value a plain True/False rather than a NumPy scalar.
display('Are all amounts higher than 0?')
display(bool((df_trans['amount'] > 0).all()))

# Every registration type must be one of the allowed values. The set of
# observed values is tested as a subset of the allowed set; testing the whole
# set for membership in a list of strings would always be False.
display('Are all registration types valid?')
display(set(df_clients['type'].unique()).issubset({'Full', 'Light', 'Demo'}))

# Conclusion

- not all `code` values are unique while `partner_code_id` is - repeated registrations of partners?
- some critical fields (FKs) contain null values
- convert currencies
- in `df_trader`, convert `account_id` and `first_deposit_id` to `int`
- in `df_a2p_ref`, convert `partner_code_id` to `int`
- parse dates and pass an explicit format where necessary
- there are trading accounts that reference clients that do not exist in the clients table - check how many and if not significant, drop them
- there are logins in the transaction table that reference traders that do not exist - check how many and if not significant, drop them
- there are account-to-partner references whose partner code ID has no matching entry in the partner codes table - drop them; they are not essential for FTD calculation, only needed for attribution
- the amount is not always greater than 0 - remove invalid amounts
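- the registration-type check returned `False`, but testing a whole `set` for membership in a list of strings is always `False`, so this result is inconclusive - re-check the values element-wise (e.g. with `isin`) and remove invalid registration types if any exist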