In [None]:
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from constants import DATAFRAMES_CLEAN

# Data Exploration

Goal: Understand patterns, distributions, and relationships in the cleaned data.

In [None]:
# Report how many rows each cleaned table contains.
# The outer f-string delimiters are double quotes so that the single-quoted
# dictionary keys inside the replacement fields also parse on Python < 3.12
# (reusing the enclosing quote character there is only legal since PEP 701).
# The rendered text is unchanged.
display(f"Clients: {len(DATAFRAMES_CLEAN['df_clients'])} records")
display(f"Traders: {len(DATAFRAMES_CLEAN['df_trader'])} records")
display(f"Transactions: {len(DATAFRAMES_CLEAN['df_trans'])} records")

## 1. Univariate Analysis

### 1.1 Clients Analysis

In [None]:
# Tabulate how often each value of the clients' 'type' column occurs,
# show the table, then draw the same counts as a bar chart.
display('Client count by registration type:')
type_counts = DATAFRAMES_CLEAN['df_clients']['type'].value_counts()
display(type_counts)

# One bar per distinct 'type' value; each bar is coloured by that value.
fig = px.bar(
    x=type_counts.index,
    y=type_counts.values,
    color=type_counts.index,
    labels={'x': 'Registration Type', 'y': 'Count'},
    title='Clients by Registration Type',
)
fig.show()

In [None]:
# Derive the calendar date from '_created_on' and store it as a new
# 'registration_date' column.
# NOTE: this writes into the shared DATAFRAMES_CLEAN['df_clients'] frame, so the
# column remains available to every cell that runs afterwards.
DATAFRAMES_CLEAN['df_clients']['registration_date'] = (
    DATAFRAMES_CLEAN['df_clients']['_created_on'].dt.date
)

# Number of client rows per registration date, as a two-column frame
# ('registration_date', 'count').
registrations_by_date = (
    DATAFRAMES_CLEAN['df_clients']
    .groupby('registration_date')
    .size()
    .reset_index(name='count')
)

fig = px.line(
    registrations_by_date,
    x='registration_date',
    y='count',
    labels={'registration_date': 'Date', 'count': 'Number of Registrations'},
    title='Client Registrations Over Time',
)
fig.show()

### 1.2 Traders Analysis

In [None]:
# Count the rows of the trader table for each distinct 'client' value.
accounts_per_client = DATAFRAMES_CLEAN['df_trader'].groupby('client').size()

display('Accounts per client - statistics:')
display(accounts_per_client.describe())

# Frequency table of those per-client counts, ordered by the count itself:
# 'num_accounts' is a per-client row count, 'num_clients' is how many clients
# have exactly that count.
acc_dist = (
    accounts_per_client
    .value_counts()
    .sort_index()
    .rename_axis('num_accounts')
    .reset_index(name='num_clients')
)

fig = px.bar(
    acc_dist,
    x='num_accounts',
    y='num_clients',
    labels={'num_accounts': 'Number of Accounts', 'num_clients': 'Number of Clients'},
    title='Distribution of Accounts per Client',
)
fig.show()

In [None]:
# Split the trader rows by whether 'first_deposit_id' is populated, report both
# counts with their share of all trader rows, and show the split as a pie chart.
first_deposit_ids = DATAFRAMES_CLEAN['df_trader']['first_deposit_id']
n_traders = len(DATAFRAMES_CLEAN['df_trader'])

has_first_deposit = first_deposit_ids.notna().sum()
no_first_deposit = first_deposit_ids.isna().sum()

# Double-quoted f-strings: the original nested single quotes inside the
# replacement fields, which is a SyntaxError before Python 3.12 (PEP 701).
# The total row count is computed once above instead of once per message.
display(f"Traders with first_deposit_id: {has_first_deposit} ({has_first_deposit / n_traders * 100:.1f}%)")
display(f"Traders without first_deposit_id: {no_first_deposit} ({no_first_deposit / n_traders * 100:.1f}%)")

fig = px.pie(
    values=[has_first_deposit, no_first_deposit],
    names=['Has First Deposit', 'No First Deposit'],
    title='Traders with First Deposit ID',
    color_discrete_sequence=['seagreen', 'lightgray'],
)
fig.show()

### 1.3 Transactions Analysis

In [None]:
# Derive the calendar date from 'created_at' and store it as a new
# 'trans_date' column.
# NOTE: this writes into the shared DATAFRAMES_CLEAN['df_trans'] frame, so the
# column remains available to every cell that runs afterwards.
DATAFRAMES_CLEAN['df_trans']['trans_date'] = (
    DATAFRAMES_CLEAN['df_trans']['created_at'].dt.date
)

# Number of transaction rows per date, as a two-column frame
# ('trans_date', 'count').
trans_by_date = (
    DATAFRAMES_CLEAN['df_trans']
    .groupby('trans_date')
    .size()
    .reset_index(name='count')
)

fig = px.line(
    trans_by_date,
    x='trans_date',
    y='count',
    labels={'trans_date': 'Date', 'count': 'Number of Transactions'},
    title='Transaction Volume Over Time',
)
fig.show()

In [None]:
# Summary statistics for the 'amount_USD' column, followed by a histogram of it.
display('Transaction amount (USD) statistics:')
display(DATAFRAMES_CLEAN['df_trans']['amount_USD'].describe())

fig = px.histogram(
    DATAFRAMES_CLEAN['df_trans'],
    x='amount_USD',
    nbins=50,
    labels={'amount_USD': 'Amount (USD)'},
    title='Distribution of Transaction Amounts (USD)',
)
# Binning is anchored at 0 and the visible x-axis spans [0, max amount].
# NOTE(review): any amounts below 0 would fall outside this view - confirm the
# cleaned data has none.
fig.update_traces(xbins={'start': 0})
fig.update_layout(showlegend=False, height=500)
fig.update_xaxes(range=[0, DATAFRAMES_CLEAN['df_trans']['amount_USD'].max()])
fig.show()