In [None]:
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from constants import DATAFRAMES_CLEAN

# Data Exploration

Goal: Understand patterns, distributions, and relationships in the cleaned data.

In [None]:
# Report how many rows each cleaned dataframe holds.
for name, frame in DATAFRAMES_CLEAN.items():
    display(f'{name} {len(frame)} records')

## 1. Univariate Analysis

### 1.1 Clients Analysis

In [None]:
# Tally clients per registration type, show the table, then chart it.
display('Client count by registration type:')
type_counts = DATAFRAMES_CLEAN['df_clients']['type'].value_counts()
display(type_counts)

# One bar per registration type; the type also drives the bar colour.
fig = px.bar(
    x=type_counts.index,
    y=type_counts.values,
    color=type_counts.index,
    labels={'x': 'Registration Type', 'y': 'Count'},
    title='Clients by Registration Type',
)
fig.show()

In [None]:
# Derive a calendar-date column from the `_created_on` timestamp. The column is
# written onto the shared cleaned clients frame itself, so it stays visible to
# every later cell that reads DATAFRAMES_CLEAN['df_clients'].
clients_df = DATAFRAMES_CLEAN['df_clients']
clients_df['registration_date'] = clients_df['_created_on'].dt.date

# Number of client rows registered on each date.
daily_registrations = clients_df.groupby('registration_date').size()
registrations_by_date = daily_registrations.reset_index(name='count')

fig = px.line(
    registrations_by_date,
    x='registration_date',
    y='count',
    labels={'registration_date': 'Date', 'count': 'Number of Registrations'},
    title='Client Registrations Over Time',
)
fig.show()

### 1.2 Trader Analysis

In [None]:
# Number of df_trader rows (trading accounts) belonging to each client.
trader_df = DATAFRAMES_CLEAN['df_trader']
accounts_per_client = trader_df.groupby('client').size()

display('Accounts per client - statistics:')
display(accounts_per_client.describe())

# Frequency table: how many clients hold 1, 2, 3, ... accounts, ordered by
# the number of accounts.
clients_per_account_count = accounts_per_client.value_counts().sort_index()
acc_dist = clients_per_account_count.reset_index()
acc_dist.columns = ['num_accounts', 'num_clients']

fig = px.bar(
    acc_dist,
    x='num_accounts',
    y='num_clients',
    labels={'num_accounts': 'Number of Accounts', 'num_clients': 'Number of Clients'},
    title='Distribution of Accounts per Client',
)
fig.show()

In [None]:
# Split trader rows by whether a first_deposit_id is recorded and report each
# group's share of all trader rows.
trader_df = DATAFRAMES_CLEAN['df_trader']
total_traders = len(trader_df)

has_first_deposit = trader_df['first_deposit_id'].notna().sum()
no_first_deposit = trader_df['first_deposit_id'].isna().sum()

# The dataframe lookup is hoisted out of the f-strings: reusing the enclosing
# quote character inside a replacement field is a SyntaxError before Python 3.12.
display(f'Traders with first_deposit_id: {has_first_deposit} ({has_first_deposit/total_traders*100:.1f}%)')
display(f'Traders without first_deposit_id: {no_first_deposit} ({no_first_deposit/total_traders*100:.1f}%)')

fig = px.pie(values=[has_first_deposit, no_first_deposit],
             names=['Has First Deposit', 'No First Deposit'],
             title='Traders with First Deposit ID',
             color_discrete_sequence=['seagreen', 'lightgray'])
fig.show()

### 1.3 Transactions Analysis

In [None]:
# Add a calendar-date column derived from `created_at`. It is written onto the
# shared cleaned transactions frame, so later cells see it too.
trans_df = DATAFRAMES_CLEAN['df_trans']
trans_df['trans_date'] = trans_df['created_at'].dt.date

# Number of transaction rows per date.
daily_transactions = trans_df.groupby('trans_date').size()
trans_by_date = daily_transactions.reset_index(name='count')

fig = px.line(
    trans_by_date,
    x='trans_date',
    y='count',
    labels={'trans_date': 'Date', 'count': 'Number of Transactions'},
    title='Transaction Volume Over Time',
)
fig.show()

In [None]:
# Descriptive statistics and a histogram of the USD transaction amounts.
trans_df = DATAFRAMES_CLEAN['df_trans']
amount_usd = trans_df['amount_USD']

display('Transaction amount (USD) statistics:')
display(amount_usd.describe())

fig = px.histogram(
    trans_df,
    x='amount_USD',
    nbins=50,
    labels={'amount_USD': 'Amount (USD)'},
    title='Distribution of Transaction Amounts (USD)',
)
# Start the bins at 0 and clamp the x-axis to [0, largest amount].
fig.update_traces(xbins=dict(start=0))
fig.update_layout(height=500, showlegend=False)
fig.update_xaxes(range=[0, amount_usd.max()])
fig.show()

## 2. Bivariate Analysis

### 2.1 Clients vs Traders: Logins per Client

In [None]:
# Count of non-null `login` values per client in the trader table.
trader_df = DATAFRAMES_CLEAN['df_trader']
logins_per_client = trader_df.groupby('client')['login'].count()

display('Logins per client - statistics:')
display(logins_per_client.describe())

# Exactly one login first, then the "at least N" thresholds.
display(f'Clients with 1 login: {(logins_per_client == 1).sum()}')
for min_logins in (2, 5):
    display(f'Clients with {min_logins}+ logins: {(logins_per_client >= min_logins).sum()}')

### 2.2 Traders vs Transactions: Transaction Frequency per Login

In [None]:
# Transaction frequency per login, measured as the count of non-null
# transaction_id values in each login group. Per the original author's note,
# every row of the cleaned df_trans carries a transaction_id, in which case this
# count equals the group size (not verifiable from this notebook alone).
trans_df = DATAFRAMES_CLEAN['df_trans']
trans_per_login = trans_df.groupby('login')['transaction_id'].count()

display('Transactions per login - statistics:')
display(trans_per_login.describe())

### 2.3 Time Patterns: Deposits by Day of Week/Hour/Week

In [None]:
# Label every transaction with its weekday name. The column is written onto the
# shared cleaned transactions frame, so later cells see it too.
trans_df = DATAFRAMES_CLEAN['df_trans']
trans_df['day_of_week'] = trans_df['created_at'].dt.day_name()

# Present the weekdays Monday-first instead of in value_counts' frequency order.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# fill_value=0 gives a weekday with no transactions an explicit zero count;
# a plain reindex would insert NaN there and turn the count column into floats.
trans_by_dow = (
    trans_df['day_of_week']
    .value_counts()
    .reindex(day_order, fill_value=0)
    .reset_index()
)
trans_by_dow.columns = ['day', 'count']

fig = px.bar(trans_by_dow, x='day', y='count',
             title='Transactions by Day of Week',
             labels={'day': 'Day of Week', 'count': 'Number of Transactions'})
fig.show()

In [None]:
# Extract the hour component of `created_at`. The column is written onto the
# shared cleaned transactions frame, so later cells see it too.
trans_df = DATAFRAMES_CLEAN['df_trans']
trans_df['hour'] = trans_df['created_at'].dt.hour

# Number of transaction rows in each hour of the day.
hourly_transactions = trans_df.groupby('hour').size()
trans_by_hour = hourly_transactions.reset_index(name='count')

fig = px.bar(
    trans_by_hour,
    x='hour',
    y='count',
    labels={'hour': 'Hour', 'count': 'Number of Transactions'},
    title='Transactions by Hour of Day',
)
fig.show()

NOTE: The hourly distribution above may be heavily skewed, because the timestamps carry no time zone information.

In [None]:
# Bucket transactions by "<year>-W<week>". %W numbers weeks Monday-first, and
# days before the year's first Monday land in week 00. The column is written
# onto the shared cleaned transactions frame.
trans_df = DATAFRAMES_CLEAN['df_trans']
trans_df['year_week'] = trans_df['created_at'].dt.strftime('%Y-W%W')

# Per week: number of non-null transaction ids and summed USD amount.
weekly_groups = trans_df.groupby('year_week')
trans_by_week = weekly_groups.agg(
    count=('transaction_id', 'count'),
    total_amount_usd=('amount_USD', 'sum'),
).reset_index()

panel_titles = ('Transaction Count by Week',
                'Total Transaction Amount (USD) by Week')
fig = make_subplots(rows=2, cols=1, subplot_titles=panel_titles)

# (subplot row, source column, trace name, y-axis title) for each panel.
weekly_panels = [
    (1, 'count', 'Count', 'Count'),
    (2, 'total_amount_usd', 'Amount', 'Amount (USD)'),
]

for panel_row, value_column, trace_name, _ in weekly_panels:
    weekly_trace = go.Scatter(
        x=trans_by_week['year_week'],
        y=trans_by_week[value_column],
        mode='lines+markers',
        name=trace_name,
    )
    fig.add_trace(weekly_trace, row=panel_row, col=1)

fig.update_layout(height=600, showlegend=False)
for panel_row, _, _, axis_title in weekly_panels:
    fig.update_yaxes(title_text=axis_title, row=panel_row, col=1)
fig.show()

## 3. Relationship Analysis

### 3.1 Complete Picture: Joined Data

In [None]:
# Build the combined view in two left joins so every transaction row is kept:
# first attach the trader row for each login, then the client row for each
# client.
trans_with_trader = DATAFRAMES_CLEAN['df_trans'].merge(
    DATAFRAMES_CLEAN['df_trader'], on='login', how='left'
)
df_full = trans_with_trader.merge(
    DATAFRAMES_CLEAN['df_clients'], on='client', how='left'
)

display(f'Full joined dataset: {len(df_full)} records')
display(df_full.head())

### 3.2 What % of Clients Have Made Deposits?

In [None]:
# Distinct clients appearing on at least one joined row that has a
# transaction_id, compared against all distinct clients in the clients table.
has_transaction = df_full['transaction_id'].notna()
clients_with_deposits = df_full[has_transaction]['client'].nunique()
total_clients = DATAFRAMES_CLEAN['df_clients']['client'].nunique()

display(f'Total clients: {total_clients}')
display(f'Clients with deposits: {clients_with_deposits}')
display(f'Percentage: {clients_with_deposits/total_clients*100:.1f}%')

# Pie slices: depositing clients versus the remainder of the client base.
clients_without_deposits = total_clients - clients_with_deposits
fig = px.pie(
    values=[clients_with_deposits, clients_without_deposits],
    names=['With Deposits', 'No Deposits'],
    color_discrete_sequence=['seagreen', 'lightgray'],
    title='Clients with Deposits',
)
fig.show()

### 3.3 Time to First Deposit Analysis

In [None]:
# Per client: earliest transaction timestamp, earliest registration timestamp,
# and the amount of the first deposit.
#
# 'first' returns the first non-null value in row order within each group, so
# the rows must be in chronological order for ftd_amount to describe the
# earliest transaction rather than whichever row happens to come first in
# df_full. groupby keeps the row order inside each group, so sorting by
# created_at beforehand is sufficient; the stable sort keeps ties in their
# original order.
client_first_deposit = (
    df_full
    .sort_values('created_at', kind='stable')
    .groupby('client')
    .agg(
        first_deposit_date=('created_at', 'min'),
        registration_date=('_created_on', 'min'),
        ftd_amount=('amount_USD', 'first')
    )
    .reset_index()
)

# Elapsed time from registration to first deposit, in fractional days
# (86400 seconds per day).
client_first_deposit['days_to_ftd'] = (
    client_first_deposit['first_deposit_date'] - client_first_deposit['registration_date']
).dt.total_seconds() / 86400

display('Time to First Deposit (days) - Statistics:')
display(client_first_deposit['days_to_ftd'].describe())

In [None]:
fig = px.histogram(
    client_first_deposit,
    x='days_to_ftd',
    nbins=50,
    labels={'days_to_ftd': 'Days from Registration to First Deposit'},
    title='Distribution of Time to First Deposit (Days)',
)
fig.update_traces(xbins=dict(start=0))
fig.update_layout(height=500, showlegend=False)
fig.show()

# Bucketed breakdown, because the summary statistics show very large values.
# Buckets are half-open on the right. Any value below 1 (including negative
# ones) counts as "same day", and missing values fall into no bucket.
days_to_ftd = client_first_deposit['days_to_ftd']
same_day = (days_to_ftd < 1).sum()
within_week = ((days_to_ftd >= 1) & (days_to_ftd < 7)).sum()
within_month = ((days_to_ftd >= 7) & (days_to_ftd < 30)).sum()
later = (days_to_ftd >= 30).sum()

# Percentages are relative to every row of client_first_deposit.
n_clients = len(client_first_deposit)
ftd_buckets = [
    ('Same day (< 1 day)', same_day),
    ('Within first week (1-7 days)', within_week),
    ('Within first month (7-30 days)', within_month),
    ('After 30 days', later),
]
for bucket_label, bucket_size in ftd_buckets:
    display(f'{bucket_label}: {bucket_size} ({bucket_size/n_clients*100:.1f}%)')

In [None]:
# Attach each client's registration type (inner join on `client`) so the
# time-to-first-deposit distribution can be compared across types.
client_types = DATAFRAMES_CLEAN['df_clients'][['client', 'type']]
ftd_with_type = client_first_deposit.merge(client_types, on='client')

fig = px.box(
    ftd_with_type,
    x='type',
    y='days_to_ftd',
    labels={'type': 'Registration Type', 'days_to_ftd': 'Days to First Deposit'},
    title='Time to First Deposit by Registration Type',
)
fig.show()

## 4. Summary Statistics

In [None]:
# Headline figures collected from the earlier cells. This cell relies on
# total_clients, clients_with_deposits, logins_per_client and trans_per_login
# having been computed above.
#
# The amount column is looked up once outside the f-strings: reusing the
# enclosing quote character inside a replacement field is a SyntaxError before
# Python 3.12.
amount_usd = DATAFRAMES_CLEAN['df_trans']['amount_USD']

summary = {
    'Total Clients': total_clients,
    'Total Logins': len(DATAFRAMES_CLEAN['df_trader']),
    'Total Transactions': len(DATAFRAMES_CLEAN['df_trans']),
    'Clients with Deposits': clients_with_deposits,
    'Client Deposit Rate': f'{clients_with_deposits/total_clients*100:.1f}%',
    'Avg Logins per Client': f'{logins_per_client.mean():.2f}',
    'Avg Transactions per Login': f'{trans_per_login.mean():.2f}',
    'Total Volume (USD)': f'{amount_usd.sum():.2f}',
    'Avg Transaction (USD)': f'{amount_usd.mean():.2f}'
}

display('SUMMARY STATISTICS')
for key, value in summary.items():
    display(f'{key}: {value}')

## 5. Key Findings

1. **Client Distribution**: The dataset contains 5325 clients, predominantly Full registrations (85%, 4530), followed by Demo (8%, 431) and Light (7%, 364). This indicates a mature client base with most users completing full registration.

2. **High Deposit Rate**: 99.4% of clients (5294 out of 5325) have made at least one deposit. This exceptionally high rate is consistent across all registration types (~99% each), suggesting the dataset may represent active/depositing clients rather than the full client population.

3. **Account Structure**: Most clients (74%, 3914) have a single trading account, while 26% have multiple accounts. The average is 1.47 logins per client, with one outlier holding 43 accounts (likely a service or test account).

4. **Transaction Characteristics**:
   - Total volume: 4.55M USD across 7763 transactions
   - Average transaction: 585.92 USD
   - Median transaction: 220.37 USD (right-skewed distribution influenced by large outliers)
   - Each login has exactly 1 transaction in this dataset, indicating these are likely first deposits only

5. **Time-to-FTD Distribution**: The median time from registration to FTD is 6.5 years. This reflects the transaction data covering only a recent period (2025) while registrations date back years earlier; it is an artifact of the data window, not an actual delay in client behavior.