In [None]:
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from constants import DATAFRAMES_CLEAN

# Data Exploration

Goal: Understand patterns, distributions, and relationships in the cleaned data.

In [None]:
# Report how many rows each cleaned table holds.
# The f-strings use outer double quotes so the nested ['...'] lookups do not
# reuse the enclosing quote character; reusing it is only accepted by
# Python 3.12+ and is a SyntaxError on earlier interpreters.
display(f"Clients: {len(DATAFRAMES_CLEAN['df_clients'])} records")
display(f"Traders: {len(DATAFRAMES_CLEAN['df_trader'])} records")
display(f"Transactions: {len(DATAFRAMES_CLEAN['df_trans'])} records")

## 1. Univariate Analysis

### 1.1 Clients Analysis

In [None]:
# Tally clients per value of the `type` column, show the table, then draw the
# same counts as a bar chart with one colour per registration type.
display('Client count by registration type:')
type_counts = DATAFRAMES_CLEAN['df_clients']['type'].value_counts()
display(type_counts)

fig = px.bar(
    x=type_counts.index,
    y=type_counts.values,
    color=type_counts.index,
    title='Clients by Registration Type',
    labels={'x': 'Registration Type', 'y': 'Count'},
)
fig.show()

In [None]:
# Derive the calendar day of `_created_on`, store it on the clients table as
# `registration_date`, and plot how many clients registered on each day.
# NOTE: this writes a new column onto the shared DATAFRAMES_CLEAN frame.
clients_df = DATAFRAMES_CLEAN['df_clients']
clients_df['registration_date'] = clients_df['_created_on'].dt.date

registrations_by_date = (
    clients_df
    .groupby('registration_date')
    .size()
    .reset_index(name='count')
)

fig = px.line(
    registrations_by_date,
    x='registration_date',
    y='count',
    title='Client Registrations Over Time',
    labels={'registration_date': 'Date', 'count': 'Number of Registrations'},
)
fig.show()

### 1.2 Trader Analysis

In [None]:
# Count trader rows per `client`, summarise that count, and chart how many
# clients hold 1, 2, 3, ... accounts.
trader_df = DATAFRAMES_CLEAN['df_trader']
accounts_per_client = trader_df.groupby('client').size()

display('Accounts per client - statistics:')
display(accounts_per_client.describe())

# Frequency table: number of accounts -> number of clients with that many.
acc_dist = (
    accounts_per_client
    .value_counts()
    .sort_index()
    .reset_index()
    .set_axis(['num_accounts', 'num_clients'], axis=1)
)

fig = px.bar(
    acc_dist,
    x='num_accounts',
    y='num_clients',
    title='Distribution of Accounts per Client',
    labels={'num_accounts': 'Number of Accounts', 'num_clients': 'Number of Clients'},
)
fig.show()

In [None]:
# Split trader rows by whether `first_deposit_id` is populated, report both
# counts with their share of all trader rows, and show the split as a pie chart.
trader_df = DATAFRAMES_CLEAN['df_trader']
total_traders = len(trader_df)

has_first_deposit = trader_df['first_deposit_id'].notna().sum()
no_first_deposit = trader_df['first_deposit_id'].isna().sum()

# The row total is hoisted into `total_traders` so the f-strings no longer
# contain a nested ['...'] lookup; reusing the enclosing quote character inside
# an f-string is a SyntaxError on Python versions before 3.12.
display(f'Traders with first_deposit_id: {has_first_deposit} ({has_first_deposit / total_traders * 100:.1f}%)')
display(f'Traders without first_deposit_id: {no_first_deposit} ({no_first_deposit / total_traders * 100:.1f}%)')

fig = px.pie(
    values=[has_first_deposit, no_first_deposit],
    names=['Has First Deposit', 'No First Deposit'],
    title='Traders with First Deposit ID',
    color_discrete_sequence=['seagreen', 'lightgray'],
)
fig.show()

### 1.3 Transactions Analysis

In [None]:
# Derive the calendar day of `created_at`, store it on the transactions table
# as `trans_date`, and plot the number of transactions per day.
# NOTE: this writes a new column onto the shared DATAFRAMES_CLEAN frame.
trans_df = DATAFRAMES_CLEAN['df_trans']
trans_df['trans_date'] = trans_df['created_at'].dt.date

trans_by_date = (
    trans_df
    .groupby('trans_date')
    .size()
    .reset_index(name='count')
)

fig = px.line(
    trans_by_date,
    x='trans_date',
    y='count',
    title='Transaction Volume Over Time',
    labels={'trans_date': 'Date', 'count': 'Number of Transactions'},
)
fig.show()

In [None]:
# Summary statistics and a 50-bin histogram of the `amount_USD` column.
amount_usd = DATAFRAMES_CLEAN['df_trans']['amount_USD']

display('Transaction amount (USD) statistics:')
display(amount_usd.describe())

fig = px.histogram(
    DATAFRAMES_CLEAN['df_trans'],
    x='amount_USD',
    nbins=50,
    title='Distribution of Transaction Amounts (USD)',
    labels={'amount_USD': 'Amount (USD)'},
)
# Anchor the bins at 0 and show the x-axis from 0 up to the largest amount.
fig.update_traces(xbins={'start': 0})
fig.update_layout(height=500, showlegend=False)
fig.update_xaxes(range=[0, amount_usd.max()])
fig.show()

## 2. Bivariate Analysis

### 2.1 Clients vs Traders: Logins per Client

In [None]:
# Count non-null `login` values per `client` in the trader table, then report
# summary statistics and how many clients fall into a few size buckets.
trader_df = DATAFRAMES_CLEAN['df_trader']
logins_per_client = trader_df.groupby('client')['login'].count()

display('Logins per client - statistics:')
display(logins_per_client.describe())

single_login_clients = logins_per_client.eq(1).sum()
multi_login_clients = logins_per_client.ge(2).sum()
heavy_login_clients = logins_per_client.ge(5).sum()

display(f'Clients with 1 login: {single_login_clients}')
display(f'Clients with 2+ logins: {multi_login_clients}')
display(f'Clients with 5+ logins: {heavy_login_clients}')

### 2.2 Traders vs Transactions: Transaction Frequency per Login

In [None]:
# Number of transaction rows recorded against each `login`, with summary stats.
trans_df = DATAFRAMES_CLEAN['df_trans']
trans_per_login = trans_df.groupby('login').size()

display('Transactions per login - statistics:')
display(trans_per_login.describe())

### 2.3 Time Patterns: Transactions by Day of Week, Hour, and Week

In [None]:
# Label every transaction with the weekday name of `created_at`, count
# transactions per weekday, and chart them in Monday-to-Sunday order.
# NOTE: this writes a `day_of_week` column onto the shared DATAFRAMES_CLEAN frame.
trans_df = DATAFRAMES_CLEAN['df_trans']
trans_df['day_of_week'] = trans_df['created_at'].dt.day_name()

day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# fill_value=0: a weekday with no transactions is absent from value_counts();
# without a fill value, reindex would insert NaN for it and turn the whole
# count column into floats instead of reporting a count of zero.
trans_by_dow = (
    trans_df['day_of_week']
    .value_counts()
    .reindex(day_order, fill_value=0)
    .reset_index()
)
trans_by_dow.columns = ['day', 'count']

fig = px.bar(
    trans_by_dow,
    x='day',
    y='count',
    title='Transactions by Day of Week',
    labels={'day': 'Day of Week', 'count': 'Number of Transactions'},
)
fig.show()

In [None]:
# Extract the hour component of `created_at`, store it on the transactions
# table as `hour`, and chart the number of transactions per hour.
# NOTE: this writes a new column onto the shared DATAFRAMES_CLEAN frame.
trans_df = DATAFRAMES_CLEAN['df_trans']
trans_df['hour'] = trans_df['created_at'].dt.hour

trans_by_hour = (
    trans_df
    .groupby('hour')
    .size()
    .reset_index(name='count')
)

fig = px.bar(
    trans_by_hour,
    x='hour',
    y='count',
    title='Transactions by Hour of Day',
    labels={'hour': 'Hour', 'count': 'Number of Transactions'},
)
fig.show()

NOTE: The hourly distribution above may be heavily influenced by the absence of time zone information in the timestamps.

In [None]:
# Weekly transaction count and total USD amount, drawn as two stacked line charts.
# NOTE: this writes a `year_week` column onto the shared DATAFRAMES_CLEAN frame.
trans_df = DATAFRAMES_CLEAN['df_trans']

# Weeks are labelled with the ISO 8601 week-based year and week number
# (%G / %V) rather than '%Y-W%W'. %W restarts at 00 on 1 January while %Y
# switches to the new calendar year, so with '%Y-W%W' the single week that
# spans New Year is split into two partial buckets ('<year>-W52/53' and
# '<year+1>-W00'), which shows up as artificial dips in both weekly series.
# The ISO label keeps all seven days of a week together and, being zero-padded,
# still sorts chronologically as a string.
trans_df['year_week'] = trans_df['created_at'].dt.strftime('%G-W%V')

trans_by_week = (
    trans_df
    .groupby('year_week')
    .agg(
        count=('transaction_id', 'count'),
        total_amount_usd=('amount_USD', 'sum'),
    )
    .reset_index()
)

fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=('Transaction Count by Week',
                    'Total Transaction Amount (USD) by Week'),
)

# Row 1: number of transactions per week.
fig.add_trace(
    go.Scatter(x=trans_by_week['year_week'], y=trans_by_week['count'],
               mode='lines+markers', name='Count'),
    row=1, col=1,
)
# Row 2: summed USD amount per week.
fig.add_trace(
    go.Scatter(x=trans_by_week['year_week'], y=trans_by_week['total_amount_usd'],
               mode='lines+markers', name='Amount'),
    row=2, col=1,
)

fig.update_layout(height=600, showlegend=False)
fig.update_yaxes(title_text='Count', row=1, col=1)
fig.update_yaxes(title_text='Amount (USD)', row=2, col=1)
fig.show()