In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_absolute_error, mean_squared_error

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from constants import DATAFRAMES_CLEAN

# FTD Analysis

Goal: Calculate First Time Deposit (FTD) metrics at the **client level** - the first deposit a client made across all of their logins.

In [None]:
# Attach trader (by `login`) and client (by `client`) attributes to every
# transaction. Both joins use pandas' default inner merge, so transactions
# without a matching trader or client row are dropped.
transactions = DATAFRAMES_CLEAN['df_trans']
traders = DATAFRAMES_CLEAN['df_trader']
clients = DATAFRAMES_CLEAN['df_clients']

trans_with_trader = pd.merge(transactions, traders, on='login')
df = pd.merge(trans_with_trader, clients, on='client')

n_transactions = len(df)
n_unique_clients = df['client'].nunique()
display(f'Total transactions: {n_transactions}')
display(f'Unique clients: {n_unique_clients}')

## 1. Core FTD Calculation

In [None]:
# For each client, find the index label of the row with the earliest
# `created_at` and keep that row as the client's FTD record. `.copy()`
# detaches the result so later column assignments never touch `df`.
# NOTE(review): this picks the earliest transaction of any kind - confirm that
# `df_trans` contains deposits only.
earliest_row_labels = df.groupby('client')['created_at'].idxmin()
df_ftd = df.loc[earliest_row_labels].copy()

preview_columns = [
    'client', 'login', 'transaction_id', 'first_deposit_id',
    'created_at', 'amount_USD', 'type',
]
display(f'Total FTDs: {len(df_ftd)}')
display(df_ftd[preview_columns].head(10))

# Sanity check: the derived FTD row should carry the same id as the
# `first_deposit_id` column. Rows with a missing `first_deposit_id` compare
# as unequal here.
ids_equal = df_ftd['transaction_id'].values == df_ftd['first_deposit_id'].values
ids_match = all(ids_equal)
display(f'Do all transaction_ids match first_deposit_ids? {ids_match}')

In [None]:
# Show a sample of the FTD rows whose `transaction_id` differs from the
# recorded `first_deposit_id`.
id_mismatch_mask = df_ftd['first_deposit_id'].ne(df_ftd['transaction_id'].values)
mismatching_rows = df_ftd.loc[id_mismatch_mask]
display(mismatching_rows.head())

NOTE: The only rows where `transaction_id` and `first_deposit_id` do not match belong to clients whose `first_deposit_id` is missing. These rows still have a valid `transaction_id`, so we copy it into `first_deposit_id`.

In [None]:
# Backfill missing `first_deposit_id` values with the id of the FTD row
# derived above; existing values are left untouched.
fallback_ids = df_ftd['transaction_id']
df_ftd['first_deposit_id'] = df_ftd['first_deposit_id'].fillna(value=fallback_ids)

In [None]:
# Re-run the id consistency check now that missing `first_deposit_id` values
# have been filled in.
ids_equal_after = df_ftd['transaction_id'].values == df_ftd['first_deposit_id'].values
ids_match_after = all(ids_equal_after)
display(f'Do all transaction_ids match first_deposit_ids? {ids_match_after}')

In [None]:
# Descriptive statistics of the FTD amounts, followed by the three headline
# figures (total, mean, median) formatted to two decimals.
ftd_amounts = df_ftd['amount_USD']

display('FTD Amount (USD) - Statistics:')
display(ftd_amounts.describe())

headline_figures = [
    ('Total FTD Volume', ftd_amounts.sum()),
    ('Average FTD', ftd_amounts.mean()),
    ('Median FTD', ftd_amounts.median()),
]
for figure_label, figure_value in headline_figures:
    display(f'{figure_label}: ${figure_value:.2f}')

## 2. FTD Over Time

In [None]:
# Aggregate FTDs per calendar day: number of FTD rows and summed USD volume.
df_ftd['ftd_date'] = df_ftd['created_at'].dt.date
daily_ftd = df_ftd.groupby('ftd_date').agg(
    count=('client', 'count'),
    volume=('amount_USD', 'sum')
)

# A plain groupby only yields dates that had at least one FTD. The moving
# averages and the forecast further down treat consecutive rows as consecutive
# days, so days without any FTD must be present as explicit zeros; otherwise
# they are silently skipped and the averages are biased upward.
daily_ftd.index = pd.to_datetime(daily_ftd.index)
daily_ftd = (
    daily_ftd
    .asfreq('D', fill_value=0)  # insert the missing calendar days with 0 count / 0 volume
    .rename_axis('ftd_date')
    .reset_index()
)

# Two stacked panels sharing the same date axis values: count on top, volume below.
fig = make_subplots(rows=2, cols=1, subplot_titles=('Daily FTD Count', 'Daily FTD Volume (USD)'))

fig.add_trace(go.Scatter(x=daily_ftd['ftd_date'], y=daily_ftd['count'],
                         mode='lines+markers', name='Count'), row=1, col=1)
fig.add_trace(go.Scatter(x=daily_ftd['ftd_date'], y=daily_ftd['volume'],
                         mode='lines+markers', name='Volume'), row=2, col=1)

fig.update_layout(height=600, showlegend=False)
fig.update_yaxes(title_text='Count', row=1, col=1)
fig.update_yaxes(title_text='Volume (USD)', row=2, col=1)
fig.show()

In [None]:
# daily_ftd.to_csv('./../../data/report/daily_ftd.csv')

In [None]:
# Label every FTD with its weekly period (as a string) and aggregate per week:
# number of FTD rows, summed USD volume and mean USD amount.
week_labels = df_ftd['created_at'].dt.to_period('W').astype(str)
df_ftd['ftd_week'] = week_labels

weekly_aggregations = {
    'count': ('client', 'count'),
    'volume': ('amount_USD', 'sum'),
    'avg_amount': ('amount_USD', 'mean'),
}
weekly_ftd = df_ftd.groupby('ftd_week').agg(**weekly_aggregations).reset_index()

weekly_axis_labels = {'ftd_week': 'Week', 'count': 'FTD Count'}
fig = px.bar(
    weekly_ftd,
    x='ftd_week',
    y='count',
    title='Weekly FTD Count',
    labels=weekly_axis_labels,
)
fig.show()

In [None]:
# Label every FTD with its monthly period (as a string), then summarise per
# month using only the 'Full' and 'Light' registration types.
# NOTE(review): the daily and weekly views keep all types - confirm that
# restricting the monthly view to 'Full'/'Light' is intended.
month_labels = df_ftd['created_at'].dt.to_period('M').astype(str)
df_ftd['ftd_month'] = month_labels

is_full_or_light = df_ftd['type'].isin(['Full', 'Light'])
monthly_aggregations = {
    'count': ('client', 'count'),
    'volume': ('amount_USD', 'sum'),
    'avg_amount': ('amount_USD', 'mean'),
}
monthly_ftd = (
    df_ftd.loc[is_full_or_light]
    .groupby('ftd_month')
    .agg(**monthly_aggregations)
    .reset_index()
)

display('Monthly FTD Summary:')
display(monthly_ftd)

In [None]:
# monthly_ftd.to_csv('./../../data/report/monthly_ftd.csv')

## 3. FTD by Registration Type

In [None]:
# One summary row per registration type: number of FTD rows plus total, mean
# and median USD amount.
type_aggregations = {
    'count': ('client', 'count'),
    'total_volume': ('amount_USD', 'sum'),
    'avg_amount': ('amount_USD', 'mean'),
    'median_amount': ('amount_USD', 'median'),
}
grouped_by_type = df_ftd.groupby('type')
ftd_by_type = grouped_by_type.agg(**type_aggregations).reset_index()

display('FTD by Registration Type:')
display(ftd_by_type)

In [None]:
# Left panel: FTD count per registration type (bars).
# Right panel: distribution of FTD amounts per registration type (box plot).
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('FTD Count by Type', 'FTD Amount Distribution<br>by Type'),
)

count_bars = go.Bar(x=ftd_by_type['type'], y=ftd_by_type['count'], name='Count')
amount_boxes = go.Box(x=df_ftd['type'], y=df_ftd['amount_USD'], name='Amount')
fig.add_trace(count_bars, row=1, col=1)
fig.add_trace(amount_boxes, row=1, col=2)

fig.update_layout(height=400, showlegend=False)
for panel_col, axis_title in enumerate(('Count', 'Amount (USD)'), start=1):
    fig.update_yaxes(title_text=axis_title, row=1, col=panel_col)
fig.show()

# Key Metrics Summary

In [None]:
def _pct_of(part, whole):
    """Return `part` as a percentage of `whole`, or 0.0 when `whole` is zero."""
    return part / whole * 100 if whole else 0.0


# Conversion is measured against every client in the cleaned client table,
# not only those that appear in the merged transaction frame.
total_clients = DATAFRAMES_CLEAN['df_clients']['client'].nunique()
clients_with_ftd = df_ftd['client'].nunique()
total_ftds = len(df_ftd)

kpis = {
    'Total Clients': total_clients,
    'Clients with FTD': clients_with_ftd,
    'FTD Conversion Rate': f'{_pct_of(clients_with_ftd, total_clients):.1f}%',
    'Total FTD Volume (USD)': f'${df_ftd["amount_USD"].sum():,.2f}',
    'Average FTD (USD)': f'${df_ftd["amount_USD"].mean():,.2f}',
    'Median FTD (USD)': f'${df_ftd["amount_USD"].median():,.2f}',
}

# One "count (share of all FTDs)" entry per registration type, appended in the
# fixed order Full, Light, Demo.
for reg_type in ('Full', 'Light', 'Demo'):
    ftds_of_type = int((df_ftd['type'] == reg_type).sum())
    kpis[f'FTD by {reg_type} Registration'] = (
        f'{ftds_of_type} ({_pct_of(ftds_of_type, total_ftds):.1f}%)'
    )

display('FTD KEY METRICS')
for key, value in kpis.items():
    display(f'{key}: {value}')

# Time Series FTD Prediction for the Next Month 

In [None]:
# Back-test moving averages on the existing data: the first 80% of the daily
# rows is the training part, the remaining 20% is held out for scoring.
split_idx = int(len(daily_ftd) * 0.8)
train = daily_ftd.iloc[:split_idx].copy()
test = daily_ftd.iloc[split_idx:].copy()

results = []

# Trying several MA windows helps find the optimal granularity.
for window in [3, 5, 7, 14, 21]:
    if window > len(train):
        # A window longer than the training data gives a NaN average, which
        # the sklearn metrics reject - skip it instead of failing the cell.
        continue
    # The forecast is flat: the last full-window average of the training part
    # is repeated for every held-out day.
    ma_value = train['count'].rolling(window).mean().iloc[-1]
    flat_forecast = [ma_value] * len(test)
    mae = mean_absolute_error(test['count'], flat_forecast)
    rmse = np.sqrt(mean_squared_error(test['count'], flat_forecast))
    results.append((f'MA({window})', window, ma_value, mae, rmse))

if not results:
    raise ValueError(
        f'Not enough daily history ({len(train)} training rows) to evaluate any moving-average window'
    )

comparison = pd.DataFrame(results, columns=['Method', 'Window', 'Forecast', 'MAE', 'RMSE'])
display(comparison.sort_values('MAE'))

# Pick the window with the lowest MAE; cast the numpy scalar to a plain int
# before handing it to `rolling` and the labels.
best_window = int(comparison.loc[comparison['MAE'].idxmin(), 'Window'])
daily_ftd['ma_best'] = daily_ftd['count'].rolling(best_window).mean()

fig = go.Figure()
fig.add_trace(go.Scatter(x=daily_ftd['ftd_date'], y=daily_ftd['count'], name='Actual', mode='lines'))
fig.add_trace(go.Scatter(x=daily_ftd['ftd_date'], y=daily_ftd['ma_best'], name=f'MA({best_window})', mode='lines'))
# Title uses the same "MA(n)" notation as the legend entry and the results table.
fig.update_layout(title=f'FTD Count with Best Moving Average (MA({best_window}))',
                  xaxis_title='Date', yaxis_title='Count')
fig.show()

In [None]:
 # fit MA(5) - using last 5 values to forecast each day
# Forecast settings. Every label below is derived from these two values so the
# text cannot drift away from the numbers actually used.
# NOTE(review): the back-test cell computes `best_window`, but this forecast
# uses a fixed window - confirm that is intended.
window = 5
horizon_days = 30
ma_label = f'MA({window})'

# Seed the rolling history with the last `window` observed values.
history_count = list(daily_ftd['count'].tail(window))
history_volume = list(daily_ftd['volume'].tail(window))

forecast_count = []
forecast_volume = []

# Recursive forecast: each predicted day is the mean of the previous `window`
# values, and the prediction is appended to the history so it feeds into the
# following days.
for _ in range(horizon_days):
    pred_count = np.mean(history_count[-window:])
    pred_volume = np.mean(history_volume[-window:])

    forecast_count.append(pred_count)
    forecast_volume.append(pred_volume)

    history_count.append(pred_count)
    history_volume.append(pred_volume)

# Forecast dates start on the day after the last observed date.
last_date = pd.to_datetime(daily_ftd['ftd_date'].max())
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=horizon_days)

forecast_df = pd.DataFrame({
    'ftd_date': future_dates,
    'count': forecast_count,
    'volume': forecast_volume
})

display(f'{ma_label} Daily Forecast:')
display(forecast_df)
# No leading newline here: display() shows the repr of a str, so an embedded
# "\n" would be rendered as a literal backslash-n.
display(f'Total Count: {sum(forecast_count):.0f}')
display(f'Total Volume: ${sum(forecast_volume):,.2f}')

# Visualize: actual history (solid) and forecast (dashed) for count and volume.
fig = make_subplots(rows=2, cols=1, subplot_titles=('FTD Count Forecast', 'FTD Volume Forecast'))

fig.add_trace(go.Scatter(x=daily_ftd['ftd_date'], y=daily_ftd['count'], name='Actual', mode='lines'), row=1, col=1)
fig.add_trace(go.Scatter(x=future_dates, y=forecast_count, name='Forecast', mode='lines', line=dict(dash='dash')), row=1, col=1)

fig.add_trace(go.Scatter(x=daily_ftd['ftd_date'], y=daily_ftd['volume'], name='Actual', mode='lines'), row=2, col=1)
fig.add_trace(go.Scatter(x=future_dates, y=forecast_volume, name='Forecast', mode='lines', line=dict(dash='dash')), row=2, col=1)

fig.update_layout(height=600, title=f'{horizon_days}-Day Rolling {ma_label} Forecast')
fig.update_yaxes(title_text='Count', row=1, col=1)
fig.update_yaxes(title_text='Volume (USD)', row=2, col=1)
fig.show()

In [None]:
# forecast_df.to_csv('./../../data/report/forecast_daily_ftd.csv')