# Problem statement: 
> Determine the customer lifetime value of the top users assuming they haven’t made any prior purchases

## 1. Imports

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

## 2. Read the dataset

In [2]:
# Load the raw purchase log. The 'timestamp' column is expected to hold text
# in 'YYYY-MM-DD HH:MM:SS.ffffff' form (see the format string below).
purchase_history = pd.read_csv('dataset.csv')

# Parse the whole column in one vectorised call instead of running
# datetime.strptime once per row, then shift every value by 5 h 30 min.
# NOTE(review): the +05:30 shift looks like a UTC -> IST conversion — confirm.
purchase_history['timestamp'] = (
    pd.to_datetime(purchase_history['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')
    + timedelta(hours=5, minutes=30)
)

## 3. Define CLV

> Customer lifetime value (CLV) is a prediction of the net profit attributed to the entire future relationship with a customer. It represents the monetary value of the revenue that a customer will generate over the entire course of their relationship with a company.

> CLV is an important metric because it can help a business understand the long-term value of its customer base and make informed decisions about acquisition, retention, and investment strategies. A customer with a high CLV is generally considered to be more valuable than a customer with a low CLV, because they are more likely to generate more revenue over time.

> There are several methods to calculate CLV. The most common are:
> - Historical method: uses past data to calculate CLV by multiplying the average purchase value by the number of purchases per year and by the average customer lifespan.
> - Predictive method: uses customer data and statistical models to predict future CLV, through techniques such as survival analysis, gradient boosting, etc.
> - Cohort analysis: groups customers based on when they first became customers and analyses how they behave over time.

> In this assignment we will use the first method. The other approaches could also be explored, but I am skipping them because of the assignment's timeline.

> The most basic formula for calculating customer lifetime value (CLV) using the first approach is:

> CLV = (average purchase value) x (number of purchases per year) x (average customer lifespan)

> In terms of churn rate we can use:

> CLV = (average revenue per customer per period) x (number of purchases in customer lifetime) / (1 + customer churn rate)



## 4. Get the top users

In [145]:
# There are multiple ways to pick the top users.
# Here, let's rank users by their average purchase value (AOV)
# and take the top 10 percent of users by AOV.

In [146]:
# Collapse the purchase log to one row per user: total spend, average
# order value (AOV) and number of orders.
purchase_history_agg = (
    purchase_history
    .groupby('user_id')['purchase_value']
    .agg(total_value='sum', aov='mean', n_order='count')
    .reset_index()
)

In [147]:
# Users whose AOV is at or above the 90th percentile form the "top" segment.
aov_cutoff = purchase_history_agg['aov'].quantile(0.9)
is_top_user = purchase_history_agg['aov'] >= aov_cutoff
top10per_aov_user_df = purchase_history_agg[is_top_user]
top10per_aov_user_df.head(3)

Unnamed: 0,user_id,total_value,aov,n_order
24,0043a4ae6a8ba51114ea2a84784e35aa092ecc04ef2239...,1179.0,589.5,2
33,005a7d7f5b9f8c3418d2400e9f1c444e463c13eee6491a...,2655.0,1327.5,2
45,0077ecd6eb5a9240d81f9a1ca28878b333a971dc486066...,1828.0,609.333333,3


## 5. Predict CLV

In [148]:
# Estimate the expected value of future purchases for each user
# Rate used to discount future cash flows
discount_rate = 0.1

# Each top user's AOV stands in for the expected value of a future purchase
expected_values = top10per_aov_user_df['aov']

# Per-user probabilities from the churn file; keep only the join key and
# the probability column
churn_columns = ['user_id', 'prob']
churn_probab = pd.read_csv('churn_probability.csv')[churn_columns]

In [149]:
# Attach each top user's probability. The left join keeps users that have
# no row in the churn file (their 'prob' becomes NaN).
data = pd.merge(top10per_aov_user_df, churn_probab, how='left', on='user_id')
data.head(2)

Unnamed: 0,user_id,total_value,aov,n_order,prob
0,0043a4ae6a8ba51114ea2a84784e35aa092ecc04ef2239...,1179.0,589.5,2,1.595512e-08
1,005a7d7f5b9f8c3418d2400e9f1c444e463c13eee6491a...,2655.0,1327.5,2,0.9999765


### 5.1. Overall CLV 
> Direct method to get a rough estimate 

In [150]:
# Rough CLV: historical spend (aov * n_order) scaled by
# 1 / (1 + discount_rate - prob).
# NOTE(review): 'prob' is loaded from churn_probability.csv. If it is a churn
# probability, subtracting it here gives the largest multiplier to the users
# most likely to churn (prob ~ 1 yields roughly 10x spend). A retention-based
# denominator would be (discount_rate + prob) — confirm which is intended.
historical_spend = data['aov'] * data['n_order']
clv_value = historical_spend / (1 + discount_rate - data['prob'])

clv_direct_df = pd.DataFrame({
    'user_id': data['user_id'].values,
    'CLV': clv_value.values,
})
clv_direct_df.head(3)

Unnamed: 0,user_id,CLV
0,0043a4ae6a8ba51114ea2a84784e35aa092ecc04ef2239...,1071.818197
1,005a7d7f5b9f8c3418d2400e9f1c444e463c13eee6491a...,26543.757706
2,0077ecd6eb5a9240d81f9a1ca28878b333a971dc486066...,18279.371478


### 5.2. Using exponential discounting decay factor (discounted cash flow (DCF))

In [163]:
# Calculate the CLV for each user
clv = {}

for idx, val in data.iterrows():
    user_id = val.user_id
    total_value = val.total_value
    aov = val.aov
    n_order = val.n_order
    prob = val.prob
    
    # Calculate the present value of future purchases
    present_value = 0
    
    for i in range(1, 11):
        future_value = aov / ((1 + discount_rate)**i)
        present_value += future_value * prob**i * (1 - prob)**(10-i)
        
    # Calculate the CLV
    clv[user_id] = total_value + present_value

In [None]:
# future_value = expected_values[user_id] / ((1 + discount_rate)**i)

# Where expected_values holds the expected purchase value (the AOV) for each user,
# discount_rate is a predefined rate used to discount future cash flows,
# and i is the number of years in the future.

# The loop then accumulates the present value of that user's future purchases:
#  present_value += future_value * probabilities[user_id]**i * (1 - probabilities[user_id])**(10-i)

In [None]:
# Note: the discounted approach above is only sketched and explained here, since we don't have enough data to apply it properly.
# It can be run on this data as well, but that would be overkill because there isn't enough data to generalise the results.