In [26]:
import pandas as pd

In [27]:
# Load the CSV files, or we can use AQLite3 and make a connection to use tables
orders = pd.read_csv('/Downloads/orders.csv')
users = pd.read_csv('/Downloads/users.csv')
providers = pd.read_csv('/Downloads/providers.csv')

# Display the first few rows of each table to understand their structure
orders_head = orders.head()
users_head = users.head()
providers_head = providers.head()

orders_info = orders.info()
users_info = users.info()
providers_info = providers.info()

orders_head, users_head, providers_head, orders_info, users_info, providers_info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299971 entries, 0 to 299970
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   id          299971 non-null  int64 
 1   createdAt   299971 non-null  object
 2   userId      299971 non-null  int64 
 3   quantity    299971 non-null  int64 
 4   refunded    299971 non-null  int64 
 5   currency    299971 non-null  object
 6   sales       299971 non-null  int64 
 7   providerId  299971 non-null  int64 
 8   rowid       299971 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 20.6+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358366 entries, 0 to 358365
Data columns (total 4 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   id              358366 non-null  int64 
 1   country         358363 non-null  object
 2   registeredDate  358366 non-null  object
 3   rowid           358366 non-null  int64 
dtypes: 

(                    id            createdAt               userId  quantity  \
 0  4648711062057701806  2023-08-31 10:14:49   833181563296211638         2   
 1  1676056141507951956  2023-03-21 17:04:54  7763311846463275691         1   
 2  7745602867536251060  2023-07-19 09:48:28  8919282109171104948         2   
 3  7319989469562109720  2023-08-10 12:29:01  5785370845306063462         1   
 4  8979946097528312402  2022-10-03 09:55:15  8918527236425591239         1   
 
    refunded currency  sales           providerId  rowid  
 0         0      eur   1000  3518867990385707647      1  
 1         0      eur    400  6413422964860176913      2  
 2         0      eur    680   123356649204044788      3  
 3         0      eur   1099  7268869293921836511      4  
 4         0      eur    200  7530970657789428790      5  ,
                     id country       registeredDate  rowid
 0  6244948894430711520      FI  2015-11-24 16:31:10      1
 1  7333245412641704133      FI  2015-11-24 16:31

In [28]:
# Merging Orders with Users on userId
merged_data = pd.merge(orders, users, left_on='userId', right_on='id', suffixes=('_order', '_user'))

# Merging the result with Providers on providerId
presentation_table = pd.merge(merged_data, providers, left_on='providerId', right_on='id', suffixes=('_user', '_provider'))

# Drop duplicate or unnecessary columns for clarity
presentation_table = presentation_table.drop(columns=['rowid_order', 'rowid_user','rowid','id_user', 'id', 'registeredDate_provider'])

presentation_table.head()

Unnamed: 0,id_order,createdAt,userId,quantity,refunded,currency,sales,providerId,country_user,registeredDate_user,defaultOfferType,country_provider
0,4648711062057701806,2023-08-31 10:14:49,833181563296211638,2,0,eur,1000,3518867990385707647,FI,2019-06-26 13:15:21,meal,fin
1,1676056141507951956,2023-03-21 17:04:54,7763311846463275691,1,0,eur,400,6413422964860176913,FI,2020-06-10 16:32:22,meal,fin
2,7745602867536251060,2023-07-19 09:48:28,8919282109171104948,2,0,eur,680,123356649204044788,FI,2021-07-08 09:49:25,snack,fin
3,7319989469562109720,2023-08-10 12:29:01,5785370845306063462,1,0,eur,1099,7268869293921836511,FI,2020-01-12 16:14:35,meal,fin
4,8979946097528312402,2022-10-03 09:55:15,8918527236425591239,1,0,eur,200,7530970657789428790,FI,2018-12-21 14:25:17,snack,fin


In [29]:
# - Top 10 Partners by Sales: 
# Group by providerId and sum the sales
top_partners_sales = presentation_table.groupby('providerId')['sales'].sum().reset_index()

# Sort in descending order and select the top 10
top_partners_sales = top_partners_sales.sort_values(by='sales', ascending=False).head(10)

top_partners_sales

Unnamed: 0,providerId,sales
2837,7198110370745783236,10917800
3216,8312310143652755348,7467750
3140,8097235958083241788,2383700
1526,3865474760205653333,2223400
3133,8084884958338058541,1868140
1853,4734853230275691017,1702100
2097,5305286819167536850,1690500
441,1066258454353124935,1568100
2988,7642201963087705313,1472000
1583,4014236829817167297,1457000


In [30]:
# - Customers' Favorite Partner Segments:
# Group by userId and defaultOfferType and count the orders
favorite_segments = presentation_table.groupby(['userId', 'defaultOfferType']).size().reset_index(name='order_count')

# Sort the orders by count in descending order to see favorite segments
favorite_segments = favorite_segments.sort_values(by='order_count', ascending=False)

favorite_segments.head(10)  

Unnamed: 0,userId,defaultOfferType,order_count
6414,402089433377497830,meal,174
7984,497424748271724727,meal,94
91657,5786196591665482994,meal,67
131382,8314107638721370903,meal,67
88207,5569149071209566419,meal,65
6415,402089433377497830,snack,62
94777,5979708096192139903,meal,62
128578,8136037968949497161,meal,60
3488,223228618215452829,meal,57
3873,247330568057287282,meal,56


In [31]:
# M1 retention for any given customer cohort
# Convert 'createdAt' to datetime format
presentation_table['createdAt'] = pd.to_datetime(presentation_table['createdAt'], errors='coerce')

# Extract the year-month of the order date
presentation_table['order_month'] = presentation_table['createdAt'].dt.to_period('M')

# Find the first order month (M0) for each user
first_order_month = presentation_table.groupby('userId')['order_month'].min().reset_index()
first_order_month.columns = ['userId', 'first_order_month']

# Merge the first order month into the presentation table
presentation_table = presentation_table.merge(first_order_month, on='userId')

# To calculate M1, we check if the user made a purchase in the month after their first order 
def is_retained(df):
    first_month = df['first_order_month'].iloc[0]
    next_month = first_month + 1  # M1 is the next month
    return int(any(df['order_month'] == next_month))

# Apply retention calculation per cohort (M0)
m1_retention = presentation_table.groupby('first_order_month').apply(is_retained).reset_index()
m1_retention.columns = ['first_order_month', 'M1_retention']

# Calculate the total number of customers in each cohort (M0)
total_customers = presentation_table.groupby('first_order_month')['userId'].nunique().reset_index()
total_customers.columns = ['first_order_month', 'total_customers']

# Merge M1 retention and total customers data
retention_data = pd.merge(m1_retention, total_customers, on='first_order_month')

# Calculate the M1 retention rate as a percentage
retention_data['M1_retention_rate'] = retention_data['M1_retention'] / retention_data['total_customers'] * 100

retention_data

  m1_retention = presentation_table.groupby('first_order_month').apply(is_retained).reset_index()


Unnamed: 0,first_order_month,M1_retention,total_customers,M1_retention_rate
0,2022-09,1,4134,0.02419
1,2022-10,1,15292,0.006539
2,2022-11,1,12559,0.007962
3,2022-12,1,8382,0.01193
4,2023-01,1,9153,0.010925
5,2023-02,1,8884,0.011256
6,2023-03,1,9585,0.010433
7,2023-04,1,8276,0.012083
8,2023-05,1,8130,0.0123
9,2023-06,1,6698,0.01493


In [32]:
#Problem #2: Customer Lifetime Value (CLV) Calculation
import pandas as pd

#CLV is a crucial metric for understanding how much value a customer brings to the platform throughout their relationship with the company. 

# Create a 'totalPrice' column 
presentation_table['totalPrice'] = presentation_table['sales']* presentation_table['quantity']

# Verify the new column
print(presentation_table[['sales', 'totalPrice']].head())


# 1: Average Order Value(VOA): Calculate the Total Revenue and Average Order Value 
total_revenue = presentation_table['totalPrice'].sum()
total_orders = presentation_table['id_order'].nunique()

# Average Order Value (AOV)
AOV = total_revenue / total_orders

print(f"Total Revenue: {total_revenue}")
print(f"Total Orders: {total_orders}")
print(f"Average Order Value (AOV): {AOV}")

# 2: Calculate Purchase Frequency (PF)
unique_customers = presentation_table['userId'].nunique()
PF = total_orders / unique_customers

print(f"Unique Customers: {unique_customers}")
print(f"Purchase Frequency (PF): {PF}")

# 3: Calculate Customer Lifespan
presentation_table['createdAt'] = pd.to_datetime(presentation_table['createdAt'], errors='coerce')

# Calculate the first and last order dates for each customer
customer_lifespan = presentation_table.groupby('userId').agg(
    first_order_date=pd.NamedAgg(column='createdAt', aggfunc='min'),
    last_order_date=pd.NamedAgg(column='createdAt', aggfunc='max')
)

# Calculate the lifespan in days for each customer
customer_lifespan['lifespan_days'] = (customer_lifespan['last_order_date'] - customer_lifespan['first_order_date']).dt.days

# Calculate the average customer lifespan in months
average_lifespan_months = customer_lifespan['lifespan_days'].mean() / 30

print(f"Average Customer Lifespan (in months): {average_lifespan_months}")

# 4: Calculate Customer Lifetime Value (CLV)
CLV = AOV * PF * average_lifespan_months

print(f"Customer Lifetime Value (CLV): {CLV}")


   sales  totalPrice
0   1000        2000
1    400         400
2    680        1360
3   1099        1099
4    200         200
Total Revenue: 425381865
Total Orders: 299971
Average Order Value (AOV): 1418.0766307409717
Unique Customers: 123208
Purchase Frequency (PF): 2.43467144990585
Average Customer Lifespan (in months): 2.5657503300651476
Customer Lifetime Value (CLV): 8858.383063822788
