In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
from itertools import product
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from math import sqrt

import warnings
warnings.filterwarnings("ignore")


In [2]:
def get_date_int(df, column):
   year = df[column].dt.year
   month = df[column].dt.month
   return year, month

In [3]:
df = pd.read_csv('df_actuals.csv',parse_dates=[0,1])
print(f"Shape: {df.shape}\n\ndtypes:\n{df.dtypes}\n")
df.head()

Shape: (3654, 18)

dtypes:
Date                             datetime64[ns]
Intake Month                     datetime64[ns]
Country                                  object
Product                                  object
Net Customers                             int64
months_since_acquisition                  int64
churn_customers                         float64
Date_month                                int64
Date_year                                 int64
Cohort_month                              int64
Cohort_year                               int64
Cohort_size                               int64
Cohort_name                              object
retention_rate                          float64
diff_retention_rate                     float64
retention_rate_last                     float64
months_since_acquisition_last             int64
Type                                     object
dtype: object



Unnamed: 0,Date,Intake Month,Country,Product,Net Customers,months_since_acquisition,churn_customers,Date_month,Date_year,Cohort_month,Cohort_year,Cohort_size,Cohort_name,retention_rate,diff_retention_rate,retention_rate_last,months_since_acquisition_last,Type
0,2019-01-31,2019-01-31,US,A,110,0,0.0,1,2019,1,2019,110,Jan-19,100.0,0.0,51.818182,27,actual
1,2019-02-28,2019-01-31,US,A,88,1,22.0,2,2019,1,2019,110,Jan-19,80.0,-20.0,51.818182,27,actual
2,2019-03-31,2019-01-31,US,A,83,2,5.0,3,2019,1,2019,110,Jan-19,75.454545,-4.545455,51.818182,27,actual
3,2019-04-30,2019-01-31,US,A,77,3,6.0,4,2019,1,2019,110,Jan-19,70.0,-5.454545,51.818182,27,actual
4,2019-05-31,2019-01-31,US,A,75,4,2.0,5,2019,1,2019,110,Jan-19,68.181818,-1.818182,51.818182,27,actual


### Baseline: train in 2019 cohorts and predict the rest

In [4]:
##Only 2019
# date_split_past_train_valid = '2019-12-31'

date_split_past_train_valid = '2020-04-30'


df_train = df.loc[df['Intake Month']<=date_split_past_train_valid]
df_valid = df.loc[df['Intake Month']>date_split_past_train_valid]

df_train.shape

(2952, 18)

In [5]:
df_train.head(10)

Unnamed: 0,Date,Intake Month,Country,Product,Net Customers,months_since_acquisition,churn_customers,Date_month,Date_year,Cohort_month,Cohort_year,Cohort_size,Cohort_name,retention_rate,diff_retention_rate,retention_rate_last,months_since_acquisition_last,Type
0,2019-01-31,2019-01-31,US,A,110,0,0.0,1,2019,1,2019,110,Jan-19,100.0,0.0,51.818182,27,actual
1,2019-02-28,2019-01-31,US,A,88,1,22.0,2,2019,1,2019,110,Jan-19,80.0,-20.0,51.818182,27,actual
2,2019-03-31,2019-01-31,US,A,83,2,5.0,3,2019,1,2019,110,Jan-19,75.454545,-4.545455,51.818182,27,actual
3,2019-04-30,2019-01-31,US,A,77,3,6.0,4,2019,1,2019,110,Jan-19,70.0,-5.454545,51.818182,27,actual
4,2019-05-31,2019-01-31,US,A,75,4,2.0,5,2019,1,2019,110,Jan-19,68.181818,-1.818182,51.818182,27,actual
5,2019-06-30,2019-01-31,US,A,72,5,3.0,6,2019,1,2019,110,Jan-19,65.454545,-2.727273,51.818182,27,actual
6,2019-07-31,2019-01-31,US,A,71,6,1.0,7,2019,1,2019,110,Jan-19,64.545455,-0.909091,51.818182,27,actual
7,2019-08-31,2019-01-31,US,A,70,7,1.0,8,2019,1,2019,110,Jan-19,63.636364,-0.909091,51.818182,27,actual
8,2019-09-30,2019-01-31,US,A,69,8,1.0,9,2019,1,2019,110,Jan-19,62.727273,-0.909091,51.818182,27,actual
9,2019-10-31,2019-01-31,US,A,69,9,0.0,10,2019,1,2019,110,Jan-19,62.727273,0.0,51.818182,27,actual


In [6]:
min_age_for_training = 6

mask = (df_train['months_since_acquisition']>=min_age_for_training)

##retention rate
df_train_parameters = df_train.loc[mask].copy()
df_train_parameters['retention_rate_6m'] = df_train_parameters.groupby(['Product','Country','Intake Month'])['retention_rate'].transform('max')
df_train_parameters = df_train_parameters[['Product','Country','Intake Month','retention_rate_6m','retention_rate_last','months_since_acquisition_last']].dropna().drop_duplicates()
df_train_parameters['delta_retention_last_to_6m'] = df_train_parameters['retention_rate_last'] - df_train_parameters['retention_rate_6m'] 
df_train_parameters['months_since_6m'] = df_train_parameters['months_since_acquisition_last'] - min_age_for_training
df_train_parameters['retention_slope'] = df_train_parameters['delta_retention_last_to_6m'] / df_train_parameters['months_since_6m']
df_train_parameters = df_train_parameters.reset_index(drop=True)
df_train_parameters.head(4)

Unnamed: 0,Product,Country,Intake Month,retention_rate_6m,retention_rate_last,months_since_acquisition_last,delta_retention_last_to_6m,months_since_6m,retention_slope
0,A,US,2019-01-31,64.545455,51.818182,27,-12.727273,21,-0.606061
1,A,US,2019-02-28,63.953488,52.325581,26,-11.627907,20,-0.581395
2,A,US,2019-03-31,60.869565,50.0,25,-10.869565,19,-0.572082
3,A,US,2019-04-30,58.40708,48.672566,24,-9.734513,18,-0.540806


### dates to forecast:

In [7]:
start_period = "2021-05-31"
end_period = pd.to_datetime(start_period) + pd.DateOffset(months=23)

forecast_dates = pd.date_range(start=start_period, end=end_period, freq='M')
# forecast_dates = pd.Series(date_range)

print(forecast_dates)

DatetimeIndex(['2021-05-31', '2021-06-30', '2021-07-31', '2021-08-31',
               '2021-09-30', '2021-10-31', '2021-11-30', '2021-12-31',
               '2022-01-31', '2022-02-28', '2022-03-31', '2022-04-30',
               '2022-05-31', '2022-06-30', '2022-07-31', '2022-08-31',
               '2022-09-30', '2022-10-31', '2022-11-30', '2022-12-31',
               '2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30'],
              dtype='datetime64[ns]', freq='M')


In [8]:
##Create forecast template
# cartesian_product_intake = list(product(forecast_dates,products, countries))
# df_forecast_intake = pd.DataFrame(cartesian_product_intake, columns=['Date', 'Product', 'Country'])
# df_forecast_intake['Intake'] = 0
# df_forecast_intake.to_csv('df_forecast_intake.csv',index=False)

# Import forecast
##Note: File 'saved_forecast_intake.csv' was generated directly in Excel by eye-balling the actuals in 2019 vs 2020
df_forecast_intake = pd.read_csv('saved_forecast_intake.csv',usecols=['Date','Country','Product','Intake','Type'],parse_dates=[0])
print(f"shape df forecast: {df_forecast_intake.shape}")
df_forecast_intake.head()

shape df forecast: (216, 5)


Unnamed: 0,Date,Country,Product,Intake,Type
0,2021-05-31,CA,A,46,Forecast
1,2021-05-31,GB,A,134,Forecast
2,2021-05-31,US,A,148,Forecast
3,2021-05-31,CA,B,33,Forecast
4,2021-05-31,GB,B,143,Forecast


In [9]:
intake_months_old_cohorts = df['Intake Month'].unique().tolist()
intake_months_new_cohorts = df_forecast_intake['Date'].unique().tolist()
intake_months = intake_months_old_cohorts + intake_months_new_cohorts

products = df['Product'].unique()
countries = df['Country'].unique()

# Create a Cartesian product of the lists
cartesian_product_net_customers = list(product(forecast_dates, intake_months, products, countries))

# Create a DataFrame from the Cartesian product
df_forecast_net_customers = pd.DataFrame(cartesian_product_net_customers, columns=['Date', 'Intake Month', 'Product', 'Country'])

df_forecast_net_customers = df_forecast_net_customers[df_forecast_net_customers['Intake Month']<=df_forecast_net_customers['Date']].reset_index(drop=True) #remove the lines where 'Intake Month' > 'Date'
df_forecast_net_customers

Unnamed: 0,Date,Intake Month,Product,Country
0,2021-05-31,2019-01-31,A,US
1,2021-05-31,2019-01-31,A,CA
2,2021-05-31,2019-01-31,A,GB
3,2021-05-31,2019-01-31,B,US
4,2021-05-31,2019-01-31,B,CA
...,...,...,...,...
8743,2023-04-30,2023-04-30,B,CA
8744,2023-04-30,2023-04-30,B,GB
8745,2023-04-30,2023-04-30,C,US
8746,2023-04-30,2023-04-30,C,CA


In [10]:
date_split_past_future = '2021-04-30'

df_forecast_net_customers['Cohort_type'] = 'old'
df_forecast_net_customers.loc[df_forecast_net_customers['Intake Month']>date_split_past_future,'Cohort_type'] = 'new'
df_forecast_net_customers['Type'] = 'forecast'

df_forecast_net_customers

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type
0,2021-05-31,2019-01-31,A,US,old,forecast
1,2021-05-31,2019-01-31,A,CA,old,forecast
2,2021-05-31,2019-01-31,A,GB,old,forecast
3,2021-05-31,2019-01-31,B,US,old,forecast
4,2021-05-31,2019-01-31,B,CA,old,forecast
...,...,...,...,...,...,...
8743,2023-04-30,2023-04-30,B,CA,new,forecast
8744,2023-04-30,2023-04-30,B,GB,new,forecast
8745,2023-04-30,2023-04-30,C,US,new,forecast
8746,2023-04-30,2023-04-30,C,CA,new,forecast


In [11]:
# df_forecast_net_customers[df_forecast_net_customers['Cohort_size'].isnull()]

In [12]:
date_year, date_month = get_date_int(df_forecast_net_customers, 'Date')

cohort_year, cohort_month = get_date_int(df_forecast_net_customers, 'Intake Month')

# Calculate difference in years
years_diff = date_year - cohort_year

# Calculate difference in months
months_diff = date_month - cohort_month

# Extract the difference in months from all previous values

df_forecast_net_customers['months_since_acquisition'] = years_diff * 12 + months_diff
df_forecast_net_customers.head()

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition
0,2021-05-31,2019-01-31,A,US,old,forecast,28
1,2021-05-31,2019-01-31,A,CA,old,forecast,28
2,2021-05-31,2019-01-31,A,GB,old,forecast,28
3,2021-05-31,2019-01-31,B,US,old,forecast,28
4,2021-05-31,2019-01-31,B,CA,old,forecast,28


In [13]:
date_split_past_train_valid = '2019-12-31'


df_forecast_train = df_forecast_net_customers.loc[df_forecast_net_customers['Intake Month']<=date_split_past_train_valid]
df_forecast_valid = df_forecast_net_customers.loc[(df_forecast_net_customers['Intake Month']>date_split_past_train_valid)&(df_forecast_net_customers['Cohort_type']=='old')]
df_forecast_test = df_forecast_net_customers[df_forecast_net_customers['Cohort_type']=='new']

df_forecast_train

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition
0,2021-05-31,2019-01-31,A,US,old,forecast,28
1,2021-05-31,2019-01-31,A,CA,old,forecast,28
2,2021-05-31,2019-01-31,A,GB,old,forecast,28
3,2021-05-31,2019-01-31,B,US,old,forecast,28
4,2021-05-31,2019-01-31,B,CA,old,forecast,28
...,...,...,...,...,...,...,...
8383,2023-04-30,2019-12-31,B,CA,old,forecast,40
8384,2023-04-30,2019-12-31,B,GB,old,forecast,40
8385,2023-04-30,2019-12-31,C,US,old,forecast,40
8386,2023-04-30,2019-12-31,C,CA,old,forecast,40


##### Get Cohort size for the Forecast parts

In [14]:
df_forecast_train = df_forecast_train.merge(df[['Product','Country','Intake Month','Cohort_size']].drop_duplicates(),how='left',on=['Product','Country','Intake Month'])
df_forecast_valid = df_forecast_valid.merge(df[['Product','Country','Intake Month','Cohort_size']].drop_duplicates(),how='left',on=['Product','Country','Intake Month'])

df_forecast_train.head()

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size
0,2021-05-31,2019-01-31,A,US,old,forecast,28,110
1,2021-05-31,2019-01-31,A,CA,old,forecast,28,17
2,2021-05-31,2019-01-31,A,GB,old,forecast,28,96
3,2021-05-31,2019-01-31,B,US,old,forecast,28,0
4,2021-05-31,2019-01-31,B,CA,old,forecast,28,0


In [15]:
df_forecast_test = df_forecast_test.merge(df_forecast_intake,
                        how='left',
                        left_on=['Product','Country','Intake Month'],
                        right_on=['Product','Country','Date']
).drop(['Date_y','Type_y'],axis=1).rename(columns={'Date_x':'Date','Intake':'Cohort_size','Type_x':'Type'})
df_forecast_test.head()

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size
0,2021-05-31,2021-05-31,A,US,new,forecast,0,148
1,2021-05-31,2021-05-31,A,CA,new,forecast,0,46
2,2021-05-31,2021-05-31,A,GB,new,forecast,0,134
3,2021-05-31,2021-05-31,B,US,new,forecast,0,228
4,2021-05-31,2021-05-31,B,CA,new,forecast,0,33


In [16]:
# mask = (df_train['Product']=='A') & (df_train['Country']=='CA') & (df_train['Intake Month'].isin(['2019-01-31','2020-01-31','2022-07-31']))
# tt = df_train.loc[mask].copy()
# tt = df_train.copy()

df_forecast_train = df_forecast_train.sort_values(by=['Product','Country','Intake Month','Date']).reset_index(drop=True)
df_forecast_valid= df_forecast_valid.sort_values(by=['Product','Country','Intake Month','Date']).reset_index(drop=True)

df_forecast_train

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size
0,2021-05-31,2019-01-31,A,CA,old,forecast,28,17
1,2021-06-30,2019-01-31,A,CA,old,forecast,29,17
2,2021-07-31,2019-01-31,A,CA,old,forecast,30,17
3,2021-08-31,2019-01-31,A,CA,old,forecast,31,17
4,2021-09-30,2019-01-31,A,CA,old,forecast,32,17
...,...,...,...,...,...,...,...,...
2587,2022-12-31,2019-12-31,C,US,old,forecast,36,56
2588,2023-01-31,2019-12-31,C,US,old,forecast,37,56
2589,2023-02-28,2019-12-31,C,US,old,forecast,38,56
2590,2023-03-31,2019-12-31,C,US,old,forecast,39,56


In [17]:
df_train_parameters[['Product','Country','Intake Month','months_since_acquisition_last','retention_rate_last','retention_slope']]

Unnamed: 0,Product,Country,Intake Month,months_since_acquisition_last,retention_rate_last,retention_slope
0,A,US,2019-01-31,27,51.818182,-0.606061
1,A,US,2019-02-28,26,52.325581,-0.581395
2,A,US,2019-03-31,25,50.000000,-0.572082
3,A,US,2019-04-30,24,48.672566,-0.540806
4,A,US,2019-05-31,23,51.612903,-0.569260
...,...,...,...,...,...,...
103,C,GB,2019-12-31,16,59.183673,-0.612245
104,C,GB,2020-01-31,15,60.000000,-0.666667
105,C,GB,2020-02-29,14,62.000000,-0.750000
106,C,GB,2020-03-31,13,56.862745,-0.560224


In [18]:
# df_forecast_train.head(3)

### Forecasting part: forecast train first

In [19]:
df_forecast_train = df_forecast_train.merge(df_train_parameters[['Product','Country','Intake Month','months_since_acquisition_last','retention_rate_last','retention_slope']],how='left',on=['Product','Country','Intake Month'])
df_forecast_train['delta_months_for_forecast'] = df_forecast_train['months_since_acquisition'] - df_forecast_train['months_since_acquisition_last']
df_forecast_train['forecast_retention_rates'] = df_forecast_train['retention_rate_last'] + df_forecast_train['delta_months_for_forecast'] * df_forecast_train['retention_slope']
df_forecast_train['forecast_retention_rates'] = df_forecast_train['forecast_retention_rates']

df_forecast_train['forecast_net_customers'] = df_forecast_train['forecast_retention_rates']/100 * df_forecast_train['Cohort_size']
df_forecast_train['forecast_net_customers'] = df_forecast_train['forecast_net_customers'].fillna(0).round()


df_forecast_train.head()

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size,months_since_acquisition_last,retention_rate_last,retention_slope,delta_months_for_forecast,forecast_retention_rates,forecast_net_customers
0,2021-05-31,2019-01-31,A,CA,old,forecast,28,17,27.0,52.941176,-0.560224,1.0,52.380952,9.0
1,2021-06-30,2019-01-31,A,CA,old,forecast,29,17,27.0,52.941176,-0.560224,2.0,51.820728,9.0
2,2021-07-31,2019-01-31,A,CA,old,forecast,30,17,27.0,52.941176,-0.560224,3.0,51.260504,9.0
3,2021-08-31,2019-01-31,A,CA,old,forecast,31,17,27.0,52.941176,-0.560224,4.0,50.70028,9.0
4,2021-09-30,2019-01-31,A,CA,old,forecast,32,17,27.0,52.941176,-0.560224,5.0,50.140056,9.0


### Get retention rates for train actuals and forecasted

In [20]:
df_train_retention_rates_all = pd.concat([df_forecast_train[['Product','Country','Date','Intake Month','months_since_acquisition','forecast_retention_rates','Type']].rename(columns={'forecast_retention_rates':'retention_rate'}),
                               df_train[['Product','Country','Date','Intake Month','months_since_acquisition','retention_rate','Type']]],
                               axis=0)
df_train_retention_rates_all

Unnamed: 0,Product,Country,Date,Intake Month,months_since_acquisition,retention_rate,Type
0,A,CA,2021-05-31,2019-01-31,28,52.380952,forecast
1,A,CA,2021-06-30,2019-01-31,29,51.820728,forecast
2,A,CA,2021-07-31,2019-01-31,30,51.260504,forecast
3,A,CA,2021-08-31,2019-01-31,31,50.700280,forecast
4,A,CA,2021-09-30,2019-01-31,32,50.140056,forecast
...,...,...,...,...,...,...,...
3571,C,GB,2020-12-31,2020-04-30,8,69.230769,actual
3572,C,GB,2021-01-31,2020-04-30,9,67.307692,actual
3573,C,GB,2021-02-28,2020-04-30,10,67.307692,actual
3574,C,GB,2021-03-31,2020-04-30,11,67.307692,actual


In [21]:
df_train_retention_rates_all = df_train_retention_rates_all.sort_values(by=['Product','Country','Intake Month','Date'],ascending=True).reset_index(drop=True)

df_train_retention_rates_all['diff_retention_rate'] = df_train_retention_rates_all.groupby(['Product','Country','Intake Month'])['retention_rate'].diff()

#we fillna with 0 but only on the ones where there is a retention rate (ex: for B Product, there is no retention rate so we keep NaN)
df_train_retention_rates_all.loc[~df_train_retention_rates_all['retention_rate'].isnull(),'diff_retention_rate'] = df_train_retention_rates_all.loc[~df_train_retention_rates_all['retention_rate'].isnull(),'diff_retention_rate'].fillna(0)
# df_retention_rates.loc[~df_retention_rates['retention_rate'].isnull(),'diff_retention_rates'] 

In [22]:
# df_train_retention_rates_all.to_csv('testing_df_retention_rates_3.csv',index=False)

### Get mean retention rates per Country x Product x months since acquisition

In [23]:
fitted_diff_retention_rates = df_train_retention_rates_all.groupby(['Product','Country','months_since_acquisition'])['diff_retention_rate'].mean().ffill().reset_index() #the ffill is for Product B who hasn't have more than 40 something months since acquisition
fitted_diff_retention_rates = fitted_diff_retention_rates.rename(columns={'diff_retention_rate':'fitted_diff_retention_rate'})
fitted_diff_retention_rates.head()

Unnamed: 0,Product,Country,months_since_acquisition,fitted_diff_retention_rate
0,A,CA,0,0.0
1,A,CA,1,-23.863112
2,A,CA,2,-4.192586
3,A,CA,3,-5.026914
4,A,CA,4,-1.612168


In [24]:
fitted_diff_retention_rates[fitted_diff_retention_rates['Product']=='B']

Unnamed: 0,Product,Country,months_since_acquisition,fitted_diff_retention_rate
156,B,CA,0,0.000000
157,B,CA,1,0.000000
158,B,CA,2,0.000000
159,B,CA,3,-32.056817
160,B,CA,4,-1.006441
...,...,...,...,...
307,B,US,47,-0.663206
308,B,US,48,-0.663206
309,B,US,49,-0.663206
310,B,US,50,-0.663206


### Forecast valid

In [25]:
df_valid.shape
df_valid.head(5)

Unnamed: 0,Date,Intake Month,Country,Product,Net Customers,months_since_acquisition,churn_customers,Date_month,Date_year,Cohort_month,Cohort_year,Cohort_size,Cohort_name,retention_rate,diff_retention_rate,retention_rate_last,months_since_acquisition_last,Type
328,2020-05-31,2020-05-31,US,A,136,0,0.0,5,2020,5,2020,136,May-20,100.0,0.0,61.029412,11,actual
329,2020-06-30,2020-05-31,US,A,109,1,27.0,6,2020,5,2020,136,May-20,80.147059,-19.852941,61.029412,11,actual
330,2020-07-31,2020-05-31,US,A,102,2,7.0,7,2020,5,2020,136,May-20,75.0,-5.147059,61.029412,11,actual
331,2020-08-31,2020-05-31,US,A,95,3,7.0,8,2020,5,2020,136,May-20,69.852941,-5.147059,61.029412,11,actual
332,2020-09-30,2020-05-31,US,A,92,4,3.0,9,2020,5,2020,136,May-20,67.647059,-2.205882,61.029412,11,actual


In [26]:
df_forecast_valid.shape
df_forecast_valid.head(5)

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size
0,2021-05-31,2020-01-31,A,CA,old,forecast,16,29
1,2021-06-30,2020-01-31,A,CA,old,forecast,17,29
2,2021-07-31,2020-01-31,A,CA,old,forecast,18,29
3,2021-08-31,2020-01-31,A,CA,old,forecast,19,29
4,2021-09-30,2020-01-31,A,CA,old,forecast,20,29


In [27]:
# tt = df_forecast_valid.merge(fitted_diff_retention_rates,how='left',on=['Product','Country','months_since_acquisition'])
# tt[(tt.Product=='B')&(tt['Intake Month']=='2020-05-31')].sort_values(by='months_since_acquisition')

In [28]:
df_valid_parameters = df_valid.copy()
df_valid_parameters = df_valid_parameters[['Product','Country','Intake Month','retention_rate_last','months_since_acquisition_last']].dropna().drop_duplicates()
df_valid_parameters.head(5)

Unnamed: 0,Product,Country,Intake Month,retention_rate_last,months_since_acquisition_last
328,A,US,2020-05-31,61.029412,11
340,A,US,2020-06-30,58.267717,10
351,A,US,2020-07-31,68.292683,9
361,A,US,2020-08-31,69.230769,8
370,A,US,2020-09-30,62.727273,7


In [29]:
df_forecast_valid = df_forecast_valid.merge(df_valid_parameters,how='left',on=['Product','Country','Intake Month'])
df_forecast_valid = df_forecast_valid.merge(fitted_diff_retention_rates,how='left',on=['Product','Country','months_since_acquisition'])

df_forecast_valid['cum_fitted_retention_rate'] = df_forecast_valid.groupby(['Product','Country','Intake Month'])['fitted_diff_retention_rate'].cumsum()
df_forecast_valid['forecast_retention_rates'] = df_forecast_valid['retention_rate_last'] + df_forecast_valid['cum_fitted_retention_rate'] 
df_forecast_valid['forecast_net_customers'] = df_forecast_valid['forecast_retention_rates']/100 * df_forecast_valid['Cohort_size']
df_forecast_valid['forecast_net_customers'] = df_forecast_valid['forecast_net_customers'].fillna(0).round()

df_forecast_valid

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size,retention_rate_last,months_since_acquisition_last,fitted_diff_retention_rate,cum_fitted_retention_rate,forecast_retention_rates,forecast_net_customers
0,2021-05-31,2020-01-31,A,CA,old,forecast,16,29,,,-1.354167,-1.354167,,0.0
1,2021-06-30,2020-01-31,A,CA,old,forecast,17,29,,,-1.023046,-2.377212,,0.0
2,2021-07-31,2020-01-31,A,CA,old,forecast,18,29,,,-0.972116,-3.349328,,0.0
3,2021-08-31,2020-01-31,A,CA,old,forecast,19,29,,,-0.156551,-3.505879,,0.0
4,2021-09-30,2020-01-31,A,CA,old,forecast,20,29,,,-0.214826,-3.720705,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3451,2022-12-31,2021-04-30,C,US,old,forecast,20,63,100.0,0.0,-0.516456,-45.435055,54.564945,34.0
3452,2023-01-31,2021-04-30,C,US,old,forecast,21,63,100.0,0.0,-0.881060,-46.316116,53.683884,34.0
3453,2023-02-28,2021-04-30,C,US,old,forecast,22,63,100.0,0.0,-0.458994,-46.775109,53.224891,34.0
3454,2023-03-31,2021-04-30,C,US,old,forecast,23,63,100.0,0.0,-0.504861,-47.279970,52.720030,33.0


##### Save valid df with to csv

In [30]:
# test_valid_output = pd.concat([df_valid[cols],df_forecast_valid[['Product','Country','Intake Month','Date','forecast_retention_rates','Type','forecast_net_customers']]],axis=0)
# test_valid_output['retention_rate'] = np.where(test_valid_output['retention_rate'].isnull(),test_valid_output['forecast_retention_rates'],test_valid_output['retention_rate'])
# test_valid_output['Net Customers'] = np.where(test_valid_output['Net Customers'].isnull(),test_valid_output['forecast_net_customers'],test_valid_output['Net Customers'])

# test_valid_output = test_valid_output.drop(['forecast_retention_rates','forecast_net_customers'],axis=1)
# test_valid_output
# test_valid_output.to_csv('test_valid_output.csv',index=False)

#### Forecast test

In [31]:
df_forecast_valid.head(3)

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size,retention_rate_last,months_since_acquisition_last,fitted_diff_retention_rate,cum_fitted_retention_rate,forecast_retention_rates,forecast_net_customers
0,2021-05-31,2020-01-31,A,CA,old,forecast,16,29,,,-1.354167,-1.354167,,0.0
1,2021-06-30,2020-01-31,A,CA,old,forecast,17,29,,,-1.023046,-2.377212,,0.0
2,2021-07-31,2020-01-31,A,CA,old,forecast,18,29,,,-0.972116,-3.349328,,0.0


In [32]:
# df_forecast_test.sort_values(by=['months_since_acquisition'])

In [33]:
df_forecast_test['retention_rate_last'] = 100
df_forecast_test['months_since_acquisition_last'] = 0

df_forecast_test = df_forecast_test.merge(fitted_diff_retention_rates,how='left',on=['Product','Country','months_since_acquisition'])

df_forecast_test['cum_fitted_retention_rate'] = df_forecast_test.groupby(['Product','Country','Intake Month'])['fitted_diff_retention_rate'].cumsum()
df_forecast_test['forecast_retention_rates'] = df_forecast_test['retention_rate_last'] + df_forecast_test['cum_fitted_retention_rate'] 
df_forecast_test['forecast_net_customers'] = df_forecast_test['forecast_retention_rates']/100 * df_forecast_test['Cohort_size']
df_forecast_test['forecast_net_customers'] = df_forecast_test['forecast_net_customers'].fillna(0).round()

df_forecast_test

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size,retention_rate_last,months_since_acquisition_last,fitted_diff_retention_rate,cum_fitted_retention_rate,forecast_retention_rates,forecast_net_customers
0,2021-05-31,2021-05-31,A,US,new,forecast,0,148,100,0,0.0,0.0,100.0,148.0
1,2021-05-31,2021-05-31,A,CA,new,forecast,0,46,100,0,0.0,0.0,100.0,46.0
2,2021-05-31,2021-05-31,A,GB,new,forecast,0,134,100,0,0.0,0.0,100.0,134.0
3,2021-05-31,2021-05-31,B,US,new,forecast,0,228,100,0,0.0,0.0,100.0,228.0
4,2021-05-31,2021-05-31,B,CA,new,forecast,0,33,100,0,0.0,0.0,100.0,33.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2695,2023-04-30,2023-04-30,B,CA,new,forecast,0,0,100,0,0.0,0.0,100.0,0.0
2696,2023-04-30,2023-04-30,B,GB,new,forecast,0,0,100,0,0.0,0.0,100.0,0.0
2697,2023-04-30,2023-04-30,C,US,new,forecast,0,75,100,0,0.0,0.0,100.0,75.0
2698,2023-04-30,2023-04-30,C,CA,new,forecast,0,12,100,0,0.0,0.0,100.0,12.0


In [34]:
# df_forecast_test.to_csv('test_test_output.csv',index=False)

In [35]:
cols = ['Date','Intake Month','Product','Country','Cohort_type','Type','months_since_acquisition','Cohort_size','forecast_retention_rates','forecast_net_customers']

In [36]:
df_submission = pd.concat([df_forecast_train[cols],df_forecast_valid[cols],df_forecast_test[cols]],axis=0)
df_submission.head()

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size,forecast_retention_rates,forecast_net_customers
0,2021-05-31,2019-01-31,A,CA,old,forecast,28,17,52.380952,9.0
1,2021-06-30,2019-01-31,A,CA,old,forecast,29,17,51.820728,9.0
2,2021-07-31,2019-01-31,A,CA,old,forecast,30,17,51.260504,9.0
3,2021-08-31,2019-01-31,A,CA,old,forecast,31,17,50.70028,9.0
4,2021-09-30,2019-01-31,A,CA,old,forecast,32,17,50.140056,9.0


In [37]:
# df_submission.to_csv('forecast_net_customers.csv',index=False)

In [38]:
cols = ['Product','Country','Intake Month','Date','retention_rate','Type','Net Customers']

df_all = pd.concat([df_train[cols]
           ,df_valid[cols]
           ,df_forecast_train[['Product','Country','Intake Month','Date','forecast_retention_rates','Type','forecast_net_customers']]
           ,df_forecast_valid[['Product','Country','Intake Month','Date','forecast_retention_rates','Type','forecast_net_customers']]
           ,df_forecast_test[['Product','Country','Intake Month','Date','forecast_retention_rates','Type','forecast_net_customers']]
]
          ,axis=0)

df_all['retention_rate'] = np.where(df_all['retention_rate'].isnull(),df_all['forecast_retention_rates'],df_all['retention_rate'])
df_all['Net Customers'] = np.where(df_all['Net Customers'].isnull(),df_all['forecast_net_customers'],df_all['Net Customers'])

df_all = df_all.drop(['forecast_retention_rates','forecast_net_customers'],axis=1)
print(f"shape: {df_all.shape}")
df_all

shape: (12402, 7)


Unnamed: 0,Product,Country,Intake Month,Date,retention_rate,Type,Net Customers
0,A,US,2019-01-31,2019-01-31,100.000000,actual,110.0
1,A,US,2019-01-31,2019-02-28,80.000000,actual,88.0
2,A,US,2019-01-31,2019-03-31,75.454545,actual,83.0
3,A,US,2019-01-31,2019-04-30,70.000000,actual,77.0
4,A,US,2019-01-31,2019-05-31,68.181818,actual,75.0
...,...,...,...,...,...,...,...
2695,B,CA,2023-04-30,2023-04-30,100.000000,forecast,0.0
2696,B,GB,2023-04-30,2023-04-30,100.000000,forecast,0.0
2697,C,US,2023-04-30,2023-04-30,100.000000,forecast,75.0
2698,C,CA,2023-04-30,2023-04-30,100.000000,forecast,12.0


In [39]:
# df_all.to_csv('df_all_net_customers.csv',index=False)

### CV

###### Here we treat valid as 'test' to see the performance of the model with MAE and RMSE metrics

In [73]:
start_period_cv = "2020-05-31"
end_period_cv = pd.to_datetime(start_period_cv) + pd.DateOffset(months=11)

forecast_dates_cv = pd.date_range(start=start_period_cv, end=end_period_cv, freq='M')
# forecast_dates = pd.Series(date_range)

print(forecast_dates_cv)

DatetimeIndex(['2020-05-31', '2020-06-30', '2020-07-31', '2020-08-31',
               '2020-09-30', '2020-10-31', '2020-11-30', '2020-12-31',
               '2021-01-31', '2021-02-28', '2021-03-31', '2021-04-30'],
              dtype='datetime64[ns]', freq='M')


In [74]:
df_train['Intake Month'].max()

Timestamp('2020-04-30 00:00:00')

In [75]:
df_forecast_intake_cv = df_valid[df_valid['months_since_acquisition']==0].copy()

df_forecast_intake_cv['retention_rate_last'] = 100
df_forecast_intake_cv['months_since_acquisition_last'] = 0
df_forecast_intake_cv.head()

Unnamed: 0,Date,Intake Month,Country,Product,Net Customers,months_since_acquisition,churn_customers,Date_month,Date_year,Cohort_month,Cohort_year,Cohort_size,Cohort_name,retention_rate,diff_retention_rate,retention_rate_last,months_since_acquisition_last,Type
328,2020-05-31,2020-05-31,US,A,136,0,0.0,5,2020,5,2020,136,May-20,100.0,0.0,100,0,actual
340,2020-06-30,2020-06-30,US,A,127,0,0.0,6,2020,6,2020,127,Jun-20,100.0,0.0,100,0,actual
351,2020-07-31,2020-07-31,US,A,123,0,0.0,7,2020,7,2020,123,Jul-20,100.0,0.0,100,0,actual
361,2020-08-31,2020-08-31,US,A,104,0,0.0,8,2020,8,2020,104,Aug-20,100.0,0.0,100,0,actual
370,2020-09-30,2020-09-30,US,A,110,0,0.0,9,2020,9,2020,110,Sep-20,100.0,0.0,100,0,actual


In [76]:
intake_months_old_cohorts_cv = df_train['Intake Month'].unique().tolist()
intake_months_new_cohorts_cv = df_forecast_intake_cv['Date'].unique().tolist()
intake_months_cv = intake_months_old_cohorts_cv + intake_months_new_cohorts_cv

products = df_train['Product'].unique()
countries = df_train['Country'].unique()

# Create a Cartesian product of the lists
cartesian_product_net_customers_cv = list(product(forecast_dates_cv, intake_months_cv, products, countries))

# Create a DataFrame from the Cartesian product
df_forecast_net_customers_cv = pd.DataFrame(cartesian_product_net_customers_cv, columns=['Date', 'Intake Month', 'Product', 'Country'])

df_forecast_net_customers_cv = df_forecast_net_customers_cv[df_forecast_net_customers_cv['Intake Month']<=df_forecast_net_customers_cv['Date']].reset_index(drop=True) #remove the lines where 'Intake Month' > 'Date'
df_forecast_net_customers_cv

Unnamed: 0,Date,Intake Month,Product,Country
0,2020-05-31,2019-01-31,A,US
1,2020-05-31,2019-01-31,A,CA
2,2020-05-31,2019-01-31,A,GB
3,2020-05-31,2019-01-31,B,US
4,2020-05-31,2019-01-31,B,CA
...,...,...,...,...
2425,2021-04-30,2021-04-30,B,CA
2426,2021-04-30,2021-04-30,B,GB
2427,2021-04-30,2021-04-30,C,US
2428,2021-04-30,2021-04-30,C,CA


In [77]:
date_split_past_future_cv = '2020-04-30'

df_forecast_net_customers_cv['Cohort_type'] = 'old'
df_forecast_net_customers_cv.loc[df_forecast_net_customers_cv['Intake Month']>date_split_past_future_cv,'Cohort_type'] = 'new'
df_forecast_net_customers_cv['Type'] = 'forecast'

df_forecast_net_customers_cv

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type
0,2020-05-31,2019-01-31,A,US,old,forecast
1,2020-05-31,2019-01-31,A,CA,old,forecast
2,2020-05-31,2019-01-31,A,GB,old,forecast
3,2020-05-31,2019-01-31,B,US,old,forecast
4,2020-05-31,2019-01-31,B,CA,old,forecast
...,...,...,...,...,...,...
2425,2021-04-30,2021-04-30,B,CA,new,forecast
2426,2021-04-30,2021-04-30,B,GB,new,forecast
2427,2021-04-30,2021-04-30,C,US,new,forecast
2428,2021-04-30,2021-04-30,C,CA,new,forecast


In [90]:
date_year, date_month = get_date_int(df_forecast_net_customers_cv, 'Date')

cohort_year, cohort_month = get_date_int(df_forecast_net_customers_cv, 'Intake Month')

# Calculate difference in years
years_diff = date_year - cohort_year

# Calculate difference in months
months_diff = date_month - cohort_month

# Extract the difference in months from all previous values

df_forecast_net_customers_cv['months_since_acquisition'] = years_diff * 12 + months_diff
df_forecast_net_customers_cv.head()

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition
0,2020-05-31,2019-01-31,A,US,old,forecast,16
1,2020-05-31,2019-01-31,A,CA,old,forecast,16
2,2020-05-31,2019-01-31,A,GB,old,forecast,16
3,2020-05-31,2019-01-31,B,US,old,forecast,16
4,2020-05-31,2019-01-31,B,CA,old,forecast,16


In [114]:
# df_forecast_net_customers_cv.loc[(df_forecast_net_customers_cv['Intake Month']>date_split_past_future_cv)]

In [92]:
# date_split_past_train_valid_cv = '2020-04-30'
date_split_past_future_cv

df_forecast_train_cv = df_forecast_net_customers_cv.loc[df_forecast_net_customers_cv['Intake Month']<=date_split_past_future_cv]
df_forecast_valid_cv = df_forecast_net_customers_cv.loc[(df_forecast_net_customers_cv['Intake Month']>date_split_past_future_cv)&(df_forecast_net_customers_cv['Cohort_type']=='old')]
df_forecast_test_cv = df_forecast_net_customers_cv[df_forecast_net_customers_cv['Cohort_type']=='new']

df_forecast_train_cv

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition
0,2020-05-31,2019-01-31,A,US,old,forecast,16
1,2020-05-31,2019-01-31,A,CA,old,forecast,16
2,2020-05-31,2019-01-31,A,GB,old,forecast,16
3,2020-05-31,2019-01-31,B,US,old,forecast,16
4,2020-05-31,2019-01-31,B,CA,old,forecast,16
...,...,...,...,...,...,...,...
2317,2021-04-30,2020-04-30,B,CA,old,forecast,12
2318,2021-04-30,2020-04-30,B,GB,old,forecast,12
2319,2021-04-30,2020-04-30,C,US,old,forecast,12
2320,2021-04-30,2020-04-30,C,CA,old,forecast,12


In [93]:
df_forecast_train_cv = df_forecast_train_cv.merge(df_train[['Product','Country','Intake Month','Cohort_size']].drop_duplicates(),how='left',on=['Product','Country','Intake Month'])
df_forecast_valid_cv = df_forecast_valid_cv.merge(df_train[['Product','Country','Intake Month','Cohort_size']].drop_duplicates(),how='left',on=['Product','Country','Intake Month'])

df_forecast_train_cv.head()

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size
0,2020-05-31,2019-01-31,A,US,old,forecast,16,110
1,2020-05-31,2019-01-31,A,CA,old,forecast,16,17
2,2020-05-31,2019-01-31,A,GB,old,forecast,16,96
3,2020-05-31,2019-01-31,B,US,old,forecast,16,0
4,2020-05-31,2019-01-31,B,CA,old,forecast,16,0


In [94]:
df_forecast_test_cv = df_forecast_test_cv.merge(df_forecast_intake_cv,
                        how='left',
                        left_on=['Product','Country','Intake Month'],
                        right_on=['Product','Country','Date']
).drop(['Date_y','Type_y','Intake Month_y'],axis=1).rename(columns={'Date_x':'Date','Intake':'Cohort_size','Type_x':'Type','Intake Month_x':'Intake Month'})

df_forecast_test_cv.head()

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition_x,Net Customers,months_since_acquisition_y,churn_customers,Date_month,Date_year,Cohort_month,Cohort_year,Cohort_size,Cohort_name,retention_rate,diff_retention_rate,retention_rate_last,months_since_acquisition_last
0,2020-05-31,2020-05-31,A,US,new,forecast,0,136,0,0.0,5,2020,5,2020,136,May-20,100.0,0.0,100,0
1,2020-05-31,2020-05-31,A,CA,new,forecast,0,34,0,0.0,5,2020,5,2020,34,May-20,100.0,0.0,100,0
2,2020-05-31,2020-05-31,A,GB,new,forecast,0,125,0,0.0,5,2020,5,2020,125,May-20,100.0,0.0,100,0
3,2020-05-31,2020-05-31,B,US,new,forecast,0,216,0,0.0,5,2020,5,2020,216,May-20,100.0,0.0,100,0
4,2020-05-31,2020-05-31,B,CA,new,forecast,0,32,0,0.0,5,2020,5,2020,32,May-20,100.0,0.0,100,0


In [95]:
df_forecast_train_cv = df_forecast_train_cv.sort_values(by=['Product','Country','Intake Month','Date']).reset_index(drop=True)
df_forecast_valid_cv= df_forecast_valid_cv.sort_values(by=['Product','Country','Intake Month','Date']).reset_index(drop=True)

df_forecast_train_cv

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size
0,2020-05-31,2019-01-31,A,CA,old,forecast,16,17
1,2020-06-30,2019-01-31,A,CA,old,forecast,17,17
2,2020-07-31,2019-01-31,A,CA,old,forecast,18,17
3,2020-08-31,2019-01-31,A,CA,old,forecast,19,17
4,2020-09-30,2019-01-31,A,CA,old,forecast,20,17
...,...,...,...,...,...,...,...,...
1723,2020-12-31,2020-04-30,C,US,old,forecast,8,58
1724,2021-01-31,2020-04-30,C,US,old,forecast,9,58
1725,2021-02-28,2020-04-30,C,US,old,forecast,10,58
1726,2021-03-31,2020-04-30,C,US,old,forecast,11,58


In [98]:
date_split_past_train_valid

'2019-12-31'

In [99]:
# date_split_past_train_valid = '2019-12-31'

# df_train_cv = df_train.loc[df['Intake Month']<=date_split_past_train_valid]
# df_valid_cv = df_train.loc[df['Intake Month']>date_split_past_train_valid]


df_train_cv = df_train.copy()
df_valid_cv = df_train.copy()

df_train_cv.shape

(2952, 18)

In [89]:
# df_train_cv

In [100]:
df_valid_cv.shape

(2952, 18)

In [101]:
min_age_for_training = 6

mask = (df_train_cv['months_since_acquisition']>=min_age_for_training)

##retention rate
df_train_parameters_cv = df_train_cv.loc[mask].copy()
df_train_parameters_cv['retention_rate_6m'] = df_train_parameters_cv.groupby(['Product','Country','Intake Month'])['retention_rate'].transform('max')
df_train_parameters_cv = df_train_parameters_cv[['Product','Country','Intake Month','retention_rate_6m','retention_rate_last','months_since_acquisition_last']].dropna().drop_duplicates()
df_train_parameters_cv['delta_retention_last_to_6m'] = df_train_parameters_cv['retention_rate_last'] - df_train_parameters_cv['retention_rate_6m'] 
df_train_parameters_cv['months_since_6m'] = df_train_parameters_cv['months_since_acquisition_last'] - min_age_for_training
df_train_parameters_cv['retention_slope'] = df_train_parameters_cv['delta_retention_last_to_6m'] / df_train_parameters_cv['months_since_6m']
df_train_parameters_cv = df_train_parameters_cv.reset_index(drop=True)
df_train_parameters_cv.head(4)

Unnamed: 0,Product,Country,Intake Month,retention_rate_6m,retention_rate_last,months_since_acquisition_last,delta_retention_last_to_6m,months_since_6m,retention_slope
0,A,US,2019-01-31,64.545455,51.818182,27,-12.727273,21,-0.606061
1,A,US,2019-02-28,63.953488,52.325581,26,-11.627907,20,-0.581395
2,A,US,2019-03-31,60.869565,50.0,25,-10.869565,19,-0.572082
3,A,US,2019-04-30,58.40708,48.672566,24,-9.734513,18,-0.540806


In [102]:
df_forecast_train_cv

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size
0,2020-05-31,2019-01-31,A,CA,old,forecast,16,17
1,2020-06-30,2019-01-31,A,CA,old,forecast,17,17
2,2020-07-31,2019-01-31,A,CA,old,forecast,18,17
3,2020-08-31,2019-01-31,A,CA,old,forecast,19,17
4,2020-09-30,2019-01-31,A,CA,old,forecast,20,17
...,...,...,...,...,...,...,...,...
1723,2020-12-31,2020-04-30,C,US,old,forecast,8,58
1724,2021-01-31,2020-04-30,C,US,old,forecast,9,58
1725,2021-02-28,2020-04-30,C,US,old,forecast,10,58
1726,2021-03-31,2020-04-30,C,US,old,forecast,11,58


In [103]:
df_forecast_train_cv = df_forecast_train_cv.merge(df_train_parameters[['Product','Country','Intake Month','months_since_acquisition_last','retention_rate_last','retention_slope']],how='left',on=['Product','Country','Intake Month'])
df_forecast_train_cv['delta_months_for_forecast'] = df_forecast_train_cv['months_since_acquisition'] - df_forecast_train_cv['months_since_acquisition_last']
df_forecast_train_cv['forecast_retention_rates'] = df_forecast_train_cv['retention_rate_last'] + df_forecast_train_cv['delta_months_for_forecast'] * df_forecast_train_cv['retention_slope']
df_forecast_train_cv['forecast_retention_rates'] = df_forecast_train_cv['forecast_retention_rates']

df_forecast_train_cv['forecast_net_customers'] = df_forecast_train_cv['forecast_retention_rates']/100 * df_forecast_train_cv['Cohort_size']
df_forecast_train_cv['forecast_net_customers'] = df_forecast_train_cv['forecast_net_customers'].fillna(0).round()


df_forecast_train_cv.head()

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size,months_since_acquisition_last,retention_rate_last,retention_slope,delta_months_for_forecast,forecast_retention_rates,forecast_net_customers
0,2020-05-31,2019-01-31,A,CA,old,forecast,16,17,27.0,52.941176,-0.560224,-11.0,59.103641,10.0
1,2020-06-30,2019-01-31,A,CA,old,forecast,17,17,27.0,52.941176,-0.560224,-10.0,58.543417,10.0
2,2020-07-31,2019-01-31,A,CA,old,forecast,18,17,27.0,52.941176,-0.560224,-9.0,57.983193,10.0
3,2020-08-31,2019-01-31,A,CA,old,forecast,19,17,27.0,52.941176,-0.560224,-8.0,57.422969,10.0
4,2020-09-30,2019-01-31,A,CA,old,forecast,20,17,27.0,52.941176,-0.560224,-7.0,56.862745,10.0


In [104]:
df_train_retention_rates_all_cv = pd.concat([df_forecast_train_cv[['Product','Country','Date','Intake Month','months_since_acquisition','forecast_retention_rates','Type']].rename(columns={'forecast_retention_rates':'retention_rate'}),
                               df_train_cv[['Product','Country','Date','Intake Month','months_since_acquisition','retention_rate','Type']]],
                               axis=0)
df_train_retention_rates_all_cv

Unnamed: 0,Product,Country,Date,Intake Month,months_since_acquisition,retention_rate,Type
0,A,CA,2020-05-31,2019-01-31,16,59.103641,forecast
1,A,CA,2020-06-30,2019-01-31,17,58.543417,forecast
2,A,CA,2020-07-31,2019-01-31,18,57.983193,forecast
3,A,CA,2020-08-31,2019-01-31,19,57.422969,forecast
4,A,CA,2020-09-30,2019-01-31,20,56.862745,forecast
...,...,...,...,...,...,...,...
3571,C,GB,2020-12-31,2020-04-30,8,69.230769,actual
3572,C,GB,2021-01-31,2020-04-30,9,67.307692,actual
3573,C,GB,2021-02-28,2020-04-30,10,67.307692,actual
3574,C,GB,2021-03-31,2020-04-30,11,67.307692,actual


In [105]:
df_train_retention_rates_all_cv = df_train_retention_rates_all_cv.sort_values(by=['Product','Country','Intake Month','Date'],ascending=True).reset_index(drop=True)

df_train_retention_rates_all_cv['diff_retention_rate'] = df_train_retention_rates_all_cv.groupby(['Product','Country','Intake Month'])['retention_rate'].diff()

#we fillna with 0 but only on the ones where there is a retention rate (ex: for B Product, there is no retention rate so we keep NaN)
df_train_retention_rates_all_cv.loc[~df_train_retention_rates_all_cv['retention_rate'].isnull(),'diff_retention_rate'] = df_train_retention_rates_all_cv.loc[~df_train_retention_rates_all_cv['retention_rate'].isnull(),'diff_retention_rate'].fillna(0)
# df_retention_rates.loc[~df_retention_rates['retention_rate'].isnull(),'diff_retention_rates'] 

In [106]:
fitted_diff_retention_rates_cv = df_train_retention_rates_all_cv.groupby(['Product','Country','months_since_acquisition'])['diff_retention_rate'].mean().ffill().reset_index() #the ffill is for Product B who hasn't have more than 40 something months since acquisition
fitted_diff_retention_rates_cv = fitted_diff_retention_rates_cv.rename(columns={'diff_retention_rate':'fitted_diff_retention_rate'})
fitted_diff_retention_rates_cv.head()

Unnamed: 0,Product,Country,months_since_acquisition,fitted_diff_retention_rate
0,A,CA,0,0.0
1,A,CA,1,-22.459399
2,A,CA,2,-3.726743
3,A,CA,3,-4.233191
4,A,CA,4,-1.289734


In [107]:
#check
fitted_diff_retention_rates[fitted_diff_retention_rates['Product']=='B']

Unnamed: 0,Product,Country,months_since_acquisition,fitted_diff_retention_rate
156,B,CA,0,0.000000
157,B,CA,1,0.000000
158,B,CA,2,0.000000
159,B,CA,3,-32.056817
160,B,CA,4,-1.006441
...,...,...,...,...
307,B,US,47,-0.663206
308,B,US,48,-0.663206
309,B,US,49,-0.663206
310,B,US,50,-0.663206


In [108]:
df_valid_parameters_cv = df_valid_cv.copy()
df_valid_parameters_cv = df_valid_parameters_cv[['Product','Country','Intake Month','retention_rate_last','months_since_acquisition_last']].dropna().drop_duplicates()
df_valid_parameters_cv.head(5)

Unnamed: 0,Product,Country,Intake Month,retention_rate_last,months_since_acquisition_last
0,A,US,2019-01-31,51.818182,27
28,A,US,2019-02-28,52.325581,26
55,A,US,2019-03-31,50.0,25
81,A,US,2019-04-30,48.672566,24
106,A,US,2019-05-31,51.612903,23


In [111]:
df_forecast_valid_cv

Unnamed: 0,Date,Cohort_type,Type,Cohort_size,Intake Month,retention_rate_last,months_since_acquisition_last,Product,Country,months_since_acquisition,fitted_diff_retention_rate,cum_fitted_retention_rate,forecast_retention_rates,forecast_net_customers


In [110]:
df_forecast_valid_cv.merge(df_valid_parameters_cv,how='left',on=['Product','Country','Intake Month'])

Unnamed: 0,Date,Cohort_type,Type,Cohort_size,retention_rate_last_x,months_since_acquisition_last_x,months_since_acquisition,fitted_diff_retention_rate,cum_fitted_retention_rate,forecast_retention_rates,forecast_net_customers,Product,Country,Intake Month,retention_rate_last_y,months_since_acquisition_last_y


In [109]:
df_forecast_valid_cv = df_forecast_valid_cv.merge(df_valid_parameters_cv,how='left',on=['Product','Country','Intake Month'])
df_forecast_valid_cv = df_forecast_valid_cv.merge(fitted_diff_retention_rates_cv,how='left',on=['Product','Country','months_since_acquisition'])

df_forecast_valid_cv['cum_fitted_retention_rate'] = df_forecast_valid_cv.groupby(['Product','Country','Intake Month'])['fitted_diff_retention_rate'].cumsum()
df_forecast_valid_cv['forecast_retention_rates'] = df_forecast_valid_cv['retention_rate_last'] + df_forecast_valid_cv['cum_fitted_retention_rate'] 
df_forecast_valid_cv['forecast_net_customers'] = df_forecast_valid_cv['forecast_retention_rates']/100 * df_forecast_valid_cv['Cohort_size']
df_forecast_valid_cv['forecast_net_customers'] = df_forecast_valid_cv['forecast_net_customers'].fillna(0).round()

df_forecast_valid_cv

Unnamed: 0,Date,Cohort_type,Type,Cohort_size,Intake Month,retention_rate_last,months_since_acquisition_last,Product,Country,months_since_acquisition,fitted_diff_retention_rate,cum_fitted_retention_rate,forecast_retention_rates,forecast_net_customers


In [59]:
df_forecast_test_cv = df_forecast_test_cv.drop('months_since_acquisition_y',axis=1).rename(columns={'months_since_acquisition_x':'months_since_acquisition'})
df_forecast_test_cv

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Net Customers,churn_customers,Date_month,Date_year,Cohort_month,Cohort_year,Cohort_size,Cohort_name,retention_rate,diff_retention_rate,retention_rate_last,months_since_acquisition_last
0,2020-05-31,2020-05-31,A,US,new,forecast,0,136,0.0,5,2020,5,2020,136,May-20,100.0,0.0,100,0
1,2020-05-31,2020-05-31,A,CA,new,forecast,0,34,0.0,5,2020,5,2020,34,May-20,100.0,0.0,100,0
2,2020-05-31,2020-05-31,A,GB,new,forecast,0,125,0.0,5,2020,5,2020,125,May-20,100.0,0.0,100,0
3,2020-05-31,2020-05-31,B,US,new,forecast,0,216,0.0,5,2020,5,2020,216,May-20,100.0,0.0,100,0
4,2020-05-31,2020-05-31,B,CA,new,forecast,0,32,0.0,5,2020,5,2020,32,May-20,100.0,0.0,100,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
697,2021-04-30,2021-04-30,B,CA,new,forecast,0,0,0.0,4,2021,4,2021,0,Apr-21,,,100,0
698,2021-04-30,2021-04-30,B,GB,new,forecast,0,0,0.0,4,2021,4,2021,0,Apr-21,,,100,0
699,2021-04-30,2021-04-30,C,US,new,forecast,0,63,0.0,4,2021,4,2021,63,Apr-21,100.0,0.0,100,0
700,2021-04-30,2021-04-30,C,CA,new,forecast,0,10,0.0,4,2021,4,2021,10,Apr-21,100.0,0.0,100,0


In [60]:
df_forecast_test_cv['retention_rate_last'] = 100
df_forecast_test_cv['months_since_acquisition_last'] = 0

df_forecast_test_cv = df_forecast_test_cv.merge(fitted_diff_retention_rates,how='left',on=['Product','Country','months_since_acquisition'])

df_forecast_test_cv['cum_fitted_retention_rate'] = df_forecast_test_cv.groupby(['Product','Country','Intake Month'])['fitted_diff_retention_rate'].cumsum()
df_forecast_test_cv['forecast_retention_rates'] = df_forecast_test_cv['retention_rate_last'] + df_forecast_test_cv['cum_fitted_retention_rate'] 
df_forecast_test_cv['forecast_net_customers'] = df_forecast_test_cv['forecast_retention_rates']/100 * df_forecast_test_cv['Cohort_size']
df_forecast_test_cv['forecast_net_customers'] = df_forecast_test_cv['forecast_net_customers'].fillna(0).round()

df_forecast_test_cv

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Net Customers,churn_customers,Date_month,...,Cohort_size,Cohort_name,retention_rate,diff_retention_rate,retention_rate_last,months_since_acquisition_last,fitted_diff_retention_rate,cum_fitted_retention_rate,forecast_retention_rates,forecast_net_customers
0,2020-05-31,2020-05-31,A,US,new,forecast,0,136,0.0,5,...,136,May-20,100.0,0.0,100,0,0.0,0.0,100.0,136.0
1,2020-05-31,2020-05-31,A,CA,new,forecast,0,34,0.0,5,...,34,May-20,100.0,0.0,100,0,0.0,0.0,100.0,34.0
2,2020-05-31,2020-05-31,A,GB,new,forecast,0,125,0.0,5,...,125,May-20,100.0,0.0,100,0,0.0,0.0,100.0,125.0
3,2020-05-31,2020-05-31,B,US,new,forecast,0,216,0.0,5,...,216,May-20,100.0,0.0,100,0,0.0,0.0,100.0,216.0
4,2020-05-31,2020-05-31,B,CA,new,forecast,0,32,0.0,5,...,32,May-20,100.0,0.0,100,0,0.0,0.0,100.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
697,2021-04-30,2021-04-30,B,CA,new,forecast,0,0,0.0,4,...,0,Apr-21,,,100,0,0.0,0.0,100.0,0.0
698,2021-04-30,2021-04-30,B,GB,new,forecast,0,0,0.0,4,...,0,Apr-21,,,100,0,0.0,0.0,100.0,0.0
699,2021-04-30,2021-04-30,C,US,new,forecast,0,63,0.0,4,...,63,Apr-21,100.0,0.0,100,0,0.0,0.0,100.0,63.0
700,2021-04-30,2021-04-30,C,CA,new,forecast,0,10,0.0,4,...,10,Apr-21,100.0,0.0,100,0,0.0,0.0,100.0,10.0


In [61]:
df_forecast_train_cv

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size,months_since_acquisition_last,retention_rate_last,retention_slope,delta_months_for_forecast,forecast_retention_rates,forecast_net_customers
0,2020-05-31,2019-01-31,A,CA,old,forecast,16,17,27.0,52.941176,-0.560224,-11.0,59.103641,10.0
1,2020-06-30,2019-01-31,A,CA,old,forecast,17,17,27.0,52.941176,-0.560224,-10.0,58.543417,10.0
2,2020-07-31,2019-01-31,A,CA,old,forecast,18,17,27.0,52.941176,-0.560224,-9.0,57.983193,10.0
3,2020-08-31,2019-01-31,A,CA,old,forecast,19,17,27.0,52.941176,-0.560224,-8.0,57.422969,10.0
4,2020-09-30,2019-01-31,A,CA,old,forecast,20,17,27.0,52.941176,-0.560224,-7.0,56.862745,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1291,2020-12-31,2019-12-31,C,US,old,forecast,12,56,16.0,58.928571,-0.535714,-4.0,61.071429,34.0
1292,2021-01-31,2019-12-31,C,US,old,forecast,13,56,16.0,58.928571,-0.535714,-3.0,60.535714,34.0
1293,2021-02-28,2019-12-31,C,US,old,forecast,14,56,16.0,58.928571,-0.535714,-2.0,60.000000,34.0
1294,2021-03-31,2019-12-31,C,US,old,forecast,15,56,16.0,58.928571,-0.535714,-1.0,59.464286,33.0


In [62]:
cols_cv = ['Date','Intake Month','Product','Country','Cohort_type','Type','months_since_acquisition','Cohort_size','forecast_retention_rates','forecast_net_customers']

df_submission_cv = pd.concat([df_forecast_train_cv[cols_cv],df_forecast_valid_cv[cols_cv],df_forecast_test_cv[cols_cv]],axis=0)
df_submission_cv

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size,forecast_retention_rates,forecast_net_customers
0,2020-05-31,2019-01-31,A,CA,old,forecast,16,17,59.103641,10.0
1,2020-06-30,2019-01-31,A,CA,old,forecast,17,17,58.543417,10.0
2,2020-07-31,2019-01-31,A,CA,old,forecast,18,17,57.983193,10.0
3,2020-08-31,2019-01-31,A,CA,old,forecast,19,17,57.422969,10.0
4,2020-09-30,2019-01-31,A,CA,old,forecast,20,17,56.862745,10.0
...,...,...,...,...,...,...,...,...,...,...
697,2021-04-30,2021-04-30,B,CA,new,forecast,0,0,100.000000,0.0
698,2021-04-30,2021-04-30,B,GB,new,forecast,0,0,100.000000,0.0
699,2021-04-30,2021-04-30,C,US,new,forecast,0,63,100.000000,63.0
700,2021-04-30,2021-04-30,C,CA,new,forecast,0,10,100.000000,10.0


In [116]:
df_submission_cv.to_csv('baseline_cv_save.csv',index=False)

In [64]:
df_submission_cv = df_submission_cv.merge(df[['Product','Country','Intake Month','Date','Net Customers']],how='left',on=['Product','Country','Intake Month','Date'])

df_submission_cv

Unnamed: 0,Date,Intake Month,Product,Country,Cohort_type,Type,months_since_acquisition,Cohort_size,forecast_retention_rates,forecast_net_customers,Net Customers
0,2020-05-31,2019-01-31,A,CA,old,forecast,16,17,59.103641,10.0,10
1,2020-06-30,2019-01-31,A,CA,old,forecast,17,17,58.543417,10.0,10
2,2020-07-31,2019-01-31,A,CA,old,forecast,18,17,57.983193,10.0,9
3,2020-08-31,2019-01-31,A,CA,old,forecast,19,17,57.422969,10.0,9
4,2020-09-30,2019-01-31,A,CA,old,forecast,20,17,56.862745,10.0,9
...,...,...,...,...,...,...,...,...,...,...,...
2425,2021-04-30,2021-04-30,B,CA,new,forecast,0,0,100.000000,0.0,0
2426,2021-04-30,2021-04-30,B,GB,new,forecast,0,0,100.000000,0.0,0
2427,2021-04-30,2021-04-30,C,US,new,forecast,0,63,100.000000,63.0,63
2428,2021-04-30,2021-04-30,C,CA,new,forecast,0,10,100.000000,10.0,10


In [65]:
y_true = df_submission_cv['Net Customers'].values
y_pred = df_submission_cv['forecast_net_customers'].values

mae = mean_absolute_error(y_true, y_pred)
rmse = sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print(f"MAE: {np.round(mae,3)}, R2: {np.round(r2,3)}")

MAE: 2.317, R2: 0.978
