In [1]:
# importing pandas, numpy, and regex

import pandas as pd
import numpy as np
import re

In [2]:
# loading the challenge data set from its CSV file
# NOTE(review): DATA_PATH is an absolute, machine-specific location; point it at your own copy of data.csv

DATA_PATH = r'C:\Users\CharlesYi\Jupyter Notebook\Alteryx Challenges\Challenge 17_Month-over-Month Retention Rate\data.csv'
df = pd.read_csv(filepath_or_buffer = DATA_PATH)

df.head()

Unnamed: 0,RecordID,Open Date,Close Date
0,1,"April 03, 2013","May 06, 2013"
1,2,"April 14, 2013",
2,3,"May 03, 2013","July 18, 2013"
3,4,"May 24, 2013","June 12, 2013"
4,5,"June 13, 2013","July 10, 2013"


In [6]:
# building the retention months: one month-start timestamp for each of May - August 2013

RETENTION_START = '2013-05-01'
RETENTION_END = '2013-08-01'

date_range = pd.date_range(RETENTION_START, RETENTION_END, freq = 'MS')

In [7]:
# displaying the month-start dates that open_close iterates over
date_range

DatetimeIndex(['2013-05-01', '2013-06-01', '2013-07-01', '2013-08-01'], dtype='datetime64[ns]', freq='MS')

In [16]:
# parsing open date text such as "April 03, 2013" into datetimes

df['Open Date'] = df['Open Date'].pipe(pd.to_datetime, format = '%B %d, %Y')

In [18]:
# parsing close date text such as "May 06, 2013" into datetimes

df['Close Date'] = df['Close Date'].pipe(pd.to_datetime, format = '%B %d, %Y')

In [19]:
# checking results: both date columns should now display as dates, with missing close dates shown as NaT

df.head()

Unnamed: 0,RecordID,Open Date,Close Date
0,1,2013-04-03,2013-05-06
1,2,2013-04-14,NaT
2,3,2013-05-03,2013-07-18
3,4,2013-05-24,2013-06-12
4,5,2013-06-13,2013-07-10


In [84]:
# creating function to determine number open and closed

def open_close (row, months = None):
    """Flag, for each retention month, whether a record closed in it and whether it was open going into it.

    Parameters
    ----------
    row : pandas.Series or mapping
        One record with 'Open Date' and 'Close Date' entries holding
        datetime-like values. A missing 'Close Date' (NaT/NaN) is treated as
        "not closed".
    months : iterable of pandas.Timestamp, optional
        Month-start timestamps to evaluate. When omitted, the notebook-level
        ``date_range`` is used.

    Returns
    -------
    list of dict
        First one ``{"month_retention", "number_closed"}`` dict per month,
        followed by one ``{"month_retention", "number_open"}`` dict per month,
        both in the order of ``months``. Each flag is 1 or 0.
    """
    if months is None:
        months = date_range

    # the row's dates do not change between months, so read them once
    open_date = row['Open Date']
    close_date = row['Close Date']
    is_closed = pd.notna(close_date)

    open_close_list = []

    # calculating closed in month
    for month_start in months:

        # comparing year AND month: matching on the month name alone would also
        # count a close that falls in the same calendar month of another year
        closed_in_month = is_closed and (close_date.year, close_date.month) == (month_start.year, month_start.month)

        open_close_list.append({"month_retention" : month_start, "number_closed" : int(closed_in_month)})

    # calculating open in month
    for month_start in months:

        previous_month_end = month_start - pd.offsets.MonthEnd(1)
        lookback_start = month_start - pd.DateOffset(years = 2)

        # NOTE(review): a close on the last day of the previous month still counts
        # as open here (>= previous month end) - confirm this boundary is intended
        still_open = (not is_closed) or close_date >= previous_month_end

        # opened within the 2 years before the month and strictly before its first day
        opened_in_window = lookback_start <= open_date < month_start

        open_close_list.append({"month_retention" : month_start, "number_open" : int(still_open and opened_in_window)})

    return open_close_list

In [85]:
# running open_close on every record (row-wise); each row yields a list of per-month dicts

retention = df.apply(func = open_close, axis = 'columns')

retention.head()

0    [{'month_retention': 2013-05-01 00:00:00, 'num...
1    [{'month_retention': 2013-05-01 00:00:00, 'num...
2    [{'month_retention': 2013-05-01 00:00:00, 'num...
3    [{'month_retention': 2013-05-01 00:00:00, 'num...
4    [{'month_retention': 2013-05-01 00:00:00, 'num...
dtype: object

In [86]:
# exploding: every element of each row's list becomes its own entry,
# repeating the original row's index label

retention_explode = retention.explode()

retention_explode

0    {'month_retention': 2013-05-01 00:00:00, 'numb...
0    {'month_retention': 2013-06-01 00:00:00, 'numb...
0    {'month_retention': 2013-07-01 00:00:00, 'numb...
0    {'month_retention': 2013-08-01 00:00:00, 'numb...
0    {'month_retention': 2013-05-01 00:00:00, 'numb...
                           ...                        
9    {'month_retention': 2013-08-01 00:00:00, 'numb...
9    {'month_retention': 2013-05-01 00:00:00, 'numb...
9    {'month_retention': 2013-06-01 00:00:00, 'numb...
9    {'month_retention': 2013-07-01 00:00:00, 'numb...
9    {'month_retention': 2013-08-01 00:00:00, 'numb...
Length: 80, dtype: object

In [87]:
# parsing: expanding each dict into columns, one column per key

retention_records = retention_explode.tolist()
retention_parsed = pd.json_normalize(retention_records)

retention_parsed.head()

Unnamed: 0,month_retention,number_closed,number_open
0,2013-05-01,1.0,
1,2013-06-01,0.0,
2,2013-07-01,0.0,
3,2013-08-01,0.0,
4,2013-05-01,,1.0


In [90]:
# aggregating: total open and closed flags per retention month

ret_agg = (
    retention_parsed
    .groupby('month_retention')
    .agg(
        Open_Month = pd.NamedAgg(column = 'number_open', aggfunc = 'sum'),
        Close_Month = pd.NamedAgg(column = 'number_closed', aggfunc = 'sum'),
    )
)

ret_agg

Unnamed: 0_level_0,Open_Month,Close_Month
month_retention,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-05-01,2.0,1.0
2013-06-01,3.0,1.0
2013-07-01,4.0,2.0
2013-08-01,5.0,1.0


In [95]:
# finding percentage closed/open per month, rounded to 2 decimals

closed_over_open_pct = ret_agg['Close_Month'].div(ret_agg['Open_Month']).mul(100).round(2)

ret_agg.assign(percentage = closed_over_open_pct)

Unnamed: 0_level_0,Open_Month,Close_Month,percentage
month_retention,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-05-01,2.0,1.0,50.0
2013-06-01,3.0,1.0,33.33
2013-07-01,4.0,2.0,50.0
2013-08-01,5.0,1.0,20.0
