In [3]:
from datetime import datetime
import pandas as pd
import numpy as np

Initially we will work on cohort, by assigning each customer to cohort

In [22]:
orders = pd.read_csv('orders.csv')[['customer_id','created_at']]

In [23]:
orders['created_at'] = pd.to_datetime(orders['created_at'])

In [24]:
orders.head()

Unnamed: 0,customer_id,created_at
0,1,2013-04-07 12:00:00
1,2,2013-04-04 12:00:00
2,4,2013-04-06 12:00:00
3,3,2013-04-08 12:00:00
4,10,2013-04-04 12:00:00


In [25]:
orders['created_at_month_year']= orders['created_at'].map(lambda x: x.strftime('%Y-%m'))

DF is 1) sorted by MM/YY, 2) then grouped by CustomerID, then 3) only DD/MM is left and 4) only first values of grouped IDs left.

In [36]:
cohort = orders.sort_values(by='created_at_month_year').groupby('customer_id')['created_at_month_year'].first()

Important to do the below immediately after above, as df, when printed, get lost. Below is converting Pandas series which resulted from above to DF. 
Now we have customer ID and cohort he belongs to.

In [42]:
cohort = cohort.to_frame(name='cohort')

In [43]:
type(cohort)

pandas.core.frame.DataFrame

Now need to recreate a table with all orders (3.577 MM) but with cohorts. 
Will merge cohorts with our orders table, left join

In [44]:
df_cohort = pd.merge(orders[['customer_id','created_at_month_year']],cohort, right_index=True, left_on='customer_id')

Need to check if the lenght of left table equals the length of right table

In [46]:
assert len(orders) == len(df_cohort)

Now we have a table with customer_id, their cohort, their_purchase times. We need to summarize it, by how many of cohorts were buying during each month.

In [47]:
df_cohort.head()

Unnamed: 0,customer_id,created_at_month_year,cohort
0,1,2013-04,2013-04
287846,1,2013-10,2013-04
1646408,1,2015-03,2013-04
1871621,1,2015-04,2013-04
1,2,2013-04,2013-04


Counting number of customers in each cohort
Need to use Nunique instead of count as it counts each ID only once. Otherwise, it would count it several times.

In [56]:
df_cohort.groupby('cohort')['customer_id'].nunique().head()

cohort
2013-04    28874
2013-05    30851
2013-06    29953
2013-07    31013
2013-08    30851
Name: customer_id, dtype: int64

In [52]:
df_cohort.groupby('cohort').agg({'customer_id': pd.Series.nunique}).head()

Unnamed: 0_level_0,customer_id
cohort,Unnamed: 1_level_1
2013-04,28874
2013-05,30851
2013-06,29953
2013-07,31013
2013-08,30851


In [61]:
x =df_cohort.groupby('created_at_month_year')['cohort'].count()

In [116]:
cohort_pivot = pd.pivot_table(df_cohort, values = 'customer_id', 
               index = 'created_at_month_year', columns = 'cohort', aggfunc = 'count', fill_value = 0)

In [120]:
cohort_pivot

cohort,2013-04,2013-05,2013-06,2013-07,2013-08,2013-09,2013-10,2013-11,2013-12,2014-01,...,2015-07,2015-08,2015-09,2015-10,2015-11,2015-12,2016-01,2016-02,2016-03,2016-04
created_at_month_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-04,28874,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2013-05,3406,30978,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2013-06,3475,3601,30057,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2013-07,3488,3873,3628,31113,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2013-08,3604,3820,3730,3743,30960,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2013-09,3545,3762,3608,3771,3562,29988,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2013-10,3580,3876,3652,3901,3786,3539,31108,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2013-11,3502,3720,3568,3799,3755,3532,3546,30065,0,0,...,0,0,0,0,0,0,0,0,0,0
2013-12,3586,3889,3705,3757,3866,3663,3774,3540,31197,0,...,0,0,0,0,0,0,0,0,0,0
2014-01,3729,3811,3753,3895,3774,3619,3885,3802,3694,31323,...,0,0,0,0,0,0,0,0,0,0


Creating a sample file to work in Excel

In [68]:
sample = df_cohort.query('customer_id < 1000')

In [70]:
sample.to_csv('sample_cohort.csv')

#### Calculation percent values for each cell

In [114]:
def percent(z):
    n = list(z).index(filter(lambda z: z!=0, z)[0]) # to select first non-zero element
    y = []
    for i in z:
        y.append(float(i)/z[n])
    return pd.Series(y)

Dataframe created but the index values were lost

In [121]:
cohort_pivot_perecent = cohort_pivot.apply(percent)

Recreating index values for new percent based table

In [123]:
names = list(cohort_pivot.columns)

In [127]:
new_index = {}
for x,i in enumerate(names,0):
    new_index[x]=i

In [128]:
cohort_pivot_perecent = cohort_pivot_perecent.rename(index = new_index)

In [129]:
cohort_pivot_perecent

cohort,2013-04,2013-05,2013-06,2013-07,2013-08,2013-09,2013-10,2013-11,2013-12,2014-01,...,2015-07,2015-08,2015-09,2015-10,2015-11,2015-12,2016-01,2016-02,2016-03,2016-04
2013-04,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-04,0.117961,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-05,0.12035,0.116244,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-06,0.120801,0.125024,0.120704,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-07,0.124818,0.123313,0.124098,0.120303,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-08,0.122775,0.121441,0.120039,0.121203,0.115052,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-09,0.123987,0.125121,0.121502,0.125382,0.122287,0.118014,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-10,0.121286,0.120085,0.118708,0.122103,0.121286,0.11778,0.11399,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-11,0.124195,0.125541,0.123266,0.120753,0.124871,0.122149,0.121319,0.117745,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-12,0.129147,0.123023,0.124863,0.125189,0.121899,0.120682,0.124887,0.126459,0.118409,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [130]:
cohort_pivot_perecent.to_csv('Cohort_matrix.csv')