In [1]:
import pandas as pd

In [2]:
# Load the user table from CSV. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('out/users_small.csv')

In [3]:
# Schema check: column names, non-null counts, dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   user_id     100000 non-null  int64
 1   persona_id  100000 non-null  int64
 2   cohort_id   100000 non-null  int64
dtypes: int64(3)
memory usage: 2.3 MB


In [4]:
# Preview the first rows (head() defaults to 5) to sanity-check the load.
df.head()

Unnamed: 0,user_id,persona_id,cohort_id
0,0,454,11440
1,1,369,32795
2,2,104,18034
3,3,293,103
4,4,210,27302


In [5]:
# How many unique users?
# (nunique() counts distinct values and skips missing ones by default.)

df['user_id'].nunique()

100000

In [6]:
# How many unique personas?
# (nunique() counts distinct values and skips missing ones by default.)

df['persona_id'].nunique()

500

In [7]:
# How many unique cohorts?
# (nunique() counts distinct values and skips missing ones by default.)

df['cohort_id'].nunique()

25569

In [8]:
# Average number of users in each cohort, computed as
# (distinct user_ids) / (distinct cohort_ids).
# NOTE(review): this ratio equals the mean cohort size only when every
# user_id occupies exactly one row (one cohort per user) - confirm before
# reusing on data where a user can appear more than once.

df['user_id'].nunique() / df['cohort_id'].nunique()

3.910985959560405

In [9]:
# Number of users in each Cohort: mean, min, max, etc.
# value_counts() yields the number of rows per cohort_id; describe() then
# summarises that distribution of cohort sizes (count, mean, std, quartiles).
cohort_counts = df['cohort_id'].value_counts()
cohort_counts.describe()

count    25569.000000
mean         3.910986
std          4.679561
min          1.000000
25%          1.000000
50%          2.000000
75%          5.000000
max        113.000000
Name: cohort_id, dtype: float64

In [10]:
# Further stats about each cohort
#
# One output row per cohort_id, describing how the cohort's rows (users) are
# spread over personas:
#   NumUsers                     rows in the cohort
#   NumPersonas                  distinct persona_id values in the cohort
#   PersonaCount_Max/_Mean/_Min  largest / mean / smallest number of rows
#                                sharing one persona within the cohort
#   PersonaCount_EQ1/_GT1/_GT5   personas with exactly 1 / more than 1 /
#                                more than 5 rows within the cohort
#
# Implemented as two groupby passes instead of filtering the whole frame with
# a boolean mask once per cohort, which rescanned every row for each of the
# cohorts and made the cell quadratic. It also no longer fails on an empty
# frame, where the loop variable used for the column list was never bound.

# Rows per (cohort, persona) pair, plus 0/1 flags for the threshold counts.
# sort=False keeps groups in first-appearance order, so the final frame has
# the same row order (and therefore the same index labels) as iterating over
# df['cohort_id'].unique().
_persona_counts = (
    df.groupby(['cohort_id', 'persona_id'], sort=False)
    .size()
    .reset_index(name='n')
    .assign(
        # Cast to int64 so that summing the flags yields integer counts.
        eq1=lambda t: t['n'].eq(1).astype('int64'),
        gt1=lambda t: t['n'].gt(1).astype('int64'),
        gt5=lambda t: t['n'].gt(5).astype('int64'),
    )
)

# Collapse the pair-level counts to one row per cohort. The keyword order
# below fixes the column order of the result.
p = (
    _persona_counts
    .groupby('cohort_id', sort=False)
    .agg(
        NumUsers=('n', 'sum'),
        NumPersonas=('n', 'count'),
        PersonaCount_Max=('n', 'max'),
        PersonaCount_Mean=('n', 'mean'),
        PersonaCount_Min=('n', 'min'),
        PersonaCount_EQ1=('eq1', 'sum'),
        PersonaCount_GT1=('gt1', 'sum'),
        PersonaCount_GT5=('gt5', 'sum'),
    )
    .reset_index()
)

p.sort_values('PersonaCount_Mean', ascending=False)

Unnamed: 0,cohort_id,NumUsers,NumPersonas,PersonaCount_Max,PersonaCount_Mean,PersonaCount_Min,PersonaCount_EQ1,PersonaCount_GT1,PersonaCount_GT5
751,31955,86,19,9,4.526316,1,4,15,10
11729,6273,4,1,4,4.000000,4,0,1,0
16804,22604,3,1,3,3.000000,3,0,1,0
10467,22480,3,1,3,3.000000,3,0,1,0
5828,18230,3,1,3,3.000000,3,0,1,0
...,...,...,...,...,...,...,...,...,...
10045,33682,1,1,1,1.000000,1,1,0,0
10044,26156,6,6,1,1.000000,1,6,0,0
10043,31432,4,4,1,1.000000,1,4,0,0
10042,20310,4,4,1,1.000000,1,4,0,0


In [11]:
# Further stats about each Persona
#
# One output row per persona_id, describing how the persona's rows (users)
# are spread over cohorts:
#   NumUsers                    rows carrying the persona
#   NumCohorts                  distinct cohort_id values for the persona
#   CohortCount_Max/_Mean       largest / mean number of rows the persona has
#                               inside a single cohort
#   CohortCount_EQ1/_GT1/_GT5   cohorts holding exactly 1 / more than 1 /
#                               more than 5 rows of the persona
#
# Implemented as two groupby passes instead of filtering the whole frame with
# a boolean mask once per persona. It also no longer fails on an empty frame,
# where the loop variable used for the column list was never bound.

# Rows per (persona, cohort) pair, plus 0/1 flags for the threshold counts.
# sort=False keeps groups in first-appearance order, so the final frame has
# the same row order (and therefore the same index labels) as iterating over
# df['persona_id'].unique().
_cohort_counts = (
    df.groupby(['persona_id', 'cohort_id'], sort=False)
    .size()
    .reset_index(name='n')
    .assign(
        # Cast to int64 so that summing the flags yields integer counts.
        eq1=lambda t: t['n'].eq(1).astype('int64'),
        gt1=lambda t: t['n'].gt(1).astype('int64'),
        gt5=lambda t: t['n'].gt(5).astype('int64'),
    )
)

# Collapse the pair-level counts to one row per persona. The keyword order
# below fixes the column order of the result.
p = (
    _cohort_counts
    .groupby('persona_id', sort=False)
    .agg(
        NumUsers=('n', 'sum'),
        NumCohorts=('n', 'count'),
        CohortCount_Max=('n', 'max'),
        CohortCount_Mean=('n', 'mean'),
        CohortCount_EQ1=('eq1', 'sum'),
        CohortCount_GT1=('gt1', 'sum'),
        CohortCount_GT5=('gt5', 'sum'),
    )
    .reset_index()
)

p.sort_values('NumUsers')

Unnamed: 0,persona_id,NumUsers,NumCohorts,CohortCount_Max,CohortCount_Mean,CohortCount_EQ1,CohortCount_GT1,CohortCount_GT5
146,10,159,153,2,1.039216,147,6,0
426,12,162,125,7,1.296000,104,21,2
343,236,163,143,4,1.139860,128,15,0
178,263,164,159,2,1.031447,154,5,0
486,348,164,148,6,1.108108,138,10,1
...,...,...,...,...,...,...,...,...
323,298,233,226,2,1.030973,219,7,0
303,436,233,219,3,1.063927,207,12,0
314,166,235,219,2,1.073059,203,16,0
233,179,243,212,5,1.146226,190,22,0
