In [10]:
# W1 

import pandas as pd
import numpy as np

df = pd.DataFrame({
    "user": ["A","A","B","B","C","C","C","D","D"],
    "month": ["Jan","Feb","Jan","Feb","Jan","Feb","Mar","Jan","Feb"],
    "revenue": [100,120,80,90,200,210,50,60,70],
    "channel": ["web","web","app","app","web","app","web","web","app"]
})

In [None]:
# For each user, compute:

# total_revenue (all rows)
# web_revenue (only channel == "web")
# pct_web = web_revenue / total_revenue

# Return one row per user.

In [2]:
df.head()

Unnamed: 0,user,month,revenue,channel
0,A,Jan,100,web
1,A,Feb,120,web
2,B,Jan,80,app
3,B,Feb,90,app
4,C,Jan,200,web


In [11]:
df['web_revenue'] = df['revenue'].where(df['channel'] == 'web', 0)

df_agg = df.groupby('user').agg(
    total_revenue = ('revenue', 'sum'),
    web_revenue = ('web_revenue', 'sum')
    )

df_agg['pct_web'] = df_agg['web_revenue']/df_agg['total_revenue'] * 100

print(df_agg)

      total_revenue  web_revenue     pct_web
user                                        
A               220          220  100.000000
B               170            0    0.000000
C               460          250   54.347826
D               130           60   46.153846


In [12]:

# W2
df = pd.DataFrame({
    "order_id": range(1,11),
    "order_date": pd.to_datetime([
        "2025-01-03","2025-01-07","2025-01-15","2025-01-21","2025-02-02",
        "2025-02-05","2025-02-14","2025-02-20","2025-03-01","2025-03-10"
    ]),
    "amount": [50,60,40,80,90,70,55,65,100,120]
})

In [17]:
df['year'] = df['order_date'].dt.year
df['month'] = df['order_date'].dt.month
df['week'] = df['order_date'].dt.isocalendar().week
df['year_week'] = df['year'].astype(str) + '-W' + df['week'].astype(str).str.zfill(2)

df.head()

Unnamed: 0,order_id,order_date,amount,year,month,week,year_week
0,1,2025-01-03,50,2025,1,1,2025-W01
1,2,2025-01-07,60,2025,1,2,2025-W02
2,3,2025-01-15,40,2025,1,3,2025-W03
3,4,2025-01-21,80,2025,1,4,2025-W04
4,5,2025-02-02,90,2025,2,5,2025-W05


In [18]:
# W3 

df = pd.DataFrame({
    "A": [10,20,30,40,50],
    "B": [5,15,25,35,45],
    "C": [100,200,300,400,500]
})

In [None]:
# First pass effort
df_z = df.sub(df.sum(axis = 0), axis = 1)
df_z = df_z.div(df.std(axis = 0), axis = 1)

df_row_norm = df.div(df.sum(axis = 1), axis = 0)

df['df_z'] = df_z
df['df_row_norm'] = df_row_norm

print(df)

ValueError: Cannot set a DataFrame with multiple columns to the single column df_z

In [21]:
df.head()

Unnamed: 0,A,B,C
0,10,5,100
1,20,15,200
2,30,25,300
3,40,35,400
4,50,45,500


In [29]:
df_z = df.sub(df.mean(axis = 0), axis = 1)
df_z = df_z.div(df.std(axis = 0), axis = 1)

df_row_norm = df.div(df.sum(axis = 1), axis = 0)

print(df_z), print(df_row_norm)

          A         B         C
0 -1.264911 -1.264911 -1.264911
1 -0.632456 -0.632456 -0.632456
2  0.000000  0.000000  0.000000
3  0.632456  0.632456  0.632456
4  1.264911  1.264911  1.264911
          A         B         C
0  0.086957  0.043478  0.869565
1  0.085106  0.063830  0.851064
2  0.084507  0.070423  0.845070
3  0.084211  0.073684  0.842105
4  0.084034  0.075630  0.840336


(None, None)

In [54]:
# I 1 Windowed Ranking

import pandas as pd
import numpy as np

rng = pd.date_range("2025-01-01", periods=30, freq="D")

df = pd.DataFrame({
    "date": rng,
    "user": ["A","B","C"] * 10,
    "revenue": np.random.randint(10,100,30)
})

In [43]:
df['roll7_sum'] = (
    df.groupby('user')
      .rolling('7D', on='date')['revenue']
      .sum()
      .reset_index(level=0, drop=True)
)

df['roll7_rank'] = df.sort_values(['user', 'date']).groupby('date')['roll7_sum'].rank(ascending = False)

df

Unnamed: 0,date,user,revenue,roll7_sum,roll7_rank
0,2025-01-01,A,52,,
1,2025-01-02,B,37,,
2,2025-01-03,C,39,,
3,2025-01-04,A,22,,
4,2025-01-05,B,22,,
5,2025-01-06,C,93,,
6,2025-01-07,A,32,,
7,2025-01-08,B,18,,
8,2025-01-09,C,47,,
9,2025-01-10,A,29,,


In [44]:
s = (
    df.sort_values(['user','date'])
      .groupby('user')
      .rolling('7D', on='date')['revenue']
      .sum()
)

print(type(s))
print(s.index[:5])
print(df.index[:5])

<class 'pandas.core.series.Series'>
MultiIndex([('A', '2025-01-01'),
            ('A', '2025-01-04'),
            ('A', '2025-01-07'),
            ('A', '2025-01-10'),
            ('A', '2025-01-13')],
           names=['user', 'date'])
RangeIndex(start=0, stop=5, step=1)


In [55]:
df2 = df.sort_values(['user','date']).reset_index(drop=False).rename(columns={'index':'row_id'})

roll = (
    df2.groupby('user')
       .rolling('7D', on='date')['revenue']
       .sum()
       .reset_index()
       .rename(columns={'revenue': 'roll7_sum'})
)

# roll now has columns: user, date, roll7_sum
# merge back to df2 using the row_id we preserved
df2 = df2.merge(roll, on=['user','date'], how='left')

print(df2.columns)
# put back in original row order
df['roll7_sum'] = df2.sort_values('row_id')['roll7_sum'].to_numpy()

df['roll7_rank'] = df.groupby('date')['roll7_sum'].rank(ascending = False)

Index(['row_id', 'date', 'user', 'revenue', 'roll7_sum'], dtype='object')


In [57]:
df.sort_values('date').head()

Unnamed: 0,date,user,revenue,roll7_sum,roll7_rank
0,2025-01-01,A,97,97.0,1.0
1,2025-01-02,B,75,75.0,1.0
2,2025-01-03,C,97,97.0,1.0
3,2025-01-04,A,26,123.0,1.0
4,2025-01-05,B,97,172.0,1.0


In [60]:
import pandas as pd
import numpy as np

dates = pd.date_range("2025-01-01", periods=30, freq="D")
users = ["A","B","C"]

df = pd.DataFrame({
    "date": np.repeat(dates, len(users)),
    "user": users * len(dates),
    "revenue": np.random.randint(10, 100, len(dates) * len(users))
})

In [59]:
df2 = df.sort_values(['user','date']).reset_index(drop=False).rename(columns={'index':'row_id'})

roll = (
    df2.groupby('user')
       .rolling('7D', on='date')['revenue']
       .sum()
       .reset_index()
       .rename(columns={'revenue': 'roll7_sum'})
)

# roll now has columns: user, date, roll7_sum
# merge back to df2 using the row_id we preserved
df2 = df2.merge(roll, on=['user','date'], how='left')

print(df2.columns)
# put back in original row order
df['roll7_sum'] = df2.sort_values('row_id')['roll7_sum'].to_numpy()

df['roll7_rank'] = df.groupby('date')['roll7_sum'].rank(ascending = False)

df.sort_values('date').head()

Index(['row_id', 'date', 'user', 'revenue', 'roll7_sum'], dtype='object')


Unnamed: 0,date,user,revenue,roll7_sum,roll7_rank
0,2025-01-01,A,41,41.0,1.0
1,2025-01-01,B,11,11.0,3.0
2,2025-01-01,C,31,31.0,2.0
3,2025-01-02,A,91,132.0,1.0
4,2025-01-02,B,84,95.0,2.0


In [None]:
# I2: Hierarchical Aggregation
import pandas as pd
import numpy as np

df = pd.DataFrame({
    "region": ["East","East","East","West","West","West","West","North","North"],
    "store":  ["S1","S1","S2","S1","S2","S2","S3","S1","S2"],
    "month":  ["Jan","Feb","Jan","Jan","Jan","Feb","Feb","Jan","Feb"],
    "sales":  [100,120,90,80,70,85,60,110,95],
    "customers":[20,25,18,15,14,17,12,22,19]
})

In [6]:
df.groupby(["region", "store"]).agg( 
                                    total_sales = ("sales", "sum"), 
                                    avg_customers =("customers", "mean"), 
                                    max_monthly_sales = ("sales", "max") ).groupby('region').apply(lambda x: x.assign(pct_of_region_sales = x['total_sales'] / x['total_sales'].sum() * 100))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_sales,avg_customers,max_monthly_sales,pct_of_region_sales
region,region,store,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
East,East,S1,220,22.5,120,70.967742
East,East,S2,90,18.0,90,29.032258
North,North,S1,110,22.0,110,53.658537
North,North,S2,95,19.0,95,46.341463
West,West,S1,80,15.0,80,27.118644
West,West,S2,155,15.5,85,52.542373
West,West,S3,60,12.0,60,20.338983


In [7]:
# I2: Hierarchical Aggregation

import pandas as pd
import numpy as np

df = pd.DataFrame({
    "user": ["A","A","A","B","B","C","C","C","C"],
    "date": pd.date_range("2025-01-01", periods=9),
    "spend": [10,20,15,5,8,12,18,14,20]
})

In [8]:
df.head()

Unnamed: 0,user,date,spend
0,A,2025-01-01,10
1,A,2025-01-02,20
2,A,2025-01-03,15
3,B,2025-01-04,5
4,B,2025-01-05,8


In [None]:
# For each user:

# Compute a 7-day rolling sum of spend → call it roll7_sum

# Compute a rank of roll7_sum per day across all users, descending → call it roll7_rank

# Compute cumulative spend per user → call it cum_spend

# Use groupby, rolling, cumsum, rank
# Rolling windows are time-based (7D)

In [11]:
df = df.sort_values(['user', 'date'])
df['roll7_sum'] = df.groupby('user').rolling('7D', on = 'date', min_periods = 5)['spend'].sum().reset_index(level=0, drop=True)
df['roll7_rank'] = df.groupby('date')['roll7_sum'].rank(ascending = False)
df['cum_spend'] = df.groupby('user')['spend'].cumsum()

print(df)

  user       date  spend  roll7_sum  roll7_rank  cum_spend
0    A 2025-01-01     10        NaN         NaN         10
1    A 2025-01-02     20        NaN         NaN         30
2    A 2025-01-03     15        NaN         NaN         45
3    B 2025-01-04      5        NaN         NaN          5
4    B 2025-01-05      8        NaN         NaN         13
5    C 2025-01-06     12        NaN         NaN         12
6    C 2025-01-07     18        NaN         NaN         30
7    C 2025-01-08     14        NaN         NaN         44
8    C 2025-01-09     20        NaN         NaN         64


In [15]:
# I4: Time-Weighted Conversion / Funnel Metrics
import pandas as pd
import numpy as np

df = pd.DataFrame({
    "user": ["U1","U1","U1","U2","U2","U3","U3","U3"],
    "step": ["visit","signup","pay","visit","signup","visit","signup","pay"],
    "date": pd.to_datetime([
        "2025-01-01","2025-01-02","2025-01-05",
        "2025-01-01","2025-01-04",
        "2025-01-03","2025-01-06","2025-01-10"
    ])
})

In [13]:
df.head()

Unnamed: 0,user,step,date
0,U1,visit,2025-01-01
1,U1,signup,2025-01-02
2,U1,pay,2025-01-05
3,U2,visit,2025-01-01
4,U2,signup,2025-01-04


In [26]:

# Keep the earliest instance of each user-step combination
df = df.sort_values(['user', 'date']).groupby(['user', 'step']).first().reset_index()
df_pivot =df.pivot(index = 'user', columns = 'step', values = 'date')

visited = df_pivot['visit'].notna()
signup = visited & df_pivot['signup'].notna() & (df_pivot['signup'] > df_pivot['visit'])
paid = signup & df_pivot['pay'].notna() & (df_pivot['pay'] > df_pivot['signup'])
final  = pd.DataFrame({'step': ['visit', 'pay', 'signup'], 'n_users': [visited.sum(), signup.sum(), paid.sum()], 'conversion_rate': [np.nan, signup.sum()/visited.sum(), paid.sum()/signup.sum()]})

print(final)

     step  n_users  conversion_rate
0   visit        3              NaN
1     pay        3         1.000000
2  signup        2         0.666667


In [25]:
df = df.sort_values(['user', 'date']).groupby(['user', 'step']).first().reset_index()
df_pivot =df.pivot(index = 'user', columns = 'step', values = 'date')

visited = df_pivot['visit'].notna()

visited & df_pivot['signup'].notna() & (df_pivot['signup'] > df_pivot['visit'])

user
U1    True
U2    True
U3    True
dtype: bool

In [38]:
# Day 5 - Hard Cohort + Rolling Retention Hybrid

import pandas as pd
import numpy as np

df = pd.DataFrame({
    "user": [
        "U1","U1","U1","U1",
        "U2","U2","U2",
        "U3","U3",
        "U4","U4","U4"
    ],
    "event_date": pd.to_datetime([
        "2025-01-01","2025-01-03","2025-01-10","2025-01-20",
        "2025-01-02","2025-01-08","2025-01-18",
        "2025-01-05","2025-01-25",
        "2025-01-07","2025-01-09","2025-01-15"
    ]),
    "event_type": [
        "visit","visit","purchase","visit",
        "visit","visit","purchase",
        "visit","visit",
        "visit","purchase","visit"
    ]
})

In [None]:
# For users who joined in the same week, what % make a purchase within 14 days of each event window?

# 1. For each user, find first event date 
# cohort_week = first_event_date.to_period("W")

In [39]:
df['cohort_week'] = df.groupby('user')['event_date'].transform('min').dt.to_period('W')

In [50]:
merged = df.merge(df, on='user', suffixes=('_first', '_second'))
merged = merged[((merged['event_type_second'] == 'purchase')) & 
       (merged['event_date_second'] <= merged['event_date_first'] + pd.Timedelta(days=14))].groupby(['user', 'event_date_first', 'event_type_first']).first().reset_index()

merged['purchase_within_14d'] = True

In [51]:
merged.head()

Unnamed: 0,user,event_date_first,event_type_first,cohort_week_first,event_date_second,event_type_second,cohort_week_second,purchase_within_14d
0,U1,2025-01-01,visit,2024-12-30/2025-01-05,2025-01-10,purchase,2024-12-30/2025-01-05,True
1,U1,2025-01-03,visit,2024-12-30/2025-01-05,2025-01-10,purchase,2024-12-30/2025-01-05,True
2,U1,2025-01-10,purchase,2024-12-30/2025-01-05,2025-01-10,purchase,2024-12-30/2025-01-05,True
3,U1,2025-01-20,visit,2024-12-30/2025-01-05,2025-01-10,purchase,2024-12-30/2025-01-05,True
4,U2,2025-01-08,visit,2024-12-30/2025-01-05,2025-01-18,purchase,2024-12-30/2025-01-05,True


In [53]:
final = df.merge(merged[['user', 'event_date_first', 'event_type_first', 'purchase_within_14d']], left_on=['user', 'event_date', 'event_type'], right_on=['user', 'event_date_first', 'event_type_first'], how='left')

In [55]:
final.head()

Unnamed: 0,user,event_date,event_type,cohort_week,event_date_first,event_type_first,purchase_within_14d
0,U1,2025-01-01,visit,2024-12-30/2025-01-05,2025-01-01,visit,True
1,U1,2025-01-03,visit,2024-12-30/2025-01-05,2025-01-03,visit,True
2,U1,2025-01-10,purchase,2024-12-30/2025-01-05,2025-01-10,purchase,True
3,U1,2025-01-20,visit,2024-12-30/2025-01-05,2025-01-20,visit,True
4,U2,2025-01-02,visit,2024-12-30/2025-01-05,NaT,,


In [None]:
final['purchase_within_14d'] = final['purchase_within_14d'].fillna(0)
final.groupby('cohort_week')['purchase_within_14d'].mean()

cohort_week
2024-12-30/2025-01-05    0.666667
2025-01-06/2025-01-12         1.0
Freq: W-SUN, Name: purchase_within_14d, dtype: object

In [69]:
# Working code from AI
# Step 1: Cohort assignment
df['cohort_week'] = df.groupby('user')['event_date'].transform('min').dt.to_period('W')

# Step 2: Self-merge to find purchases within 14 days
merged = df.merge(df, on='user', suffixes=('_event', '_purchase'))

# Filter: purchase happened AFTER event, within 14 days
merged = merged[
    (merged['event_type_purchase'] == 'purchase') &
    (merged['event_date_purchase'] >= merged['event_date_event']) &
    (merged['event_date_purchase'] <= merged['event_date_event'] + pd.Timedelta(days=14))
]

# Mark events that had a purchase in the window
merged['purchase_within_14d'] = True
merged = merged[['user', 'event_date_event', 'purchase_within_14d']].drop_duplicates()

# Merge back to original
final = df.merge(
    merged, 
    left_on=['user', 'event_date'], 
    right_on=['user', 'event_date_event'], 
    how='left'
)

final['purchase_within_14d'] = final['purchase_within_14d'].astype('boolean').fillna(False)

# Step 3: Aggregate by cohort_week and event_week
final['event_week'] = final['event_date'].dt.to_period('W')

result = final.groupby(['cohort_week', 'event_week']).agg(
     n_users=('user', 'nunique'),
    retention_rate=('purchase_within_14d', 'mean')
 ).reset_index()

In [70]:
result.head()

Unnamed: 0,cohort_week,event_week,n_users,retention_rate
0,2024-12-30/2025-01-05,2024-12-30/2025-01-05,3,0.5
1,2024-12-30/2025-01-05,2025-01-06/2025-01-12,2,1.0
2,2024-12-30/2025-01-05,2025-01-13/2025-01-19,1,1.0
3,2024-12-30/2025-01-05,2025-01-20/2025-01-26,2,0.0
4,2025-01-06/2025-01-12,2025-01-06/2025-01-12,1,1.0
