In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [5]:
def load_data():
    """Load the cleaned data and convert event_time to datetime"""
    df = pd.read_csv('data/events_cleaned.csv')
    df['event_time'] = pd.to_datetime(df['event_time'])
    return df

In [6]:
df = load_data()
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2020-09-24 11:57:06+00:00,view,1996170,2144415922528450000,electronics.telephone,unknown_brand,31.9,1515915625519380000,LJuJVLEjPT
1,2020-09-24 11:57:26+00:00,view,139905,2144415922528450000,computers.components.cooler,zalman,17.16,1515915625519380000,tdicluNnRY
2,2020-09-24 11:57:27+00:00,view,215454,2144415922528450000,computers.components.videocards,unknown_brand,9.81,1515915625513230000,4TMArHtXQy
3,2020-09-24 11:57:33+00:00,view,635807,2144415922528450000,computers.peripherals.printer,pantum,113.81,1515915625519010000,aGFYrNgC08
4,2020-09-24 11:57:36+00:00,view,3658723,2144415922528450000,computers.components.videocards,cameronsino,15.87,1515915625510740000,aa4mmk0kwQ


In [11]:
df.describe()

Unnamed: 0,product_id,category_id,price,user_id
count,885129.0,885129.0,885129.0,885129.0
mean,1906621.0,2.144416e+18,146.328713,1.515916e+18
std,1458708.0,2048.001,296.807683,35541670.0
min,102.0,2.144416e+18,0.22,1.515916e+18
25%,698803.0,2.144416e+18,26.46,1.515916e+18
50%,1452883.0,2.144416e+18,65.71,1.515916e+18
75%,3721194.0,2.144416e+18,190.49,1.515916e+18
max,4183880.0,2.144416e+18,64771.06,1.515916e+18


In [12]:
def calculate_reference_date(df):
    """Calculate reference date as the last event date in the dataset"""
    return df['event_time'].max()

In [14]:
ref_date = calculate_reference_date(df)
ref_date

Timestamp('2021-02-28 23:59:09+0000', tz='UTC')

In [16]:
def create_recency_features(df, ref_date):
    """
    Create recency-based features
    These features measure how recently users have interacted with the platform
    """
    # Get user session data
    session_data = df.groupby(['user_id', 'user_session'])['event_time'].agg(['min', 'max']).reset_index()
    session_data['session_date'] = session_data['min'].dt.date
    
    recency_features = pd.DataFrame(index=df['user_id'].unique())
    
    # Session Recency - Days between last user session and reference date
    last_sessions = session_data.groupby('user_id')['max'].max()
    recency_features['ses_rec'] = (ref_date - last_sessions).dt.total_seconds() / (24 * 3600)
    
    # Calculate time between consecutive sessions
    session_data_sorted = session_data.sort_values(['user_id', 'min'])
    session_data_sorted['prev_session'] = session_data_sorted.groupby('user_id')['min'].shift(1)
    session_data_sorted['days_between'] = (session_data_sorted['min'] - session_data_sorted['prev_session']).dt.total_seconds() / (24 * 3600)
    
    # Average Session Recency - Average days between consecutive sessions
    recency_features['ses_rec_avg'] = session_data_sorted.groupby('user_id')['days_between'].mean()
    
    # Session Recency Standard Deviation
    recency_features['ses_rec_sd'] = session_data_sorted.groupby('user_id')['days_between'].std()
    
    # Session Recency Coefficient of Variation (%)
    recency_features['ses_rec_cv'] = (recency_features['ses_rec_sd'] / recency_features['ses_rec_avg']) * 100
    
    # User Maturity - Days since user's first session
    first_sessions = session_data.groupby('user_id')['min'].min()
    recency_features['user_rec'] = (ref_date - first_sessions).dt.total_seconds() / (24 * 3600)
    
    return recency_features

In [17]:
recency_features = create_recency_features(df, ref_date)
print("Recency features:", recency_features.columns.tolist())

Recency features: ['ses_rec', 'ses_rec_avg', 'ses_rec_sd', 'ses_rec_cv', 'user_rec']


In [20]:
recency_features

Unnamed: 0,ses_rec,ses_rec_avg,ses_rec_sd,ses_rec_cv,user_rec
1515915625519380000,44.131042,2.694893,8.004473,297.023819,157.501424
1515915625513230000,148.475637,4.510764,2.074016,45.979268,157.501181
1515915625519010000,85.436088,24.021674,19.545232,81.364985,157.501111
1515915625510740000,157.501076,,,,157.501076
1515915625519320000,29.892025,6.380198,11.645592,182.527130,157.500509
...,...,...,...,...,...
1515915625610980000,0.111019,0.000778,0.002715,348.901265,0.181782
1515915625610990000,0.013669,0.000394,0.000584,147.985436,0.156250
1515915625611000000,0.089572,0.000546,0.000612,112.182623,0.127222
1515915625611010000,0.009387,0.000876,0.002275,259.542344,0.087384


In [21]:
def create_frequency_features(df):
    """
    Create frequency-based features
    These features measure how often users interact with the platform
    """
    frequency_features = pd.DataFrame(index=df['user_id'].unique())
    
    # Session Count - Total number of sessions
    frequency_features['ses_n'] = df.groupby('user_id')['user_session'].nunique()
    
    # Relative Session Frequency - Sessions per day since first activity
    user_first_activity = df.groupby('user_id')['event_time'].min()
    user_last_activity = df.groupby('user_id')['event_time'].max()
    user_activity_days = (user_last_activity - user_first_activity).dt.total_seconds() / (24 * 3600)
    frequency_features['ses_n_r'] = frequency_features['ses_n'] / np.maximum(user_activity_days, 1)
    
    # Event counts
    event_counts = df.groupby('user_id')['event_type'].value_counts().unstack(fill_value=0)
    frequency_features['int_n'] = df.groupby('user_id').size()  # Total interactions
    frequency_features['view_count'] = event_counts.get('view', 0)
    frequency_features['cart_count'] = event_counts.get('cart', 0)
    frequency_features['purchase_count'] = event_counts.get('purchase', 0)
    
    # Interaction Rate - Average interactions per session
    frequency_features['int_n_r'] = frequency_features['int_n'] / frequency_features['ses_n']
    
    # Transaction features
    frequency_features['tran_n'] = frequency_features['purchase_count']  # Total transactions
    frequency_features['tran_n_r'] = frequency_features['tran_n'] / frequency_features['ses_n']  # Transactions per session
    
    return frequency_features

In [22]:
frequency_features = create_frequency_features(df)
print("Frequency features:", frequency_features.columns.tolist())

Frequency features: ['ses_n', 'ses_n_r', 'int_n', 'view_count', 'cart_count', 'purchase_count', 'int_n_r', 'tran_n', 'tran_n_r']


In [23]:
frequency_features

Unnamed: 0,ses_n,ses_n_r,int_n,view_count,cart_count,purchase_count,int_n_r,tran_n,tran_n_r
1515915625519380000,36,0.317543,56,55,1,0,1.555556,0,0.000000
1515915625513230000,3,0.332390,4,3,0,1,1.333333,1,0.333333
1515915625519010000,4,0.055505,7,7,0,0,1.750000,0,0.000000
1515915625510740000,1,1.000000,1,1,0,0,1.000000,0,0.000000
1515915625519320000,21,0.164566,36,34,2,0,1.714286,0,0.000000
...,...,...,...,...,...,...,...,...,...
1515915625610980000,66,66.000000,102,93,6,3,1.545455,3,0.045455
1515915625610990000,85,85.000000,131,110,12,9,1.541176,9,0.105882
1515915625611000000,70,70.000000,97,91,5,1,1.385714,1,0.014286
1515915625611010000,90,90.000000,160,147,12,1,1.777778,1,0.011111


In [24]:
def create_monetary_features(df):
    """
    Create monetary-based features
    These features measure user spending patterns
    """
    purchase_data = df[df['event_type'] == 'purchase'].copy()
    monetary_features = pd.DataFrame(index=df['user_id'].unique())
    
    # Total Revenue - Total spending by user
    monetary_features['rev_sum'] = purchase_data.groupby('user_id')['price'].sum()
    
    # Average Purchase Value
    monetary_features['rev_per_purchase'] = purchase_data.groupby('user_id')['price'].mean()
    
    # Revenue per Session
    sessions_with_purchase = purchase_data.groupby('user_id')['user_session'].nunique()
    monetary_features['rev_sum_r'] = monetary_features['rev_sum'] / sessions_with_purchase
    
    # High Spender Flag - Above average spending
    avg_spending = monetary_features['rev_sum'].mean()
    monetary_features['major_spend_r'] = (monetary_features['rev_sum'] > avg_spending).astype(float)
    
    return monetary_features.fillna(0)

In [25]:
monetary_features = create_monetary_features(df)
print("Monetary features:", monetary_features.columns.tolist())

Monetary features: ['rev_sum', 'rev_per_purchase', 'rev_sum_r', 'major_spend_r']


In [26]:
monetary_features

Unnamed: 0,rev_sum,rev_per_purchase,rev_sum_r,major_spend_r
1515915625519380000,0.00,0.000000,0.000,0.0
1515915625513230000,140.63,140.630000,140.630,0.0
1515915625519010000,0.00,0.000000,0.000,0.0
1515915625510740000,0.00,0.000000,0.000,0.0
1515915625519320000,0.00,0.000000,0.000,0.0
...,...,...,...,...
1515915625610980000,1224.15,408.050000,408.050,1.0
1515915625610990000,2953.19,328.132222,590.638,1.0
1515915625611000000,38.83,38.830000,38.830,0.0
1515915625611010000,15.08,15.080000,15.080,0.0


In [27]:
def create_category_features(df):
    """
    Create category and item-based features
    These features measure user interaction with different products and categories
    """
    category_features = pd.DataFrame(index=df['user_id'].unique())
    
    # Count unique categories and products per user
    category_features['int_cat_n'] = df.groupby('user_id')['category_code'].nunique()
    category_features['int_itm_n'] = df.groupby('user_id')['product_id'].nunique()
    
    # Calculate diversity metrics
    session_counts = df.groupby('user_id')['user_session'].nunique()
    category_features['int_cat_n_avg'] = category_features['int_cat_n'] / session_counts
    category_features['int_itm_n_avg'] = category_features['int_itm_n'] / session_counts
    
    return category_features

In [28]:
category_features = create_category_features(df)
print("Category features:", category_features.columns.tolist())

Category features: ['int_cat_n', 'int_itm_n', 'int_cat_n_avg', 'int_itm_n_avg']


In [29]:
category_features

Unnamed: 0,int_cat_n,int_itm_n,int_cat_n_avg,int_itm_n_avg
1515915625519380000,17,44,0.472222,1.222222
1515915625513230000,2,3,0.666667,1.000000
1515915625519010000,3,7,0.750000,1.750000
1515915625510740000,1,1,1.000000,1.000000
1515915625519320000,10,32,0.476190,1.523810
...,...,...,...,...
1515915625610980000,20,66,0.303030,1.000000
1515915625610990000,27,84,0.317647,0.988235
1515915625611000000,21,73,0.300000,1.042857
1515915625611010000,21,99,0.233333,1.100000


In [31]:
def create_datetime_features(df):
    """
    Create date and time-based features
    These features capture temporal patterns in user behavior
    """
    datetime_features = pd.DataFrame(index=df['user_id'].unique())
    
    # Calculate month and hour features
    df['month'] = df['event_time'].dt.month
    df['hour'] = df['event_time'].dt.hour
    df['is_weekend'] = df['event_time'].dt.dayofweek.isin([5, 6]).astype(int)
    
    # Average and standard deviation for month and hour
    datetime_features['ses_mo_avg'] = df.groupby('user_id')['month'].mean()
    datetime_features['ses_mo_sd'] = df.groupby('user_id')['month'].std()
    datetime_features['ses_hr_avg'] = df.groupby('user_id')['hour'].mean()
    datetime_features['ses_hr_sd'] = df.groupby('user_id')['hour'].std()
    
    # Weekend ratio
    datetime_features['ses_wknd_r'] = df.groupby('user_id')['is_weekend'].mean()
    
    return datetime_features

In [32]:
datetime_features = create_datetime_features(df)
print("DateTime features:", datetime_features.columns.tolist())

DateTime features: ['ses_mo_avg', 'ses_mo_sd', 'ses_hr_avg', 'ses_hr_sd', 'ses_wknd_r']


In [33]:
datetime_features

Unnamed: 0,ses_mo_avg,ses_mo_sd,ses_hr_avg,ses_hr_sd,ses_wknd_r
1515915625519380000,9.035714,1.220496,13.125000,2.472117,0.071429
1515915625513230000,9.500000,0.577350,12.000000,0.816497,0.750000
1515915625519010000,9.714286,1.253566,13.285714,2.429972,0.714286
1515915625510740000,9.000000,,11.000000,,0.000000
1515915625519320000,9.555556,2.285913,12.361111,5.188739,0.194444
...,...,...,...,...,...
1515915625610980000,2.000000,0.000000,19.509804,0.540354,1.000000
1515915625610990000,2.000000,0.000000,20.099237,0.369094,1.000000
1515915625611000000,2.000000,0.000000,20.845361,0.363439,1.000000
1515915625611010000,2.000000,0.000000,22.218750,0.557080,1.000000


In [34]:
def create_other_features(df):
    """
    Create other miscellaneous features
    These features capture additional behavioral patterns
    """
    other_features = pd.DataFrame(index=df['user_id'].unique())
    
    # Average session length (in minutes)
    session_lengths = df.groupby(['user_id', 'user_session']).agg(
        session_length=('event_time', lambda x: (x.max() - x.min()).total_seconds() / 60)
    )
    other_features['ses_len_avg'] = session_lengths.groupby('user_id')['session_length'].mean()
    
    # Time between interactions (in minutes)
    df_sorted = df.sort_values(['user_id', 'event_time'])
    df_sorted['next_event'] = df_sorted.groupby('user_id')['event_time'].shift(-1)
    df_sorted['time_gap'] = (df_sorted['next_event'] - df_sorted['event_time']).dt.total_seconds() / 60
    other_features['time_to_int'] = df_sorted.groupby('user_id')['time_gap'].mean()
    
    # Time between purchases (in days)
    purchases = df[df['event_type'] == 'purchase'].sort_values(['user_id', 'event_time'])
    purchases['next_purchase'] = purchases.groupby('user_id')['event_time'].shift(-1)
    purchases['purchase_gap'] = (purchases['next_purchase'] - purchases['event_time']).dt.total_seconds() / (24 * 3600)
    other_features['time_to_tran'] = purchases.groupby('user_id')['purchase_gap'].mean()
    
    return other_features

In [35]:
other_features = create_other_features(df)
print("Other features:", other_features.columns.tolist())

Other features: ['ses_len_avg', 'time_to_int', 'time_to_tran']


In [36]:
other_features

Unnamed: 0,ses_len_avg,time_to_int,time_to_tran
1515915625519380000,884.188426,2968.242727,
1515915625513230000,1.927778,4332.261111,
1515915625519010000,3.766667,17295.605556,
1515915625510740000,0.000000,,
1515915625519320000,18.053175,5250.177619,
...,...,...,...
1515915625610980000,3.524242,1.008911,0.004485
1515915625610990000,3.149020,1.579359,0.005307
1515915625611000000,0.864048,0.564757,
1515915625611010000,2.036111,0.706394,


In [37]:
# Combine all features
all_features = pd.concat([
    recency_features,
    frequency_features,
    monetary_features,
    category_features,
    datetime_features,
    other_features
], axis=1)

# Fill NaN values appropriately
all_features = all_features.fillna(0)

print("\nFinal feature matrix shape:", all_features.shape)


Final feature matrix shape: (13353, 30)


In [39]:
all_features.to_csv('data/events_features.csv')

In [51]:
df_features = pd.read_csv('data/events_features.csv')
df_features

Unnamed: 0.1,Unnamed: 0,ses_rec,ses_rec_avg,ses_rec_sd,ses_rec_cv,user_rec,ses_n,ses_n_r,int_n,view_count,...,int_cat_n_avg,int_itm_n_avg,ses_mo_avg,ses_mo_sd,ses_hr_avg,ses_hr_sd,ses_wknd_r,ses_len_avg,time_to_int,time_to_tran
0,1515915625519380000,44.131042,2.694893,8.004473,297.023819,157.501424,36,0.317543,56,55,...,0.472222,1.222222,9.035714,1.220496,13.125000,2.472117,0.071429,884.188426,2968.242727,0.000000
1,1515915625513230000,148.475637,4.510764,2.074016,45.979268,157.501181,3,0.332390,4,3,...,0.666667,1.000000,9.500000,0.577350,12.000000,0.816497,0.750000,1.927778,4332.261111,0.000000
2,1515915625519010000,85.436088,24.021674,19.545232,81.364985,157.501111,4,0.055505,7,7,...,0.750000,1.750000,9.714286,1.253566,13.285714,2.429972,0.714286,3.766667,17295.605556,0.000000
3,1515915625510740000,157.501076,0.000000,0.000000,0.000000,157.501076,1,1.000000,1,1,...,1.000000,1.000000,9.000000,0.000000,11.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,1515915625519320000,29.892025,6.380198,11.645592,182.527130,157.500509,21,0.164566,36,34,...,0.476190,1.523810,9.555556,2.285913,12.361111,5.188739,0.194444,18.053175,5250.177619,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13348,1515915625610980000,0.111019,0.000778,0.002715,348.901265,0.181782,66,66.000000,102,93,...,0.303030,1.000000,2.000000,0.000000,19.509804,0.540354,1.000000,3.524242,1.008911,0.004485
13349,1515915625610990000,0.013669,0.000394,0.000584,147.985436,0.156250,85,85.000000,131,110,...,0.317647,0.988235,2.000000,0.000000,20.099237,0.369094,1.000000,3.149020,1.579359,0.005307
13350,1515915625611000000,0.089572,0.000546,0.000612,112.182623,0.127222,70,70.000000,97,91,...,0.300000,1.042857,2.000000,0.000000,20.845361,0.363439,1.000000,0.864048,0.564757,0.000000
13351,1515915625611010000,0.009387,0.000876,0.002275,259.542344,0.087384,90,90.000000,160,147,...,0.233333,1.100000,2.000000,0.000000,22.218750,0.557080,1.000000,2.036111,0.706394,0.000000


In [52]:
df_features.columns

Index(['Unnamed: 0', 'ses_rec', 'ses_rec_avg', 'ses_rec_sd', 'ses_rec_cv',
       'user_rec', 'ses_n', 'ses_n_r', 'int_n', 'view_count', 'cart_count',
       'purchase_count', 'int_n_r', 'tran_n', 'tran_n_r', 'rev_sum',
       'rev_per_purchase', 'rev_sum_r', 'major_spend_r', 'int_cat_n',
       'int_itm_n', 'int_cat_n_avg', 'int_itm_n_avg', 'ses_mo_avg',
       'ses_mo_sd', 'ses_hr_avg', 'ses_hr_sd', 'ses_wknd_r', 'ses_len_avg',
       'time_to_int', 'time_to_tran'],
      dtype='object')

### Adding more features

In [45]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from scipy.stats import skew

In [46]:
def load_existing_features():
    """Load and prepare existing features with proper user tracking"""
    features_df = pd.read_csv('data/events_features.csv')
    # Rename the index column to user_id
    features_df = features_df.rename(columns={'Unnamed: 0': 'user_id'})
    features_df = features_df.set_index('user_id')
    return features_df

In [48]:
existing_features = load_existing_features()
existing_features

Unnamed: 0_level_0,ses_rec,ses_rec_avg,ses_rec_sd,ses_rec_cv,user_rec,ses_n,ses_n_r,int_n,view_count,cart_count,...,int_cat_n_avg,int_itm_n_avg,ses_mo_avg,ses_mo_sd,ses_hr_avg,ses_hr_sd,ses_wknd_r,ses_len_avg,time_to_int,time_to_tran
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1515915625519380000,44.131042,2.694893,8.004473,297.023819,157.501424,36,0.317543,56,55,1,...,0.472222,1.222222,9.035714,1.220496,13.125000,2.472117,0.071429,884.188426,2968.242727,0.000000
1515915625513230000,148.475637,4.510764,2.074016,45.979268,157.501181,3,0.332390,4,3,0,...,0.666667,1.000000,9.500000,0.577350,12.000000,0.816497,0.750000,1.927778,4332.261111,0.000000
1515915625519010000,85.436088,24.021674,19.545232,81.364985,157.501111,4,0.055505,7,7,0,...,0.750000,1.750000,9.714286,1.253566,13.285714,2.429972,0.714286,3.766667,17295.605556,0.000000
1515915625510740000,157.501076,0.000000,0.000000,0.000000,157.501076,1,1.000000,1,1,0,...,1.000000,1.000000,9.000000,0.000000,11.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1515915625519320000,29.892025,6.380198,11.645592,182.527130,157.500509,21,0.164566,36,34,2,...,0.476190,1.523810,9.555556,2.285913,12.361111,5.188739,0.194444,18.053175,5250.177619,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1515915625610980000,0.111019,0.000778,0.002715,348.901265,0.181782,66,66.000000,102,93,6,...,0.303030,1.000000,2.000000,0.000000,19.509804,0.540354,1.000000,3.524242,1.008911,0.004485
1515915625610990000,0.013669,0.000394,0.000584,147.985436,0.156250,85,85.000000,131,110,12,...,0.317647,0.988235,2.000000,0.000000,20.099237,0.369094,1.000000,3.149020,1.579359,0.005307
1515915625611000000,0.089572,0.000546,0.000612,112.182623,0.127222,70,70.000000,97,91,5,...,0.300000,1.042857,2.000000,0.000000,20.845361,0.363439,1.000000,0.864048,0.564757,0.000000
1515915625611010000,0.009387,0.000876,0.002275,259.542344,0.087384,90,90.000000,160,147,12,...,0.233333,1.100000,2.000000,0.000000,22.218750,0.557080,1.000000,2.036111,0.706394,0.000000


In [50]:
df = pd.read_csv('data/events_cleaned.csv')
df['event_time'] = pd.to_datetime(df['event_time'])
ref_date = df['event_time'].max()

In [53]:
def create_engagement_features(df, ref_date):
    """Create new engagement-based features"""
    engagement = pd.DataFrame(index=df['user_id'].unique())
    
    # Inactivity Period
    last_sessions = df.groupby('user_id')['event_time'].max()
    engagement['inactive_days'] = (ref_date - last_sessions).dt.total_seconds() / (24 * 3600)
    
    # Peak Activity Hour and Day
    df['hour'] = df['event_time'].dt.hour
    df['day_of_week'] = df['event_time'].dt.dayofweek
    
    hour_counts = df.groupby(['user_id', 'hour']).size().unstack(fill_value=0)
    day_counts = df.groupby(['user_id', 'day_of_week']).size().unstack(fill_value=0)
    
    engagement['peak_activity_hr'] = hour_counts.idxmax(axis=1)
    engagement['peak_activity_day'] = day_counts.idxmax(axis=1)
    
    # Off-Hours Activity Rate (12 AM to 6 AM)
    total_interactions = df.groupby('user_id').size()
    off_hours = df[df['event_time'].dt.hour.between(0, 5)].groupby('user_id').size()
    engagement['off_hours_rate'] = (off_hours / total_interactions).fillna(0)
    
    return engagement

In [54]:
engagement_features = create_engagement_features(df, ref_date)
print("Engagement features:", engagement_features.columns.tolist())

Engagement features: ['inactive_days', 'peak_activity_hr', 'peak_activity_day', 'off_hours_rate']


In [55]:
engagement_features

Unnamed: 0,inactive_days,peak_activity_hr,peak_activity_day,off_hours_rate
1515915625519380000,44.131042,12,3,0.000000
1515915625513230000,148.475637,12,5,0.000000
1515915625519010000,85.436088,15,5,0.000000
1515915625510740000,157.501076,11,3,0.000000
1515915625519320000,29.892025,9,3,0.055556
...,...,...,...,...
1515915625610980000,0.111019,19,6,0.000000
1515915625610990000,0.013669,20,6,0.000000
1515915625611000000,0.089572,21,6,0.000000
1515915625611010000,0.009387,22,6,0.000000


In [56]:
def create_session_pattern_features(df):
    """Create new session pattern features"""
    session_patterns = pd.DataFrame(index=df['user_id'].unique())
    
    # Calculate session lengths
    session_data = df.groupby(['user_id', 'user_session']).agg(
        session_length=('event_time', lambda x: (x.max() - x.min()).total_seconds() / 60)
    )
    
    # Session length statistics
    session_patterns['ses_len_sd'] = session_data.groupby('user_id')['session_length'].std()
    session_patterns['ses_len_cv'] = (session_patterns['ses_len_sd'] / 
                                    session_data.groupby('user_id')['session_length'].mean())
    
    # Long and short session ratios
    long_sessions = (session_data['session_length'] > 30).groupby('user_id').mean()
    short_sessions = (session_data['session_length'] < 5).groupby('user_id').mean()
    session_patterns['long_ses_ratio'] = long_sessions
    session_patterns['short_ses_ratio'] = short_sessions
    
    # Session gaps
    session_times = df.groupby(['user_id', 'user_session'])['event_time'].min()
    session_gaps = session_times.groupby('user_id').diff().dt.total_seconds() / (24 * 3600)
    session_patterns['ses_gap_sd'] = session_gaps.groupby('user_id').std()
    
    return session_patterns

In [57]:
session_pattern_features = create_session_pattern_features(df)
print("Session pattern features:", session_pattern_features.columns.tolist())

Session pattern features: ['ses_len_sd', 'ses_len_cv', 'long_ses_ratio', 'short_ses_ratio', 'ses_gap_sd']


In [58]:
session_pattern_features

Unnamed: 0,ses_len_sd,ses_len_cv,long_ses_ratio,short_ses_ratio,ses_gap_sd
1515915625519380000,4607.604825,5.211112,0.055556,0.888889,25.986468
1515915625513230000,3.339009,1.732051,0.000000,0.666667,10.605783
1515915625519010000,7.533333,2.000000,0.000000,0.750000,52.829279
1515915625510740000,,,0.000000,1.000000,
1515915625519320000,71.361248,3.952837,0.047619,0.761905,44.954132
...,...,...,...,...,...
1515915625610980000,14.220453,4.035038,0.030303,0.909091,0.012182
1515915625610990000,18.857814,5.988472,0.023529,0.929412,0.012187
1515915625611000000,3.084320,3.569619,0.000000,0.957143,0.016753
1515915625611010000,7.468266,3.667907,0.011111,0.888889,0.027727


In [59]:
def create_interaction_behavior_features(df, ref_date):
    """Create new interaction behavior features"""
    interaction = pd.DataFrame(index=df['user_id'].unique())
    
    # Interaction recency
    last_interaction = df.groupby('user_id')['event_time'].max()
    interaction['int_rec'] = (ref_date - last_interaction).dt.total_seconds() / (24 * 3600)
    
    # Time between interactions
    df_sorted = df.sort_values(['user_id', 'event_time'])
    df_sorted['next_time'] = df_sorted.groupby('user_id')['event_time'].shift(-1)
    df_sorted['time_gap'] = (df_sorted['next_time'] - df_sorted['event_time']).dt.total_seconds() / 60
    interaction['int_rec_sd'] = df_sorted.groupby('user_id')['time_gap'].std()
    
    # View-to-cart and cart-to-purchase times
    def calculate_transition_time(group, from_event, to_event):
        from_times = group[group['event_type'] == from_event]['event_time']
        to_times = group[group['event_type'] == to_event]['event_time']
        if len(from_times) == 0 or len(to_times) == 0:
            return np.nan
        return (min(to_times) - min(from_times)).total_seconds() / 60
    
    transitions = df.groupby(['user_id', 'user_session']).apply(
        lambda x: pd.Series({
            'view_to_cart': calculate_transition_time(x, 'view', 'cart'),
            'cart_to_purchase': calculate_transition_time(x, 'cart', 'purchase')
        })
    )
    
    interaction['view_to_cart_avg'] = transitions.groupby('user_id')['view_to_cart'].mean()
    interaction['cart_to_purchase_avg'] = transitions.groupby('user_id')['cart_to_purchase'].mean()
    
    # Interaction skewness
    interaction_counts = df.groupby(['user_id', 'user_session']).size()
    interaction['int_skew'] = interaction_counts.groupby('user_id').apply(lambda x: skew(x))
    
    return interaction

In [60]:
interaction_features = create_interaction_behavior_features(df, ref_date)
print("Interaction features:", interaction_features.columns.tolist())

  transitions = df.groupby(['user_id', 'user_session']).apply(
  interaction['int_skew'] = interaction_counts.groupby('user_id').apply(lambda x: skew(x))


Interaction features: ['int_rec', 'int_rec_sd', 'view_to_cart_avg', 'cart_to_purchase_avg', 'int_skew']


In [61]:
interaction_features

Unnamed: 0,int_rec,int_rec_sd,view_to_cart_avg,cart_to_purchase_avg,int_skew
1515915625519380000,44.131042,9907.394884,0.616667,,3.988816
1515915625513230000,148.475637,4301.005406,,,0.707107
1515915625519010000,85.436088,25990.000019,,,1.154701
1515915625510740000,157.501076,,,,
1515915625519320000,29.892025,13338.136911,0.033333,,1.598882
...,...,...,...,...,...
1515915625610980000,0.111019,2.752214,3.622222,1.541667,2.950192
1515915625610990000,0.013669,9.497503,2.888889,1.170833,3.511591
1515915625611000000,0.089572,0.599000,3.540000,1.200000,3.674598
1515915625611010000,0.009387,1.617989,2.554167,4.466667,3.948167


In [62]:
def create_revenue_pattern_features(df):
    """Create new revenue pattern features"""
    revenue = pd.DataFrame(index=df['user_id'].unique())
    
    # High spending spike flag
    purchase_data = df[df['event_type'] == 'purchase'].copy()
    avg_spending = purchase_data.groupby('user_id')['price'].mean()
    max_spending = purchase_data.groupby('user_id')['price'].max()
    revenue['high_spike_flag'] = (max_spending > (2 * avg_spending)).astype(int)
    
    # Revenue per interaction
    total_revenue = purchase_data.groupby('user_id')['price'].sum()
    total_interactions = df.groupby('user_id').size()
    revenue['rev_per_int'] = total_revenue / total_interactions
    
    # Purchase consistency
    revenue['purchase_consistency'] = purchase_data.groupby('user_id')['price'].std()
    
    return revenue

In [63]:
revenue_features = create_revenue_pattern_features(df)
print("Revenue features:", revenue_features.columns.tolist())

Revenue features: ['high_spike_flag', 'rev_per_int', 'purchase_consistency']


In [64]:
revenue_features

Unnamed: 0,high_spike_flag,rev_per_int,purchase_consistency
1515915625519380000,,,
1515915625513230000,0.0,35.157500,
1515915625519010000,,,
1515915625510740000,,,
1515915625519320000,,,
...,...,...,...
1515915625610980000,0.0,12.001471,102.000288
1515915625610990000,0.0,22.543435,143.652731
1515915625611000000,0.0,0.400309,
1515915625611010000,0.0,0.094250,


In [65]:
def create_category_consistency_features(df):
    """Create new category consistency features"""
    category = pd.DataFrame(index=df['user_id'].unique())
    
    # Popular category consistency
    cat_sessions = df.groupby(['user_id', 'user_session', 'category_code']).size().reset_index()
    most_frequent_cat = cat_sessions.groupby(['user_id', 'category_code']).size()
    total_sessions = df.groupby('user_id')['user_session'].nunique()
    category['pop_cat_consistency'] = (most_frequent_cat.groupby('user_id').max() / total_sessions)
    
    # Cross-category ratio
    multi_cat_sessions = (cat_sessions.groupby(['user_id', 'user_session']).size() > 1)
    category['cross_cat_ratio'] = multi_cat_sessions.groupby('user_id').mean()
    
    return category

In [66]:
category_features = create_category_consistency_features(df)
print("Category features:", category_features.columns.tolist())

Category features: ['pop_cat_consistency', 'cross_cat_ratio']


In [67]:
category_features

Unnamed: 0,pop_cat_consistency,cross_cat_ratio
1515915625519380000,0.472222,0.055556
1515915625513230000,0.666667,0.000000
1515915625519010000,0.500000,0.000000
1515915625510740000,1.000000,0.000000
1515915625519320000,0.333333,0.095238
...,...,...
1515915625610980000,0.378788,0.015152
1515915625610990000,0.435294,0.011765
1515915625611000000,0.385714,0.057143
1515915625611010000,0.511111,0.022222


In [68]:
# Combine all features
all_features = pd.concat([
    existing_features,
    engagement_features,
    session_pattern_features,
    interaction_features,
    revenue_features,
    category_features
], axis=1)

# Fill NaN values appropriately
all_features = all_features.fillna(0)

In [69]:
print("\nFinal feature matrix shape:", all_features.shape)


Final feature matrix shape: (13353, 49)


In [70]:
# Save updated features
all_features.to_csv('data/events_features_extended.csv')

In [72]:
all_features.columns

Index(['ses_rec', 'ses_rec_avg', 'ses_rec_sd', 'ses_rec_cv', 'user_rec',
       'ses_n', 'ses_n_r', 'int_n', 'view_count', 'cart_count',
       'purchase_count', 'int_n_r', 'tran_n', 'tran_n_r', 'rev_sum',
       'rev_per_purchase', 'rev_sum_r', 'major_spend_r', 'int_cat_n',
       'int_itm_n', 'int_cat_n_avg', 'int_itm_n_avg', 'ses_mo_avg',
       'ses_mo_sd', 'ses_hr_avg', 'ses_hr_sd', 'ses_wknd_r', 'ses_len_avg',
       'time_to_int', 'time_to_tran', 'inactive_days', 'peak_activity_hr',
       'peak_activity_day', 'off_hours_rate', 'ses_len_sd', 'ses_len_cv',
       'long_ses_ratio', 'short_ses_ratio', 'ses_gap_sd', 'int_rec',
       'int_rec_sd', 'view_to_cart_avg', 'cart_to_purchase_avg', 'int_skew',
       'high_spike_flag', 'rev_per_int', 'purchase_consistency',
       'pop_cat_consistency', 'cross_cat_ratio'],
      dtype='object')

In [74]:
all_features

Unnamed: 0,ses_rec,ses_rec_avg,ses_rec_sd,ses_rec_cv,user_rec,ses_n,ses_n_r,int_n,view_count,cart_count,...,int_rec,int_rec_sd,view_to_cart_avg,cart_to_purchase_avg,int_skew,high_spike_flag,rev_per_int,purchase_consistency,pop_cat_consistency,cross_cat_ratio
1515915625519380000,44.131042,2.694893,8.004473,297.023819,157.501424,36,0.317543,56,55,1,...,44.131042,9907.394884,0.616667,0.000000,3.988816,0.0,0.000000,0.000000,0.472222,0.055556
1515915625513230000,148.475637,4.510764,2.074016,45.979268,157.501181,3,0.332390,4,3,0,...,148.475637,4301.005406,0.000000,0.000000,0.707107,0.0,35.157500,0.000000,0.666667,0.000000
1515915625519010000,85.436088,24.021674,19.545232,81.364985,157.501111,4,0.055505,7,7,0,...,85.436088,25990.000019,0.000000,0.000000,1.154701,0.0,0.000000,0.000000,0.500000,0.000000
1515915625510740000,157.501076,0.000000,0.000000,0.000000,157.501076,1,1.000000,1,1,0,...,157.501076,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,1.000000,0.000000
1515915625519320000,29.892025,6.380198,11.645592,182.527130,157.500509,21,0.164566,36,34,2,...,29.892025,13338.136911,0.033333,0.000000,1.598882,0.0,0.000000,0.000000,0.333333,0.095238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1515915625610980000,0.111019,0.000778,0.002715,348.901265,0.181782,66,66.000000,102,93,6,...,0.111019,2.752214,3.622222,1.541667,2.950192,0.0,12.001471,102.000288,0.378788,0.015152
1515915625610990000,0.013669,0.000394,0.000584,147.985436,0.156250,85,85.000000,131,110,12,...,0.013669,9.497503,2.888889,1.170833,3.511591,0.0,22.543435,143.652731,0.435294,0.011765
1515915625611000000,0.089572,0.000546,0.000612,112.182623,0.127222,70,70.000000,97,91,5,...,0.089572,0.599000,3.540000,1.200000,3.674598,0.0,0.400309,0.000000,0.385714,0.057143
1515915625611010000,0.009387,0.000876,0.002275,259.542344,0.087384,90,90.000000,160,147,12,...,0.009387,1.617989,2.554167,4.466667,3.948167,0.0,0.094250,0.000000,0.511111,0.022222
