# Load Dependencies

In [1]:
from functools import reduce
import pandas as pd
import config as cfg
import warnings
warnings.filterwarnings('ignore')

# Load Datasets

In [2]:
def _load_events(path, ts_col, dt_col):
    """Read one tab-separated UTF-8 file and add a parsed datetime column.

    `ts_col` is interpreted as epoch milliseconds (`unit='ms'`) and the
    converted values are stored in a new column named `dt_col`.
    """
    frame = pd.read_csv(path, sep='\t', encoding='utf-8')
    frame[dt_col] = pd.to_datetime(frame[ts_col], unit='ms')
    return frame


call_df = _load_events(cfg.CALL_DATA_PTH, 'call_ts', 'call_datetime')
message_df = _load_events(cfg.MSG_DATA_PTH, 'message_ts', 'message_datetime')
search_df = _load_events(cfg.SEARCH_DATA_PTH, 'search_ts', 'search_datetime')
signup_df = _load_events(cfg.SIGNUP_DATA_PTH, 'signup_ts', 'signup_datetime')

# Feature Engineering

## Activity Features

In [3]:
print("=== USER DATAMART ===\n")

# One row per signed-up user; every later feature is left-joined onto this frame.
features = signup_df[['user_id', 'country_code', 'signup_datetime']].copy()

# Number of logged events per user in each activity table.
call_counts = call_df.groupby('user_id').size().reset_index(name='total_calls')
message_counts = message_df.groupby('user_id').size().reset_index(name='total_messages')
search_counts = search_df.groupby('user_id').size().reset_index(name='total_searches')

features = features.merge(call_counts, on='user_id', how='left')
features = features.merge(message_counts, on='user_id', how='left')
features = features.merge(search_counts, on='user_id', how='left')

# Users absent from a log come out of the left join as NaN, which means zero events.
# Fill only the count columns so that a missing country_code or signup_datetime
# is not silently replaced by 0.
count_cols = ['total_calls', 'total_messages', 'total_searches']
features[count_cols] = features[count_cols].fillna(0)

features.head(5)

=== USER DATAMART ===

Country Distribution:


Unnamed: 0,user_id,country_code,signup_datetime,total_calls,total_messages,total_searches
0,10000000,IT,2019-03-18,36.0,1.0,16
1,10000001,IT,2019-03-03,64.0,0.0,20
2,10000002,SE,2019-04-02,5.0,2.0,17
3,10000003,IT,2019-03-27,60.0,0.0,38
4,10000004,IT,2019-04-01,24.0,0.0,37


## First/Last Transaction Features

In [4]:
for activity, df in [('call', call_df), ('message', message_df), ('search', search_df)]:

    # Earliest and latest event time, plus event count, per user
    activity_stats = df.groupby('user_id')[f'{activity}_datetime'].agg(['min', 'max', 'count']).reset_index()
    activity_stats.columns = ['user_id', f'first_{activity}', f'last_{activity}', f'{activity}_frequency']

    # Drop columns left behind by a previous run of this cell; otherwise the merge
    # would suffix the duplicates with _x/_y and the lookups below would fail.
    derived_cols = list(activity_stats.columns[1:]) + [
        f'days_signup_to_first_{activity}',
        f'{activity}_span_days',
    ]
    features = features.drop(columns=derived_cols, errors='ignore')

    # Left join: users with no events of this type get NaT/NaN in the new columns
    features = features.merge(activity_stats, on='user_id', how='left')

    # Days since signup to first activity
    features[f'days_signup_to_first_{activity}'] = (
        features[f'first_{activity}'] - features['signup_datetime']
    ).dt.total_seconds() / 86400  # seconds -> days

    # Activity span in days (0 when the user has a single event)
    features[f'{activity}_span_days'] = (
        features[f'last_{activity}'] - features[f'first_{activity}']
    ).dt.total_seconds() / 86400

features.head(5)

Unnamed: 0,user_id,country_code,signup_datetime,total_calls,total_messages,total_searches,first_call,last_call,call_frequency,days_signup_to_first_call,...,first_message,last_message,message_frequency,days_signup_to_first_message,message_span_days,first_search,last_search,search_frequency,days_signup_to_first_search,search_span_days
0,10000000,IT,2019-03-18,36.0,1.0,16,2019-03-19 07:41:26.148,2019-04-03 22:05:00.575,36.0,1.320442,...,2019-04-02 16:55:06.197,2019-04-02 16:55:06.197,1.0,15.704933,0.0,2019-03-18 19:06:00.635,2019-03-25 17:39:25.361,16,0.795841,6.93987
1,10000001,IT,2019-03-03,64.0,0.0,20,2019-03-04 00:44:47.577,2019-03-18 16:54:08.356,64.0,1.031106,...,NaT,NaT,,,,2019-03-03 13:57:40.041,2019-03-18 15:51:11.324,20,0.581713,15.078834
2,10000002,SE,2019-04-02,5.0,2.0,17,2019-04-02 22:10:07.048,2019-04-03 17:28:01.974,5.0,0.923693,...,2019-04-03 12:04:51.344,2019-04-03 15:06:18.428,2.0,1.503372,0.126008,2019-04-02 13:09:23.146,2019-04-03 14:17:52.512,17,0.548185,1.047562
3,10000003,IT,2019-03-27,60.0,0.0,38,2019-03-27 16:51:28.872,2019-04-03 21:33:40.076,60.0,0.702418,...,NaT,NaT,,,,2019-03-27 11:55:39.487,2019-04-02 09:51:20.150,38,0.496985,5.913665
4,10000004,IT,2019-04-01,24.0,0.0,37,2019-04-02 04:02:43.526,2019-04-03 18:09:55.061,24.0,1.168559,...,NaT,NaT,,,,2019-04-01 21:10:00.782,2019-04-03 10:45:45.179,37,0.881953,1.566486


## Behavioral Features - Activity Rates

In [5]:
# Events per day over each activity's own first-to-last span. The +1 keeps the
# denominator non-zero when the first and last event coincide. Users with no events
# of a type have a NaN span; it is treated as 0 so their rate is 0 instead of NaN.
features['call_rate_per_day'] = features['total_calls'] / (features['call_span_days'].fillna(0) + 1)
# NOTE(review): the next two columns hold the message and search rates despite the
# "call_" prefix. The names are kept because the exported column names may be
# consumed elsewhere -- confirm before renaming.
features['call_message_per_day'] = features['total_messages'] / (features['message_span_days'].fillna(0) + 1)
features['call_searches_per_day'] = features['total_searches'] / (features['search_span_days'].fillna(0) + 1)
# Combined event count across the three activity types.
features['total_activity'] = features['total_calls'] + features['total_messages'] + features['total_searches']

## Behavioral Features - Scraper Specific

In [6]:
# Searches relative to calls plus messages (scrapers search more than they
# communicate); the +1 keeps the denominator non-zero.
features['search_to_communication_ratio'] = features['total_searches'].div(
    features['total_calls'].add(features['total_messages']).add(1)
)

# 1 when any first call/message/search happened within one day of signup
# (bots often start immediately), else 0. Missing values compare as False.
features['immediate_activity'] = (
    features[[
        'days_signup_to_first_call',
        'days_signup_to_first_message',
        'days_signup_to_first_search',
    ]]
    .le(1)
    .any(axis=1)
    .astype(int)
)

# Total events divided by the longest of the three activity spans, plus one day.
features['activity_concentration'] = features['total_activity'].div(
    features[['call_span_days', 'message_span_days', 'search_span_days']].max(axis=1).add(1)
)

In [7]:
def compute_interval_stats(df, user_col, time_col, prefix):
    """Summarise the gaps between consecutive events of each user.

    Rows are ordered by user and time, the difference to the same user's
    previous row is taken and divided by 1000 (i.e. `time_col` is treated as
    milliseconds and the gap is expressed in seconds).

    Parameters
    ----------
    df : DataFrame holding at least `user_col` and `time_col`; it is not modified.
    user_col : name of the column identifying the user.
    time_col : name of the numeric timestamp column.
    prefix : text placed in front of the generated column names.

    Returns
    -------
    DataFrame with `user_col` plus `<prefix>_interval_mean_sec`,
    `<prefix>_interval_min_sec` and `<prefix>_interval_std_sec`. Users with
    fewer than two events have no gap and therefore do not appear.
    """
    ordered = df.sort_values([user_col, time_col])

    # Gap to the same user's previous event, ms -> seconds
    gaps = ordered.groupby(user_col)[time_col].diff() / 1000

    # Each user's first event has no predecessor (NaN gap) and is discarded
    ordered = ordered.assign(interval=gaps).dropna(subset=['interval'])

    # (output column name, aggregation) pairs, in output order
    named_stats = [
        (f'{prefix}_interval_{stat}_sec', stat) for stat in ('mean', 'min', 'std')
    ]

    return ordered.groupby(user_col)['interval'].agg(named_stats).reset_index()

# Per-user interval statistics for each activity log (raw *_ts columns).
search_stats = compute_interval_stats(search_df, 'user_id', 'search_ts', 'search')
call_stats = compute_interval_stats(call_df, 'user_id', 'call_ts', 'call')
msg_stats = compute_interval_stats(message_df, 'user_id', 'message_ts', 'message')

# Outer-join the three tables so a user present in any one of them is kept.
dfs = [search_stats, call_stats, msg_stats]
user_stats = reduce(lambda left, right: pd.merge(left, right, on='user_id', how='outer'), dfs)

# Remove interval columns left by an earlier run of this cell so that re-running it
# does not produce _x/_y suffixed duplicates.
interval_cols = [col for col in user_stats.columns if col != 'user_id']
features = features.drop(columns=interval_cols, errors='ignore')

# Left join: users without interval statistics get NaN in the new columns.
features = features.merge(user_stats, on='user_id', how='left')

In [8]:
# Persist the engineered features. The row index is written as well (pandas'
# default, spelled out here), so the file starts with an unnamed index column.
features.to_csv(cfg.USER_FEATURES_ENG_DATA_PTH, index=True)