In [3]:
# For data handling
import pandas as pd
import numpy as np
import scipy.stats as stats

# Silence every warning for the rest of the notebook session.
# NOTE(review): the blanket 'ignore' filter also hides deprecation and runtime
# warnings raised by the cells below - consider narrowing it by category.
from warnings import filterwarnings
filterwarnings('ignore')

In [4]:
# Load the raw session log, key it by user, and mark every missing cell with -1
sessions = (
    pd.read_csv("../Data/sessions.csv")
    .set_index("user_id")
    .fillna(-1)
)

In [5]:
# Replace infrequent values with 'Other' to avoid overfitting.
# A value counts as infrequent when it occurs in fewer than 0.5% of all
# session rows.
THRESHOLD = 0.005*sessions.shape[0]

# The same treatment applies to every categorical column, so loop instead of
# repeating the statements once per column.
for col in ["action", "action_type", "device_type", "action_detail"]:
    counts = sessions[col].value_counts()
    # Look up the frequency of each row's own value in one vectorised pass
    # (replaces a Python-level counts[x] call per row).
    value_frequency = sessions[col].map(counts)
    # Keep frequent values untouched; overwrite only the rare ones.
    sessions[col] = sessions[col].mask(value_frequency < THRESHOLD, 'Other')

In [None]:
# For each user get the count of different values of each categorical attribute.
# The result has one row per user_id and one integer column per
# (attribute, value) pair, named '<attribute>=<value>'.
column_list = ["action", "action_type", "device_type", "action_detail"]
df_extracted_sessions = []

for col in column_list:
    # Count every (user, value) combination in a single grouped pass and pivot
    # the values into columns; combinations that never occur become 0.
    # This replaces one full groupby/apply over the frame per distinct value.
    value_counts = sessions.groupby([sessions.index, col]).size().unstack(fill_value=0)
    # Order the columns by first appearance of each value in the data.
    value_counts = value_counts.reindex(columns=sessions[col].unique(), fill_value=0)
    value_counts.columns = ['%s=%s' % (col, val) for val in value_counts.columns]
    df_extracted_sessions.append(value_counts)

frequency_counts = pd.concat(df_extracted_sessions, axis=1)

In [None]:
# Per-user summary statistics (mean, std, median, skew) of secs_elapsed.
secs_elapsed_data = sessions["secs_elapsed"].groupby(sessions.index).aggregate(
    [np.mean, np.std, np.median, stats.skew])
# Prefix each statistic column with the source column name. The new labels
# must be assigned to .columns: rebinding secs_elapsed_data itself to the list
# of names would throw away the aggregated frame and break the later concat.
secs_elapsed_data.columns = ['%s_%s' % ("secs_elapsed", i) for i in secs_elapsed_data.columns]

In [None]:
# Combine the per-user value counts with the secs_elapsed statistics side by
# side, mark any remaining gaps with -1, and persist the feature table.
sessions_processed = pd.concat(
    [frequency_counts, secs_elapsed_data], axis=1
).fillna(-1)
sessions_processed.to_csv("../Data/sessions_processed.csv", header=True)