In [4]:
import numpy as np
import pandas as pd
import scipy.stats as stats

# Silence every warning category for the rest of the session.
# Note: this also hides pandas' chained-assignment / deprecation warnings.
from warnings import filterwarnings
filterwarnings(action='ignore')

In [5]:
# Load the raw session log from disk into a DataFrame
sessions_data = pd.read_csv(filepath_or_buffer="../Data/sessions.csv")

In [6]:
# Group all null user_ids into a single "NAN" pseudo-user.
# The write has to go through .loc on the original frame: indexing with a
# boolean mask first (sessions_data[mask].user_id = ...) assigns into a
# temporary copy and leaves sessions_data itself untouched.
sessions_data.loc[sessions_data["user_id"].isnull(), "user_id"] = "NAN"

In [7]:
# One row per distinct user_id, re-indexed from 0; the per-user features
# computed in the following cells are joined onto this frame
sessions_data_ids = sessions_data[["user_id"]].drop_duplicates().reset_index(drop=True)

In [8]:
# Working frame for the action feature: just the user id and the action column
sessions_data_action = sessions_data[["user_id", "action"]]

In [9]:
# Processing for the action attribute
# Missing actions become their own explicit 'NAN' category
sessions_data_action["action"] = sessions_data_action["action"].fillna("NAN")

# Collapse infrequent values into 'Other' to avoid overfitting and speed up
# computation: any action that accounts for fewer than 0.5% of all rows.
THRESHOLD = 0.005*sessions_data_action.shape[0]
counts = sessions_data_action["action"].value_counts()

# Vectorised replacement: build the set of rare labels once and mask them,
# instead of calling a Python lambda (with a counts[...] lookup) for every row.
rare_actions = counts[counts < THRESHOLD].index
sessions_data_action["action"] = sessions_data_action["action"].mask(
    sessions_data_action["action"].isin(rare_actions), 'Other')

In [10]:
# For every distinct action, add a column "action_<name>" holding the number
# of rows each user has with that action
unique_actions = sessions_data_action["action"].unique()

for action_name in unique_actions:
    rows_for_action = sessions_data_action[sessions_data_action["action"] == action_name]
    # size() is unnamed, so to_frame() labels its single column 0
    per_user_counts = rows_for_action.groupby("user_id").size().to_frame()
    sessions_data_ids = sessions_data_ids.join(per_user_counts, on="user_id")
    sessions_data_ids = sessions_data_ids.rename(columns={0: "action_" + action_name})

# A user without any row for an action gets NaN from the join; store 0 instead
# (this fills every NaN currently in the frame)
sessions_data_ids = sessions_data_ids.fillna(0)

In [11]:
# Get the number of session-log rows recorded for each user ("NumSessions").
# The count Series is named explicitly before joining. Selecting ["user_id"]
# on the groupby makes size() return a Series that recent pandas names
# "user_id"; to_frame() then yields a "user_id" column that collides with the
# join key, and a rename of column 0 never matches.
rows_per_user = sessions_data_action.groupby("user_id").size().rename("NumSessions")
sessions_data_ids = sessions_data_ids.join(rows_per_user, on="user_id")

In [12]:
# Get mean and std of the distribution of per-action counts for each user.
# The count columns are derived from the action values themselves rather than
# from a hard-coded "first label":"last label" slice, which only selects the
# right columns when the data file happens to list the actions in one
# particular order.
action_count_columns = [
    "action_" + action_name for action_name in sessions_data_action["action"].unique()
]
# Take the count columns once so both statistics see exactly the same inputs
action_count_values = sessions_data_ids[action_count_columns]

sessions_data_ids["NumActionsMean"] = action_count_values.mean(axis=1)
sessions_data_ids["NumActionsStd"] = action_count_values.std(axis=1)

In [13]:
# Working frame for the action_type feature: user id plus the action_type column
sessions_data_action_type = sessions_data[["user_id", "action_type"]]

In [14]:
# Processing for the action_type attribute
# Missing values and the '-unknown-' placeholder are merged into one 'NAN' category
sessions_data_action_type["action_type"] = (
    sessions_data_action_type["action_type"]
    .fillna("NAN")
    .replace({'-unknown-': 'NAN'})
)

# Collapse infrequent values into 'Other' to avoid overfitting and speed up
# computation: any action_type that accounts for fewer than 0.5% of all rows.
THRESHOLD = 0.005*sessions_data_action_type.shape[0]
counts = sessions_data_action_type["action_type"].value_counts()

# Vectorised replacement: mask the rare labels in one pass instead of calling
# a Python lambda (with a counts[...] lookup) for every row.
rare_action_types = counts[counts < THRESHOLD].index
sessions_data_action_type["action_type"] = sessions_data_action_type["action_type"].mask(
    sessions_data_action_type["action_type"].isin(rare_action_types), 'Other')

In [15]:
# For every distinct action_type, add a column "action_type_<name>" holding
# the number of rows each user has with that action_type
unique_actions = sessions_data_action_type["action_type"].unique()

for type_name in unique_actions:
    rows_for_type = sessions_data_action_type[
        sessions_data_action_type["action_type"] == type_name]
    # size() is unnamed, so to_frame() labels its single column 0
    per_user_counts = rows_for_type.groupby("user_id").size().to_frame()
    sessions_data_ids = sessions_data_ids.join(per_user_counts, on="user_id")
    sessions_data_ids = sessions_data_ids.rename(columns={0: "action_type_" + type_name})

# A user without any row for an action_type gets NaN from the join; store 0
# instead (this fills every NaN currently in the frame)
sessions_data_ids = sessions_data_ids.fillna(0)

In [16]:
# Get mean and std of the distribution of per-action_type counts for each user.
# The count columns are derived from the action_type values themselves rather
# than from a hard-coded "first label":"last label" slice, which depends on
# the order in which the values appear in the data file.
action_type_count_columns = [
    "action_type_" + type_name
    for type_name in sessions_data_action_type["action_type"].unique()
]
# Take the count columns once so both statistics see exactly the same inputs
action_type_count_values = sessions_data_ids[action_type_count_columns]

sessions_data_ids["NumActionTypeMean"] = action_type_count_values.mean(axis=1)
sessions_data_ids["NumActionTypeStd"] = action_type_count_values.std(axis=1)

In [17]:
# Repeat the procedure for action_detail
# Working frame: user id plus the action_detail column
sessions_data_action_detail = sessions_data.loc[:, ["user_id", "action_detail"]]

# Missing values and the '-unknown-' placeholder are merged into one 'NAN' category
sessions_data_action_detail["action_detail"] = (
    sessions_data_action_detail["action_detail"]
    .fillna("NAN")
    .replace({'-unknown-': 'NAN'})
)

# Collapse infrequent values into 'Other' to avoid overfitting and speed up
# computation: any action_detail that accounts for fewer than 0.5% of all rows.
THRESHOLD = 0.005*sessions_data_action_detail.shape[0]
counts = sessions_data_action_detail["action_detail"].value_counts()

# Vectorised replacement: mask the rare labels in one pass instead of calling
# a Python lambda (with a counts[...] lookup) for every row.
rare_action_details = counts[counts < THRESHOLD].index
sessions_data_action_detail["action_detail"] = sessions_data_action_detail["action_detail"].mask(
    sessions_data_action_detail["action_detail"].isin(rare_action_details), 'Other')

# For every distinct action_detail, add a column "action_detail_<name>" holding
# the number of rows each user has with that action_detail
unique_actions = sessions_data_action_detail["action_detail"].unique()

for detail_name in unique_actions:
    rows_for_detail = sessions_data_action_detail[
        sessions_data_action_detail["action_detail"] == detail_name]
    # size() is unnamed, so to_frame() labels its single column 0
    per_user_counts = rows_for_detail.groupby("user_id").size().to_frame()
    sessions_data_ids = sessions_data_ids.join(per_user_counts, on="user_id")
    sessions_data_ids = sessions_data_ids.rename(columns={0: "action_detail_" + detail_name})

# A user without any row for an action_detail gets NaN from the join; store 0
# instead (this fills every NaN currently in the frame)
sessions_data_ids = sessions_data_ids.fillna(0)

In [18]:
# Get mean and std of the distribution of per-action_detail counts for each user.
# The count columns are derived from the action_detail values themselves
# rather than from a hard-coded "first label":"last label" slice, which
# depends on the order in which the values appear in the data file.
action_detail_count_columns = [
    "action_detail_" + detail_name
    for detail_name in sessions_data_action_detail["action_detail"].unique()
]
# Take the count columns once so both statistics see exactly the same inputs
action_detail_count_values = sessions_data_ids[action_detail_count_columns]

sessions_data_ids["NumActionDetailMean"] = action_detail_count_values.mean(axis=1)
sessions_data_ids["NumActionDetailStd"] = action_detail_count_values.std(axis=1)

In [19]:
# Repeat the procedure for device_type
# Working frame: user id plus the device_type column
sessions_data_device_type = sessions_data.loc[:, ["user_id", "device_type"]]

# Missing values and the '-unknown-' placeholder are merged into one 'NAN' category
sessions_data_device_type["device_type"] = (
    sessions_data_device_type["device_type"]
    .fillna("NAN")
    .replace({'-unknown-': 'NAN'})
)

# Collapse infrequent values into 'Other' to avoid overfitting and speed up
# computation: any device_type that accounts for fewer than 0.5% of all rows.
THRESHOLD = 0.005*sessions_data_device_type.shape[0]
counts = sessions_data_device_type["device_type"].value_counts()

# Vectorised replacement: mask the rare labels in one pass instead of calling
# a Python lambda (with a counts[...] lookup) for every row.
rare_device_types = counts[counts < THRESHOLD].index
sessions_data_device_type["device_type"] = sessions_data_device_type["device_type"].mask(
    sessions_data_device_type["device_type"].isin(rare_device_types), 'Other')

# For every distinct device_type, add a column "device_type_<name>" holding
# the number of rows each user has with that device_type
unique_actions = sessions_data_device_type["device_type"].unique()

for device_name in unique_actions:
    rows_for_device = sessions_data_device_type[
        sessions_data_device_type["device_type"] == device_name]
    # size() is unnamed, so to_frame() labels its single column 0
    per_user_counts = rows_for_device.groupby("user_id").size().to_frame()
    sessions_data_ids = sessions_data_ids.join(per_user_counts, on="user_id")
    sessions_data_ids = sessions_data_ids.rename(columns={0: "device_type_" + device_name})

# A user without any row for a device_type gets NaN from the join; store 0
# instead (this fills every NaN currently in the frame)
sessions_data_ids = sessions_data_ids.fillna(0)

In [20]:
# Get mean and std of the distribution of per-device_type counts for each user.
# The count columns are derived from the device_type values themselves rather
# than from a hard-coded "first label":"last label" slice, which depends on
# the order in which the values appear in the data file.
device_type_count_columns = [
    "device_type_" + device_name
    for device_name in sessions_data_device_type["device_type"].unique()
]
# Take the count columns once so both statistics see exactly the same inputs
device_type_count_values = sessions_data_ids[device_type_count_columns]

sessions_data_ids["NumDeviceTypeMean"] = device_type_count_values.mean(axis=1)
sessions_data_ids["NumDeviceTypeStd"] = device_type_count_values.std(axis=1)

In [24]:
# Get user ids and secs_elapsed for secs_elapsed processing
sessions_data_secs_elapsed = sessions_data.loc[:, ["user_id", "secs_elapsed"]]
# Missing durations are counted as 0 seconds
sessions_data_secs_elapsed["secs_elapsed"] = sessions_data_secs_elapsed["secs_elapsed"].fillna(0)

# Per-user summary statistics of secs_elapsed.
# "mean" / "std" / "median" are given by name so pandas' own reductions are
# used explicitly (its std is the sample standard deviation) instead of
# relying on pandas remapping the NumPy callables. scipy's skew is kept as a
# callable.
tmp = sessions_data_secs_elapsed.groupby("user_id").aggregate(
    ["mean", "std", "median", stats.skew]
)

# aggregate() with a list of functions yields two-level columns
# (column, statistic). Flatten them to a single level of tuple labels before
# joining, because recent pandas refuses to merge frames whose columns have a
# different number of levels.
# NOTE(review): tuple labels are presumably what older pandas produced
# implicitly here - confirm against whatever reads sessions_processed.csv.
tmp.columns = tmp.columns.to_flat_index()

sessions_data_ids = sessions_data_ids.join(tmp, on="user_id")

In [25]:
# Persist the per-user feature table, with a header row
sessions_data_ids.to_csv(path_or_buf="../Data/sessions_processed.csv", header=True)