In [1]:
import numpy as np
import pandas as pd
import collections # counter

from sklearn import preprocessing
from sklearn.impute import SimpleImputer # missing data
from sklearn.preprocessing import StandardScaler # feature scaling
from sklearn.compose import ColumnTransformer

In [2]:
# Run configuration.
# `folder` is the directory the CSV files are read from and written to; the
# commented-out assignments are the alternative dataset folders.
# folder = 'csv_he'
# folder = 'csv_pp_500'
folder = 'csv_b_500'
# `selection_method` is the numeric suffix embedded in every CSV file name
# (for example df_users_dom_2.csv); 1 is the other available value.
# selection_method = 1
selection_method = 2

In [3]:
# Load the three input tables for the configured folder; every file name
# carries the selection-method suffix.
method_suffix = str(selection_method)
df_users = pd.read_csv(folder + '/df_users_dom_' + method_suffix + '.csv')
df_pageviews = pd.read_csv(folder + '/df_pageviews_metrics_' + method_suffix + '.csv')
df_sessions = pd.read_csv(folder + '/df_sessions_metrics_' + method_suffix + '.csv')

In [4]:
# Show every column when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)

In [5]:
# Columns that are not used by the rest of this notebook.
columns = [
    'n_load',
    'n_rage_click',
    'n_wild_mouse',
    'n_unique_product',
    'n_unique_category',
    'mean_n_ses_between_buys',
    'buy_count',
    'n_buy',
    'day',
    'referrer2',
    'referrer3',
    'started_at',
    'duration',
]
df_users = df_users.drop(columns=columns)

In [6]:
# Per-session metrics that are split into buy / no-buy features below.
# n_next_page - only He (presumably only present in the 'csv_he' data; verify
# before running against another folder).
columns = [
    'n_pageviews',
    'n_input',
    'n_click',
    'n_mouse_click',
    'n_mouse_move',
    'n_scroll_move',
    'n_scrandom',
    'n_events',
    'n_product',
    'n_non_product',
    'n_category',
    'n_filter',
    'n_search',
    'n_cart',
    'n_add_to_cart',
    'n_remove_from_cart',
    'n_next_page',
    'effective_duration',
    'pv_product_mean_eff_duration',
    'load_time',
]

In [7]:
# Empty placeholder frame. `data` is created again from scratch before it is
# first filled further down, so nothing reads this instance.
data = pd.DataFrame()

In [8]:
# split each column to buy/no buy
#
# For every metric in `columns`, add two user-level features, broadcast onto
# all rows of the user:
#   <metric>_buy     rounded mean of the metric over the user's buy sessions
#   <metric>_no_buy  rounded mean of the metric over the user's no-buy sessions
# plus `num_of_sessions`, the number of the user's rows whose `buy` is 0 or 1.
#
# A user without any session of a kind gets 0 for that feature. The row-wise
# version only guarded the no-buy case and raised ZeroDivisionError for a user
# with no buy session.
# Missing metric values are skipped by the grouped sum.
# Assumes every row of df_users is a distinct session_id — TODO confirm.
user_key = df_users['user_id']
is_buy = df_users['buy'] == 1
is_no_buy = df_users['buy'] == 0

# Per-user session counts, aligned with the rows of df_users.
n_buy = is_buy.astype(int).groupby(user_key).transform('sum')
n_no_buy = is_no_buy.astype(int).groupby(user_key).transform('sum')

for column in columns:
    # Zero out the sessions of the other kind so the grouped sum only adds
    # the sessions we are interested in.
    buy_total = df_users[column].where(is_buy, 0).groupby(user_key).transform('sum')
    no_buy_total = df_users[column].where(is_no_buy, 0).groupby(user_key).transform('sum')

    # Series.round() rounds halves to even, like the built-in round().
    # Dividing by a zero count gives NaN/inf; those rows are replaced by 0.
    df_users[column + '_buy'] = (buy_total / n_buy).round().where(n_buy != 0, 0)
    df_users[column + '_no_buy'] = (no_buy_total / n_no_buy).round().where(n_no_buy != 0, 0)

# Stored as float so the exported CSV keeps the formatting produced by the
# previous masked .loc assignment into a new column.
df_users['num_of_sessions'] = (n_buy + n_no_buy).astype(float)

In [11]:
# Number of distinct repeat buyers whose `n_pageviews_no_buy` feature is 0.
zero_no_buy_repeaters = df_users[
    (df_users['n_pageviews_no_buy'] == 0) & (df_users['repeat_buyer'] == True)
]
len(zero_no_buy_repeaters['user_id'].unique())

78

In [13]:
# Remove the columns named in `columns`. This relies on `columns` still holding
# the list assigned by the most recently executed cell that set it.
df_users = df_users.drop(columns=columns)

In [14]:
def _first_mode(values):
    """Return the first mode of ``values``, or NaN if it has no non-null entry.

    ``Series.mode`` ignores NaN, so a group made only of missing values gives
    an empty result; indexing that result with ``[0]`` would raise.
    When several values tie, pandas returns the modes sorted and the first
    one is used.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


# Collapse the per-row categorical attributes to a single value per user:
# the most frequent one among that user's rows.
data = pd.DataFrame()
columns = ['country', 'city', 'city_type', 'continent', 'region', 'device.type', 'browser.name', 'os.name', 'referrer1']
for column in columns:
    data[column] = df_users.groupby(['user_id'])[column].apply(_first_mode)

In [15]:
# Remove the columns listed in `columns` from df_users, and move the index of
# `data` back into a regular column.
df_users = df_users.drop(columns=columns)
data = data.reset_index()

In [16]:
# Left-join `data` onto df_users by user_id; every df_users row is kept.
df_users = df_users.merge(data, how='left', on='user_id')

In [17]:
# Display the current column index of df_users.
df_users.columns

Index(['user_id', 'session_id', 'buy', 'n_sessions', 'hour', 'day_name',
       'buy_hour', 'buy_day', 'repeat_buyer', 'n_sessions_after_1_buy',
       'top_product', 'n_top_product', 'top_category', 'n_top_category',
       'top_product_u', 'n_top_product_u', 'n_unique_product_u',
       'top_category_u', 'n_top_category_u', 'n_unique_category_u',
       'user_mean_eff_duration', 'mean_price', 'n_pageviews_buy',
       'n_pageviews_no_buy', 'n_input_buy', 'n_input_no_buy', 'n_click_buy',
       'n_click_no_buy', 'n_mouse_click_buy', 'n_mouse_click_no_buy',
       'n_mouse_move_buy', 'n_mouse_move_no_buy', 'n_scroll_move_buy',
       'n_scroll_move_no_buy', 'n_scrandom_buy', 'n_scrandom_no_buy',
       'n_events_buy', 'n_events_no_buy', 'n_product_buy', 'n_product_no_buy',
       'n_non_product_buy', 'n_non_product_no_buy', 'n_category_buy',
       'n_category_no_buy', 'n_filter_buy', 'n_filter_no_buy', 'n_search_buy',
       'n_search_no_buy', 'n_cart_buy', 'n_cart_no_buy', 'n_add_to_

In [18]:
def set_buy_time(hour):
    """Map an hour of the day to one of three 8-hour buckets.

    Args:
        hour: numeric hour of the day.

    Returns:
        1 when 2 < hour <= 10,
        2 when 10 < hour <= 18,
        3 otherwise (hour > 18, and the wrap-around hours hour <= 2).
    """
    if 2 < hour <= 10:
        return 1
    # The lower bound is required: with a plain `hour <= 18` the hours 0-2
    # fell into bucket 2 instead of joining 19-23 in the wrap-around bucket.
    if 10 < hour <= 18:
        return 2
    return 3

def set_buy_day(day):
    """Encode a day name: 1 for Monday through Friday, 2 for any other value."""
    working_days = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')
    return 1 if day in working_days else 2

In [19]:
def _hour_code(raw_hour):
    """Bucket an hour value with set_buy_time; the '-' placeholder becomes 0."""
    if raw_hour == '-':
        return 0
    return set_buy_time(float(raw_hour))


def _day_code(raw_day):
    """Encode a day name with set_buy_day; the '-' placeholder becomes 0."""
    if raw_day == '-':
        return 0
    return set_buy_day(raw_day)


# Note that 'buy_day' is overwritten in place with its numeric code, while the
# other three results go into new columns.
df_users['buy_time'] = df_users['buy_hour'].apply(_hour_code)
df_users['time'] = df_users['hour'].apply(_hour_code)
df_users['buy_day'] = df_users['buy_day'].apply(_day_code)
df_users['day'] = df_users['day_name'].apply(_day_code)

In [20]:
# The raw hour / day-name columns are removed here.
columns = [
    'hour',
    'day_name',
    'buy_hour',
]
df_users = df_users.drop(columns=columns)

In [22]:
# Create zero-initialised counter columns, one per observed code:
# `buy_time_<code>` and `time_<code>` for every value found in 'time',
# `buy_day_<code>` and `day_<code>` for every value found in 'day'.
counter_layout = (
    ('time', ('buy_time_', 'time_')),
    ('day', ('buy_day_', 'day_')),
)
for source_column, prefixes in counter_layout:
    for code in df_users[source_column].unique():
        for prefix in prefixes:
            df_users[prefix + str(code)] = 0

In [23]:
# For every user, count how many of their rows carry each non-zero code of the
# four columns below, and add that count to the matching `<column>_<code>`
# counter on all rows of the user. Code 0 is not counted.
# Computed with one grouped count per (column, code) instead of rescanning the
# whole frame for every session; the unused `i = 0` was dropped.
# Assumes every row of df_users is a distinct session_id — TODO confirm.
columns = ['buy_time', 'time', 'buy_day', 'day']
user_key = df_users['user_id']
for column in columns:
    for val in df_users[column].unique():
        if val == 0:
            continue
        # Number of the user's rows whose code equals `val`, repeated on
        # each of that user's rows.
        per_user_count = (df_users[column] == val).astype(int).groupby(user_key).transform('sum')
        df_users[column + '_' + str(val)] += per_user_count

In [24]:
# Remove the columns currently listed in `columns`.
df_users = df_users.drop(columns=columns)

In [25]:
# Remove the identifier, target and top-product/category columns.
columns = [
    'user_id',
    'session_id',
    'buy',
    'top_product',
    'n_top_product',
    'top_category',
    'n_top_category',
]
df_users = df_users.drop(columns=columns)

In [26]:
# Keep the first occurrence of every fully identical row (presumably leaving
# one row per user now that the per-session columns are gone — verify), then
# show how many rows remain.
df_users = df_users.drop_duplicates()
df_users.shape[0]

904

In [27]:
# Replace every remaining missing value with the '-' placeholder.
# The fill is applied to the whole frame and reassigned:
# `df_users[column].fillna(..., inplace=True)` works on a temporarily selected
# Series, which pandas 2.2 warns about and which no longer changes the frame
# once Copy-on-Write is active.
df_users = df_users.fillna('-')

In [28]:
# Write the transformed table next to the inputs, without the index column.
output_path = folder + '/df_users_transform_' + str(selection_method) + '.csv'
df_users.to_csv(output_path, encoding='utf-8-sig', index=False)