In [1]:
import json
import pandas as pd
import glob
import time
from tqdm import tqdm
import sys,os
import collections
import re
import csv
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import itertools
import statistics
import scipy.stats as ss
import seaborn as sns
from copy import *

In [2]:
pd.options.display.max_columns = None

In [3]:
# project = 'he'
project = 'pp'
# project = 'ba'

if project == 'he':
    folder = 'csv_he'
    url_regex = r'((?<=https:\/\/www.herbatica.sk\/)\S+)'
    
elif project == 'pp':
    folder = 'csv_pp_500'
    url_regex = r'((?<=https:\/\/particlepeptides.com\/)\S+)'

elif project == 'ba':
    folder = 'csv_b_500'
    url_regex = r'((?<=https:\/\/www.barefootky.sk\/)\S+)'


In [4]:
df_inactivities = pd.read_csv(folder + '/df_inactivities_all.csv')
df_input = pd.read_csv(folder + '/df_input_all.csv')
df_load = pd.read_csv(folder + '/df_load_all.csv')
df_click = pd.read_csv(folder + '/df_click_all.csv')
df_mouse_click = pd.read_csv(folder + '/df_mouse_click_all.csv')
df_rage_click = pd.read_csv(folder + '/df_rage_click_all.csv')
df_mouse_move = pd.read_csv(folder + '/df_mouse_move_all.csv')
df_scroll_move = pd.read_csv(folder + '/df_scroll_move_all.csv')
df_wild_mouse = pd.read_csv(folder + '/df_wild_mouse_all.csv')
df_scrandom = pd.read_csv(folder + '/df_scrandom_all.csv')
df_pageviews = pd.read_csv(folder + '/df_pageviews_all.csv')
df_sessions = pd.read_csv(folder + '/df_sessions_all.csv')

In [5]:
df_pageviews['url1'] = df_pageviews['url'].str.extract(url_regex) # url1 - everything behind https://www.detskapostel.com/
df_pageviews['url2'] = df_pageviews['url1'].str.extract(r'((?<=\/)\S+)') # 2. part from url1 without / at the end
df_pageviews['url3'] = df_pageviews['url2'].str.extract(r'((?<=\/)\S+)') # 2. part from url2 without / at the end

df_pageviews['url1'] = df_pageviews['url1'].str.extract(r'^([^\/]*)') # 1. part from url1 without /
df_pageviews['url2'] = df_pageviews['url2'].str.extract(r'^([^\/]*)') # 1. part from url2 without /
df_pageviews['url3'] = df_pageviews['url3'].str.extract(r'^([^\/]*)') # 1. part from url3 without /    

In [6]:
# do df_pageviews nám pribudli tieto stĺpce:
# - pageview_non_product (True/False) - nonproduct pages
# - pageview_category (True/False) - category pages
# - pageview_product (True/False) - product pages
# - pageview_search - search
# - pageview_buy - purchase completion page
# - pageview_cart - cart page
# - pageview_filter (True/False) - filter in category (pagination and google ads clicks)
#     - if filter = 1, category = 1
#
# pageview_non_product + pageview_category + pageview_product + pageview_search + pageview_buy + pageview_cart = df_pageviews

In [7]:
if (project == 'he'):
    non_product_pages = [
        'vernost',
        'newsletter',
        'blog_herbatica',
        'blog',
        'doprava_platby',
        'certifikaty_produktov',
        'obchodne-podmienky',
        'poou',
        'o-nas',
        'faq',
        'registracia',
        'klient',
        'login',
        'napiste-nam',
        'logout',
        'affiliate-registracia',
        'affiliate-login',
        'affiliate-zabudnute-heslo',
        'user',
    ]
    search = [
        'vyhladavanie',
    ]
    buy = [
        'dakujeme',
    ]
    cart = [
        'kosik',
        'objednavka',
    ]
    pagination = 'strana-'
    filtering = '?'
    
    # nonproduct page / empty url
    df_pageviews.loc[(df_pageviews['url1'].isin(non_product_pages)) | (df_pageviews['url1'].isnull()), 'pageview_non_product'] = True

    df_pageviews['pageview_search'] = df_pageviews['url1'].isin(search) # search page
    df_pageviews['pageview_buy'] = df_pageviews['url2'].isin(buy) # purchase page

    # cart page but not purchase page
    df_pageviews.loc[(df_pageviews['url1'].isin(cart)) & (df_pageviews['pageview_buy'] == False), 'pageview_cart'] = True # cart
    
    # transformation NaN to 0 and float to int
    numeric_values = [
        'pageview_non_product',
        'pageview_search',
        'pageview_buy',
        'pageview_cart'
    ]
    for column in numeric_values:
        df_pageviews[column] = df_pageviews[column].fillna(0)
        df_pageviews[column] = df_pageviews[column].astype(int)

    # check if product page and empty url2 => category page
    df_pageviews.loc[
        (df_pageviews['pageview_non_product'] == False) & 
        (df_pageviews['pageview_cart'] == False) & 
        (df_pageviews['url2'].isnull()) & 
        (df_pageviews['url1'].notna()), 'pageview_category'] = True

    # check if product page and not empty url2 => product page
    df_pageviews.loc[
            (df_pageviews['pageview_non_product'] == False) & 
            (df_pageviews['pageview_buy'] == False) & 
            (df_pageviews['pageview_cart'] == False) & 
            (df_pageviews['url2'].notna()) & 
            (df_pageviews['url1'].notna()), 'pageview_product'] = True

    # if url2 starts 'strana-' or '?', it's not product page, but category page / filter
    df_pageviews.loc[
        (df_pageviews['pageview_product'] == True) & 
        (df_pageviews['pageview_search'] == False) & 
        (df_pageviews['url2'].str.startswith(pagination) | 
         df_pageviews['url2'].str.startswith(filtering)), 'pageview_filter'] = True

    df_pageviews.loc[(df_pageviews['pageview_filter'] == True), 'pageview_category'] = True

    # check if product page and not empty url2 => product page
    df_pageviews.loc[
        (df_pageviews['pageview_product'] == True) & 
        (df_pageviews['url2'].str.startswith(pagination) | 
         df_pageviews['url2'].str.startswith(filtering)), 'pageview_product'] = False

    
if (project == 'pp'):
    category_pages = '16-kupit-peptidy|16-buy-peptides'
    product_detail_pages = [
        'kupit-peptidy',
        'buy-peptides',
    ]
    search_pages = 'search|vyhladavanie'
    buy_pages = 'potvrdenie-objednavky|order-confirmation'
    cart_pages = 'objednavka|order'
    filtering = '?'
    
    df_pageviews['pageview_product'] = df_pageviews['url2'].isin(product_detail_pages)
    df_pageviews['pageview_category'] = df_pageviews['url2'].str.contains(category_pages, regex=True)
    df_pageviews['pageview_search'] = df_pageviews['url2'].str.contains(search_pages, regex=True)
    df_pageviews['pageview_buy'] = df_pageviews['url2'].str.contains(buy_pages, regex=True)
    df_pageviews.loc[
        (df_pageviews['pageview_product'] == False) & 
        (df_pageviews['pageview_category'] == False) & 
        (df_pageviews['pageview_search'] == False) & 
        (df_pageviews['pageview_buy'] == False) & 
        (df_pageviews['url2'].str.contains(cart_pages, regex=True)), 'pageview_cart'] = True
    
    # transformation NaN to 0 and float to int
    numeric_values = [
        'pageview_category',
        'pageview_product',
        'pageview_search',
        'pageview_buy',
        'pageview_cart'
    ]

    for column in numeric_values:
        df_pageviews[column] = df_pageviews[column].fillna(0)
        df_pageviews[column] = df_pageviews[column].astype(int)

    df_pageviews.loc[
        (df_pageviews['pageview_product'] == False) & 
        (df_pageviews['pageview_category'] == False) & 
        (df_pageviews['pageview_buy'] == False) &
        (df_pageviews['pageview_cart'] == False) & 
        (df_pageviews['pageview_search'] == False), 'pageview_non_product'] = True 
    df_pageviews.loc[
        (df_pageviews['pageview_category'] == True) & 
        (df_pageviews['url2'].str.contains(filtering, regex=False)), 'pageview_filter'] = True  

    
if (project == 'ba'):
    category_page = 'kategoria' # url1, category is in url2, product is in url3
    search_page = 'vyhladavanie' # url1
    cart_page = 'nakupny-kosik' # url1
    buy_page = '&objednane' # url2
    filtering = '#' # url3
    
    df_pageviews.loc[
        (df_pageviews['url1'].str.contains(category_page)) & 
        (df_pageviews['url3'].isnull()), 'pageview_category'] = True 
    df_pageviews.loc[
        (df_pageviews['url1'].str.contains(category_page)) & 
        (df_pageviews['url3'].str.startswith(filtering)), 'pageview_category'] = True 
    df_pageviews.loc[
        (df_pageviews['url1'].str.contains(category_page)) & 
        (df_pageviews['url3'].str.startswith(filtering)), 'pageview_filter'] = True 
    df_pageviews.loc[
        (df_pageviews['url1'].str.contains(category_page)) & 
        (df_pageviews['url3'].notna()) & 
        (df_pageviews['pageview_filter'].isnull()), 'pageview_product'] = True
    df_pageviews.loc[
        (df_pageviews['url1'].str.contains(cart_page)) & 
        (df_pageviews['url2'].str.contains(buy_page, regex=False)), 'pageview_buy'] = True
    df_pageviews.loc[
        (df_pageviews['url1'].str.contains(cart_page)) & 
        (df_pageviews['pageview_buy'].isnull()), 'pageview_cart'] = True
    df_pageviews['pageview_search'] = df_pageviews['url1'].str.contains(search_page)
    
    # transformation NaN to 0 and float to int
    numeric_values = [
        'pageview_category',
        'pageview_product',
        'pageview_search',
        'pageview_buy',
        'pageview_cart'
    ]

    for column in numeric_values:
        df_pageviews[column] = df_pageviews[column].fillna(0)
        df_pageviews[column] = df_pageviews[column].astype(int)

    df_pageviews.loc[
        (df_pageviews['pageview_product'] == False) & 
        (df_pageviews['pageview_category'] == False) & 
        (df_pageviews['pageview_buy'] == False) & 
        (df_pageviews['pageview_cart'] == False) & 
        (df_pageviews['pageview_search'] == False), 'pageview_non_product'] = True 

In [8]:
numeric_values = [
    'pageview_non_product', 'pageview_category', 'pageview_product',
    'pageview_filter', 'pageview_search', 'pageview_buy', 'pageview_cart'
]

for column in numeric_values:
    df_pageviews[column] = df_pageviews[column].fillna(0)
    df_pageviews[column] = df_pageviews[column].astype(int)

In [9]:
# number of pageviews of each type
# df_session['n_buy'] = number of records where df_pageviews['session_id'] == df_sessions['session_id'] and df_pageviews['pageview_buy'] == True

def num_url_per_session(column):
    group = df_pageviews.groupby('session_id')
    
    temp = pd.DataFrame()
    temp = group['pageview_' + column].sum() # sum pageview_buy per session 
    temp = temp.to_frame()
    temp.rename(columns={"pageview_" + column: "n_" + column}, inplace=True) 

    return pd.merge(df_sessions, temp, left_on='session_id', right_index=True, how='left')

df_sessions = num_url_per_session('buy') 
# number of purchase in session should be 1, some are 3 - redirecting from payment gateway

In [10]:
df_sessions.loc[df_sessions['n_buy'] > 0, 'buy'] = True
df_sessions.loc[df_sessions['n_buy'] == 0, 'buy'] = False

In [11]:
df_users = df_sessions.groupby(['user_id','session_id'])[['n_buy']].sum()

In [12]:
df_users.loc[df_users['n_buy'] > 0, 'buy'] = 1
df_users.loc[df_users['n_buy'] == 0, 'buy'] = 0

In [13]:
df_users.sort_values('n_buy').tail(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,n_buy,buy
user_id,session_id,Unnamed: 2_level_1,Unnamed: 3_level_1
69ff1120-9f6d-418c-86ce-6f83e9345c1b,86471e30-737f-11ea-a6e2-1936daa6dab8,5,1.0
6b08069f-98ca-4a92-9e73-0db31d6b3069,1923e0d0-ba16-11ea-ad53-01ccbad19567,6,1.0
6a901d8b-aaf5-438f-b7d0-51739e4e4904,d64620d0-63fc-11ea-88ee-50da9b64772a,6,1.0
235f9eb1-d5af-4d79-931f-ca6bd60c5e33,26a06880-3b19-11eb-a350-0372136353fd,7,1.0
60ffb221-870b-4b75-886b-ff66c733cac9,f8e612a0-5b8f-11ea-bef4-61282fb7bea0,9,1.0


In [14]:
len(df_users.groupby(level='user_id')) # number of unique users

3056

In [15]:
len(df_users[df_users['buy'] > 0].groupby(level='user_id'))

3044

In [16]:
len(df_users[df_users['buy'] == 0].groupby(level='user_id'))

656

In [17]:
users = df_users.reset_index()

In [18]:
purchase = users[users['buy'] > 0]['user_id'].unique()
no_purchase = users[users['buy'] == 0]['user_id'].unique()

In [19]:
# set(no_purchase.flat) - set(purchase.flat)

In [20]:
# set(no_purchase).intersection(set(purchase)) 

In [21]:
buy_count = users.set_index('user_id').buy.eq(1).sum(level=0).astype(int).reset_index()

In [22]:
buy_count[buy_count['buy'] > 1]

Unnamed: 0,user_id,buy
13,00e97892-aa84-4bd8-8903-29bd06a63036,2
26,01ba2b56-4ff8-4e08-9ad6-1b45020ce320,2
59,05499451-9cf0-4737-bada-ef97fd58ac71,2
66,05c83e42-41ed-43e3-be21-da1dbdc65118,2
91,07a5d388-102b-4b24-9eca-0aca95d5228f,2
...,...,...
3007,fb54a7a2-c014-4bff-b7b7-5a6c0ef76dc4,11
3022,fcafb453-4945-4c8e-a425-2b9007d065f7,4
3025,fd136686-5434-41ed-bee7-d55f84b0d129,3
3030,fd510011-6be0-44e5-9abd-17f62a012ad6,3


In [23]:
df_sessions = num_url_per_session('product') # n_product - number of products visited per session
df_sessions = num_url_per_session('non_product') # n_non_product
df_sessions = num_url_per_session('category') # n_category 
df_sessions = num_url_per_session('filter') # n_filter
df_sessions = num_url_per_session('search') # n_search
df_sessions = num_url_per_session('cart') # n_cart

In [24]:
def num_per_session(dataframe, table):
    temp = pd.DataFrame(dataframe["session_id"].value_counts()) 
    temp.rename(columns={"session_id": "n_" + table}, inplace=True)
    
    return pd.merge(df_sessions, temp, left_on='session_id', right_index=True, how='left')

In [25]:
df_sessions = num_per_session(df_pageviews, 'pageviews')
df_sessions = num_per_session(df_input, 'input')
df_sessions = num_per_session(df_load, 'load')
df_sessions = num_per_session(df_click, 'click')
df_sessions = num_per_session(df_mouse_click, 'mouse_click')
df_sessions = num_per_session(df_rage_click, 'rage_click')
df_sessions = num_per_session(df_mouse_move, 'mouse_move')
df_sessions = num_per_session(df_scroll_move, 'scroll_move')
df_sessions = num_per_session(df_wild_mouse, 'wild_mouse')
df_sessions = num_per_session(df_scrandom, 'scrandom')

In [26]:
numeric_values = [
    'n_pageviews', 'n_input', 'n_load', 'n_click', 'n_mouse_click', 'n_rage_click',
    'n_mouse_move', 'n_scroll_move', 'n_wild_mouse', 'n_scrandom'
]

for column in numeric_values:
    df_sessions[column] = df_sessions[column].fillna(0)
    df_sessions[column] = df_sessions[column].astype(int)

In [27]:
# number of events per session 
df_sessions['n_events'] = df_sessions['n_input'] + df_sessions['n_load'] + \
            df_sessions['n_click'] + df_sessions['n_mouse_click'] + df_sessions['n_rage_click'] + \
            df_sessions['n_mouse_move'] + df_sessions['n_scroll_move'] + \
            df_sessions['n_wild_mouse'] + df_sessions['n_scrandom']

In [28]:
# top_product (str) - the most visited product per session
# n_top_product (int) - number of visits of the most visited product per session 
# n_unique_product (int) - number of unique products per session

In [29]:
if (project == 'dp' or project == 'he'):
    url_name = 'url2'
elif (project == 'pp' or project == 'ba'):
    url_name = 'url3'

In [30]:
df_pageviews_product = df_pageviews[df_pageviews['pageview_product'] == True][['session_id', url_name]]

In [31]:
# select mode
if (project == 'dp' or project == 'he' or project == 'ba'):
    sessions_top_product = df_pageviews_product.groupby(['session_id'])[url_name].apply(lambda x: pd.Series.mode(x)[0]).to_frame()
elif (project == 'pp'):
    sessions_top_product = df_pageviews_product.groupby(['session_id'])[url_name].agg(pd.Series.mode).to_frame()

In [32]:
sessions_top_product.rename(columns={url_name: 'top_product'}, inplace=True)

df_sessions = pd.merge(
    df_sessions,
    sessions_top_product,
    on='session_id',
    how='left'
)

In [33]:
sessions_n_top_product = df_pageviews_product.groupby(['session_id', url_name])[url_name].count().to_frame()
sessions_n_top_product.rename(columns={url_name: 'n_top_product'}, inplace=True)
sessions_n_top_product = sessions_n_top_product.sort_values(by=['n_top_product'], ascending=False)

df_sessions = pd.merge(
    df_sessions,
    sessions_n_top_product,
    on='session_id',
    how='left'
)

df_sessions.drop_duplicates('session_id', keep='first', inplace=True)
df_sessions = df_sessions.reset_index(drop=True)

In [34]:
sessions_n_unique_product = df_pageviews_product.groupby(['session_id'])[url_name].nunique().to_frame()
sessions_n_unique_product.rename(columns={url_name: 'n_unique_product'}, inplace=True)

df_sessions = pd.merge(
    df_sessions,
    sessions_n_unique_product,
    on='session_id',
    how='left'
)

In [35]:
# top_category (str) - the most visited category per session
# n_top_category (int) - number of visits of the most visited category per session 
# n_unique_category (int) - number of unique categories per session

In [36]:
if (project == 'dp' or project == 'he'):
    url_name = 'url1'
elif (project == 'pp' or project == 'ba'): 
    url_name = 'url2'

In [37]:
df_pageviews_category = df_pageviews[df_pageviews['pageview_category'] == True][['session_id', url_name]]

In [38]:
sessions_top_category = df_pageviews_category.groupby(['session_id'])[url_name].apply(lambda x: pd.Series.mode(x)[0]).to_frame()
sessions_top_category.rename(columns={url_name: 'top_category'}, inplace=True)

df_sessions = pd.merge(
    df_sessions,
    sessions_top_category,
    on='session_id',
    how='left'
)

In [39]:
sessions_n_top_category = df_pageviews_category.groupby(['session_id', url_name])[url_name].count().to_frame()
sessions_n_top_category.rename(columns={url_name: 'n_top_category'}, inplace=True)
sessions_n_top_category = sessions_n_top_category.sort_values(by=['n_top_category'], ascending=False)

df_sessions = pd.merge(
    df_sessions,
    sessions_n_top_category,
    on='session_id',
    how='left'
)

df_sessions.drop_duplicates('session_id', keep='first', inplace=True)
df_sessions = df_sessions.reset_index(drop=True)

In [40]:
sessions_n_unique_category = df_pageviews_category.groupby(['session_id'])[url_name].nunique().to_frame()
sessions_n_unique_category.rename(columns={url_name: 'n_unique_category'}, inplace=True)

df_sessions = pd.merge(
    df_sessions,
    sessions_n_unique_category,
    on='session_id',
    how='left'
)

In [41]:
df_sessions['referrer1'] = df_sessions['referrer'].str.extract(r'(\b(?<=:\/\/)\b\S+)') # referrer1 - everything behind ://
df_sessions['referrer2'] = df_sessions['referrer1'].str.extract(r'(\b(?<=\/)\b\S+[^\/])') # 2. part from referrer1 without / at the end
df_sessions['referrer3'] = df_sessions['referrer2'].str.extract(r'(\b(?<=\/)\b\S+[^\/])') # 2. part from referrer2 without / at the end

df_sessions['referrer1'] = df_sessions['referrer1'].str.extract(r'([^\/]*)') # 1. part from referrer1
df_sessions['referrer2'] = df_sessions['referrer2'].str.extract(r'([^\/]*)') # 1. part from referrer2

In [42]:
df_pageviews['referrer1'] = df_pageviews['referrer'].str.extract(r'(\b(?<=:\/\/)\b\S+)') # referrer1 - everything behind ://
df_pageviews['referrer2'] = df_pageviews['referrer1'].str.extract(r'(\b(?<=\/)\b\S+[^\/])') # 2. part from referrer1 without / at the end
df_pageviews['referrer3'] = df_pageviews['referrer2'].str.extract(r'(\b(?<=\/)\b\S+[^\/])') # 2. part from referrer2 without / at the end

df_pageviews['referrer1'] = df_pageviews['referrer1'].str.extract(r'([^\/]*)') # 1. part from referrer11
df_pageviews['referrer2'] = df_pageviews['referrer2'].str.extract(r'([^\/]*)') # 1. part from referrer2

In [43]:
# time when session starts
df_sessions['hour'] = pd.to_datetime(df_sessions['started_at'], unit='ms').dt.hour
df_sessions['day_name'] = pd.to_datetime(df_sessions['day']).dt.day_name()

In [44]:
df_pageviews_buy = df_pageviews[df_pageviews['pageview_buy'] == True]

df_pageviews_buy.loc[:,'buy_hour'] = pd.to_datetime(df_pageviews_buy['started_at'], unit='ms').dt.hour
df_pageviews_buy.loc[:,'buy_day'] = pd.to_datetime(df_pageviews_buy['started_at'], unit='ms').dt.day_name()

df_pageviews_buy = df_pageviews_buy.sort_values(by=['started_at'])

df_sessions = pd.merge(
    df_sessions,
    df_pageviews_buy[['session_id', 'buy_hour', 'buy_day']],
    on='session_id',
    how='left'
)

df_sessions.drop_duplicates('session_id', keep='first', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [45]:
# 0 - other city,  1 - Bratislava,  2 - Košice, 3 - other regional city
city_BA = 'Bratislava'
city_KE = 'Košice'
city_regional = ['Trnava', 'Žilina', 'Nitra', 'Prešov', 'Banská Bystrica', 'Trenčín']

df_sessions.loc[df_sessions['city'] == city_BA, 'city_type'] = 1 
df_sessions.loc[df_sessions['city'] == city_KE, 'city_type'] = 2 
df_sessions.loc[df_sessions['city'].isin(city_regional), 'city_type'] = 3 

In [46]:
# pageview load speed
df_load.rename(columns={'at': 'load_time'}, inplace=True)

df_pageviews = pd.merge(
    df_pageviews,
    df_load[['pageview_id', 'load_time']],
    on='pageview_id',
    how='left'
)

df_pageviews.drop_duplicates('pageview_id', keep='first', inplace=True)
df_pageviews = df_pageviews.reset_index(drop=True)
df_load.rename(columns={'load_time': 'at'}, inplace=True)

In [47]:
# !. pageview load speed per session
df_pageviews = df_pageviews.sort_values(by=['started_at'])

df_sessions = pd.merge(
    df_sessions,
    df_pageviews[['session_id', 'load_time']],
    on='session_id',
    how='left'
)

df_sessions.drop_duplicates('session_id', keep='first', inplace=True)
df_sessions = df_sessions.reset_index(drop=True)

In [48]:
# session duration
session_duration = df_pageviews.groupby('session_id')[['duration']].sum()

df_sessions = pd.merge(
    df_sessions,
    session_duration,
    on='session_id',
    how='left'
)

In [49]:
numeric_values = [
    'n_product', 'n_non_product',
    'n_category', 'n_search', 'n_buy',
    'load_time',
    'n_top_product', 'n_top_category',
    'n_unique_product', 'n_unique_category',
    'city_type',
]

for column in numeric_values:
    df_sessions[column] = df_sessions[column].fillna(0)
    df_sessions[column] = df_sessions[column].astype(int)
    
string_values = [
    'continent', 'region', 'country', 'city',
    'browser.name', 'device.type', 'os.name',
    'referrer1', 'referrer2', 'referrer3',
    'top_product', 'top_category',
    'buy_hour', 'buy_day', 'day_name'
]

for column in string_values:
    df_sessions[column].fillna('-', inplace = True)

In [50]:
# repair session (Herbatica)

In [51]:
# pageviews_buy, where referrer1 = 'gate.gopay.cz', 'gopay.cz' & n_pageviews <= 2 - delete
df_pageviews_buy['referrer1'].unique() # 'gate.gopay.cz', 'gopay.cz' is redirection from payment

array(['particlepeptides.com', nan], dtype=object)

In [52]:
# df_pageviews_buy['referrer3'].unique() # krok-2 => correct redirection to purchase completion page

In [53]:
df_pageviews_buy['url3'].unique() # url3 should be NaN

array([nan], dtype=object)

In [54]:
# group by number of pageview_buy per session + list of purchase session with 1 pageview_buy
n_pv_buy = df_pageviews_buy.groupby('session_id')['pageview_buy'].sum().reset_index()
sessions_1_pv_buy = n_pv_buy[n_pv_buy['pageview_buy'] == 1]['session_id'].unique()

In [55]:
# list of pageview_buy where referrer = gopay and number of pageview_buy in session = 1 
pv_buy_1 = df_sessions[(df_sessions['session_id'].isin(sessions_1_pv_buy) & (df_sessions['referrer1'].isin(['gate.gopay.cz', 'gopay.cz'])))]['session_id'].unique()

In [56]:
sessions_fake_buy = df_sessions[(df_sessions['session_id'].isin(pv_buy_1)) & (df_sessions['n_pageviews'] < 3)]['session_id'].unique()
len(sessions_fake_buy)

0

In [57]:
# remove sessions with fake purchase
df_sessions = df_sessions[~df_sessions['session_id'].isin(sessions_fake_buy)]

In [58]:
# sessions, where referrer = gopay and more than 2 pageviews
sessions = df_sessions[df_sessions['session_id'].isin(pv_buy_1)]['session_id'].unique()

In [59]:
for s in sessions:
    df_pageviews = df_pageviews[~((df_pageviews['session_id'] == s) & (df_pageviews['referrer1'].isin(['gate.gopay.cz', 'gopay.cz'])))]
    df_sessions.loc[df_sessions['session_id'] == s, 'n_buy'] = 0
    df_sessions.loc[df_sessions['session_id'] == s, 'buy'] = False
    df_sessions.loc[df_sessions['session_id'] == s, 'buy_hour'] = '-'
    df_sessions.loc[df_sessions['session_id'] == s, 'buy_day'] = '-'    

### DF_USERS

In [60]:
# create df_users
df_users = df_sessions.groupby(['user_id','session_id'])[['n_buy']].sum()

In [61]:
df_users.loc[df_users['n_buy'] > 0, 'buy'] = 1
df_users.loc[df_users['n_buy'] == 0, 'buy'] = 0

In [62]:
# sum of values per sessions
temp = df_sessions.groupby(['user_id','session_id'])[
    ['n_pageviews', 'n_input', 'n_load', 'n_click', 'n_mouse_click', 'n_rage_click',
     'n_mouse_move','n_scroll_move','n_wild_mouse','n_scrandom',
     'n_events', 'n_product', 'n_non_product', 'n_category', 'n_filter', 'n_search', 'n_cart', 
     'effective_duration', 'duration']].sum()

In [63]:
df_users = pd.merge(
    df_users, 
    temp, 
    left_on=['user_id','session_id'],
    right_index=True, 
    how='left'
)

In [64]:
# number of sessions per user
temp = pd.DataFrame(df_sessions['user_id'].value_counts()) 
temp.rename(columns={'user_id': 'n_sessions'}, inplace=True) 

df_users = pd.merge(
    df_users, 
    temp, 
    left_on='user_id', 
    right_index=True, 
    how='left'
)

In [65]:
df_sessions = df_sessions.sort_values(by=['started_at'])

df_users = pd.merge(
    df_users,
    df_sessions[['user_id', 'session_id','country', 'city', 'continent', 'started_at', 'region', 'day', 'device.type', 'browser.name', 'os.name', 'referrer1', 'referrer2', 'referrer3', 'hour', 'day_name', 'city_type', 'load_time']],
    on=['user_id','session_id'],
    how='left'
)

df_users.drop_duplicates('session_id', keep='first', inplace=True)

In [66]:
len(df_users['user_id'].unique())

3056

In [67]:
# time and day 1. pageview with completed purchase
df_pageviews_buy = df_pageviews[df_pageviews['pageview_buy'] == True]

df_pageviews_buy.loc[:,'buy_hour'] = pd.to_datetime(df_pageviews_buy['started_at'], unit='ms').dt.hour # transform timestamp to date time and select hour
df_pageviews_buy.loc[:,'buy_day'] = pd.to_datetime(df_pageviews_buy['started_at'], unit='ms').dt.day_name() # transform timestamp to date time and select day

df_pageviews_buy = df_pageviews_buy.sort_values(by=['started_at'])

In [68]:
df_users = pd.merge(
    df_users,
    df_pageviews_buy[['user_id', 'session_id','buy_hour', 'buy_day']],
    on=['user_id','session_id'],
    how='left'
)

In [69]:
df_users.drop_duplicates('session_id', keep='first', inplace=True)

In [70]:
# remove users where n_events = 0 and pageviews < 2
df_users = df_users[df_users['n_events'] != 0]
df_users = df_users[df_users['n_pageviews'] > 1]

In [71]:
# number of purchase sessions for each user
buy_count = df_users.set_index('user_id').buy.eq(1).sum(level=0).astype(int).reset_index()
buy_count = buy_count.rename(columns={'buy': 'buy_count'})

In [72]:
df_users = pd.merge(
    df_users,
    buy_count,
    on='user_id',
    how='left'
)

In [73]:
len(df_users['user_id'].unique())

2665

In [74]:
df_users = df_users.sort_values('started_at')

In [75]:
df_users['mean_n_ses_between_buys'] = '-'
df_users['mean_time_between_buys'] = '-'

In [76]:
for user in df_users['user_id'].unique():
    find = False
    i = 0
    day_1 = 0
    day_2 = 0
    zeros = list()
    for session in df_users[df_users['user_id'] == user]['session_id']:
        buy = df_users[df_users['session_id'] == session]['buy'].values[0]
        if find == False and buy == 1:
            day_1 = datetime.strptime(df_users[df_users['session_id'] == session]['day'].values[0], "%Y-%m-%d") 
            find = True
        elif find == True and buy == 0:
            i += 1
        elif find == True and buy == 1:            
            day_2 = datetime.strptime(df_users[df_users['session_id'] == session]['day'].values[0], "%Y-%m-%d")
            zeros.append(i)
            i = 0
    df_users.loc[df_users['user_id'] == user, 'mean_n_ses_between_buys'] = sum(zeros)/len(zeros) if zeros else 0
    df_users.loc[df_users['user_id'] == user, 'mean_time_between_buys'] = abs((day_2 - day_1).days) if (day_2 != 0 and day_1 != 0) else 0

In [77]:
df_users['repeat_buyer'] = np.where(df_users['buy_count'] > 1, True, False)

In [78]:
len(df_users[df_users['repeat_buyer'] == 1]['user_id'].unique())

205

In [79]:
len(df_users['user_id'].unique())

2665

In [80]:
# number of sessions between 1. and 2. purchase , if !2.purchase, select all sessions
df_users = df_users.sort_values('started_at')

In [81]:
df_users

Unnamed: 0,user_id,session_id,n_buy,buy,n_pageviews,n_input,n_load,n_click,n_mouse_click,n_rage_click,n_mouse_move,n_scroll_move,n_wild_mouse,n_scrandom,n_events,n_product,n_non_product,n_category,n_filter,n_search,n_cart,effective_duration,duration,n_sessions,country,city,continent,started_at,region,day,device.type,browser.name,os.name,referrer1,referrer2,referrer3,hour,day_name,city_type,load_time,buy_hour,buy_day,buy_count,mean_n_ses_between_buys,mean_time_between_buys,repeat_buyer
578,1684b3ff-5f8d-4ebd-94cf-76f77a4abc0f,61f25b20-4e73-11ea-b4c4-82e0ca2d49be,1,1.0,9,3,9,6,10,0,51,10,0,0,89,0,2,1,0,0,5,32015.0,125164.0,172,SK,Košice,EU,1581606809026,KI,2020-02-13,desktop,Firefox,Windows,-,-,-,15,Thursday,2,1315,15.0,Thursday,2,28,102,True
3395,8d4a4937-efdd-4a3d-8d4e-4c41182fe8ed,6d453880-4e73-11ea-a1f9-aa2f16c14ecb,1,1.0,7,2,6,1,4,0,29,15,0,0,57,0,1,0,0,0,5,13888.0,4159316.0,1,CZ,Olomouc,EU,1581606817071,71,2020-02-13,desktop,Firefox,Windows,particlepeptides.com,en,order,15,Thursday,0,0,15.0,Thursday,1,0,0,False
1843,44f25179-3952-45c2-b30b-ec718d990b17,db0c08b0-4e84-11ea-bd80-ad8d00e50b45,0,0.0,4,1,4,5,4,0,0,78,0,0,92,1,0,1,0,2,0,32510.0,187114.0,40,CZ,Stradonice,EU,1581614306953,20,2020-02-13,mobile,Google Search,iOS,-,-,-,17,Thursday,0,10922,,,2,2,27,True
1831,44ea6ae5-54fc-4ac4-ba7a-c9b28dc91faf,38898d50-4e8f-11ea-9ad7-8f10274d5a3b,0,0.0,3,0,3,16,17,0,0,18,0,0,54,0,2,1,0,0,0,11446.0,109947.0,25,SK,Nitra,EU,1581618759881,NI,2020-02-13,mobile,Chrome,Android,-,-,-,18,Thursday,3,17046,,,1,0,0,False
2224,588a6d13-75ce-4ef5-a543-7c1bff6eaed9,ebeb9210-4e91-11ea-87ce-ff613faa86ec,0,0.0,3,1,2,2,3,0,0,7,0,0,15,0,0,0,0,0,3,3961.0,43665.0,21,CZ,Ricany,EU,1581619822327,20,2020-02-13,mobile,Chrome,Android,-,-,-,18,Thursday,0,0,,,1,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4268,bb641fc5-0a01-4cc6-a09d-d0d2c5a13276,938f1980-d43b-11eb-9af1-0c548914e2ec,1,1.0,17,9,16,37,42,0,0,113,0,15,232,1,3,3,0,0,9,146633.0,835506.0,1,SK,Bratislava,EU,1624463790682,BL,2021-06-23,mobile,Samsung Internet,Android,www.google.com,-,-,15,Wednesday,1,0,16.0,Wednesday,1,0,0,False
4139,b4b0042a-22b9-43e6-9b9f-eb1e44cd04d8,683e4170-d4bd-11eb-9209-d91894e91e5c,2,1.0,10,26,10,62,78,4,0,220,0,19,419,1,1,0,0,0,6,128832.0,710671.0,1,SK,Rajecke Teplice,EU,1624519550729,ZI,2021-06-24,mobile,Chrome,Android,-,-,-,7,Thursday,0,5365,7.0,Thursday,1,0,0,False
2420,6355b371-c59e-48a5-90e4-caa69cd19984,eba22770-d4f9-11eb-9877-607024b94d4e,1,1.0,22,23,20,38,53,0,0,154,0,6,294,0,6,1,0,5,9,109188.0,2001453.0,1,SK,Bratislava,EU,1624545540260,BL,2021-06-24,mobile,Chrome,Android,-,-,-,14,Thursday,1,0,14.0,Thursday,1,0,0,False
2668,6eb86cfe-bc02-469e-b399-ec6d242d0620,6a8b12d0-d519-11eb-9634-3ee9858b1072,1,1.0,39,6,38,64,72,1,0,467,0,46,694,8,7,5,0,0,18,262872.0,1418043.0,1,CZ,Prague,EU,1624559068111,10,2021-06-24,mobile,Chrome,Android,-,-,-,18,Thursday,0,0,18.0,Thursday,1,0,0,False


In [82]:
# average number of days between 2 buys
print(df_users[df_users['day'] > "2020-06-02"]['mean_time_between_buys'].mean())
# median number of days between 2 buys
print(df_users[df_users['mean_time_between_buys'] > 0]['mean_time_between_buys'].median())
# min number of days between 2 buys
print(df_users[df_users['mean_time_between_buys'] > 1]['mean_time_between_buys'].min())
# max number of days between 2 buys
print(df_users[df_users['mean_time_between_buys'] < 362]['mean_time_between_buys'].max())

25.866370945383977
55.0
2
254


In [None]:
d1 = datetime.strptime(df_users[df_users['session_id'] == session]['day'].values[0], "%Y-%m-%d") 

In [None]:
df_users[(df_users['user_id'] == '1684b3ff-5f8d-4ebd-94cf-76f77a4abc0f') & (df_users['buy'] == 1)]

In [None]:
df_users[df_users['user_id'] == 'fc01d19f-a1a0-4bca-8ec0-a0bff986e7e3']

In [None]:
df_users[(df_users['mean_time_between_buys'] == 464) & (df_users['buy'] == 1)]

In [None]:
df_users_2 = pd.DataFrame()
for user in df_users['user_id'].unique():
    find = False
    for session in df_users[df_users['user_id'] == user]['session_id']:
        buy = df_users[df_users['session_id'] == session]['buy'].values[0]
        if find == False and buy == 1:
            find = True
            df_users_2 = df_users_2.append(df_users[df_users['session_id'] == session])
        elif find == True and buy == 0:
            df_users_2 = df_users_2.append(df_users[df_users['session_id'] == session])
        elif find == True and buy == 1: 
            df_users_2 = df_users_2.append(df_users[df_users['session_id'] == session])
            break

In [None]:
len(df_users_2['user_id'].unique())

In [None]:
# number of sessions between 1. and 2. purchase 
# number of sessions after 1. purchase 
n_sessions_after_1_buy = df_users_2.groupby('user_id')['session_id'].count().astype(int).reset_index()
n_sessions_after_1_buy.rename(columns={'session_id' : "n_sessions_after_1_buy"}, inplace = True)

In [None]:
df_users_2 = pd.merge(
    df_users_2,
    n_sessions_after_1_buy,
    on='user_id',
    how='left'
)

In [None]:
len(df_users_2['user_id'].unique())

In [None]:
df_users_2 = df_users_2[df_users_2['n_sessions_after_1_buy'] > 1]
df_users_2['mean_n_ses_between_buys'] = df_users_2['mean_n_ses_between_buys'].astype(float)

In [None]:
len(df_users_2['user_id'].unique())

In [None]:
# number of sessions after 1. purchase 
counter = collections.Counter(round(df_users_2[df_users_2['mean_n_ses_between_buys'] > 0 ]['mean_n_ses_between_buys']))
print(counter)

labels = counter.keys()
sizes = counter.values()

fig1, ax1 = plt.subplots()
patches, texts = ax1.pie(sizes, labels=labels, startangle=90)
plt.legend(labels, loc="best")
ax1.axis('equal')

plt.show()

In [None]:
# cut number of sessions, max avg session count between 2 purchases after 1. purchase
session_limit = round(df_users_2[df_users_2['mean_n_ses_between_buys'] > 0 ]['mean_n_ses_between_buys'].mean())
session_limit

In [None]:
temp = pd.DataFrame()

In [None]:
# selection_method = 1 # session with 1. purchase and sessions without purchase
selection_method = 2 # also session with 2. purchase

In [None]:
if selection_method == 1:
    # select sessions after 1. purchase, if session count < limit + 1, or limit + 2 - select all sessions
    temp = temp.append(df_users_2[(df_users_2['buy_count'] <= 1) & (df_users_2['n_sessions_after_1_buy'] <= (session_limit + 1))].sort_values('started_at',ascending=True), ignore_index=False)
    df = df_users_2[(df_users_2['buy_count'] > 1) & (df_users_2['n_sessions_after_1_buy'] <= (session_limit + 2))].sort_values('started_at',ascending=True)
    temp = temp.append(df.drop(df.groupby('user_id').tail(1).index,axis=0), ignore_index=False)
    
    # select 1. purchase
    temp = temp.append(df_users_2[(df_users_2['buy_count'] > 1) & (df_users_2['n_sessions_after_1_buy'] > (session_limit + 2))].sort_values('started_at',ascending=True).groupby('user_id').head(1), ignore_index=False)
    temp = temp.append(df_users_2[(df_users_2['buy_count'] <= 1) & (df_users_2['n_sessions_after_1_buy'] > (session_limit + 1))].sort_values('started_at',ascending=True).groupby('user_id').head(1), ignore_index=False)
        
    # select last x sessions (x = session limit)
    df = df_users_2[(df_users_2['buy_count'] > 1) & (df_users_2['n_sessions_after_1_buy'] > (session_limit + 2))].sort_values('started_at',ascending=False)
    df = df.drop(df.groupby('user_id').head(1).index,axis=0)
    temp = temp.append(df.groupby('user_id').head(session_limit), ignore_index=False)
    temp = temp.append(df_users_2[(df_users_2['buy_count'] <= 1) & (df_users_2['n_sessions_after_1_buy'] > (session_limit + 1))].sort_values('started_at',ascending=False).groupby('user_id').head(session_limit), ignore_index=False)

if selection_method == 2:
    # select sessions after 1. purchase, if session count < limit + 1, or limit + 2 - select all sessions and 2.purchase
    temp = temp.append(df_users_2[(df_users_2['buy_count'] > 1) & (df_users_2['n_sessions_after_1_buy'] <= (session_limit + 2))].sort_values('started_at',ascending=True), ignore_index=False)
    temp = temp.append(df_users_2[(df_users_2['buy_count'] <= 1) & (df_users_2['n_sessions_after_1_buy'] <= (session_limit + 1))].sort_values('started_at',ascending=True), ignore_index=False)

    # select 1. purchase
    temp = temp.append(df_users_2[(df_users_2['buy_count'] > 1) & (df_users_2['n_sessions_after_1_buy'] > (session_limit + 2))].sort_values('started_at',ascending=True).groupby('user_id').head(1), ignore_index=False)
    temp = temp.append(df_users_2[(df_users_2['buy_count'] <= 1) & (df_users_2['n_sessions_after_1_buy'] > (session_limit + 1))].sort_values('started_at',ascending=True).groupby('user_id').head(1), ignore_index=False)

    # select last x sessions (x = session limit), or last sessions + 2.purchase
    temp = temp.append(df_users_2[(df_users_2['buy_count'] > 1) & (df_users_2['n_sessions_after_1_buy'] > (session_limit + 2))].sort_values('started_at',ascending=False).groupby('user_id').head(session_limit + 1), ignore_index=False)
    temp = temp.append(df_users_2[(df_users_2['buy_count'] <= 1) & (df_users_2['n_sessions_after_1_buy'] > (session_limit + 1))].sort_values('started_at',ascending=False).groupby('user_id').head(session_limit), ignore_index=False)


In [None]:
temp = temp.sort_values('started_at') 

In [None]:
df_users = temp
df_sessions = df_sessions[df_sessions['session_id'].isin(df_users_2['session_id'])]
df_pageviews = df_pageviews[df_pageviews['session_id'].isin(df_users_2['session_id'])]

In [None]:
len(df_users['user_id'].unique())

In [None]:
# number of visits of the most visited product and category per user and session
df_users = pd.merge(
    df_users,
    df_sessions[['session_id','top_product','n_top_product','n_unique_product','top_category','n_top_category','n_unique_category']],
    on='session_id',
    how='left'
)

In [None]:
# number of visits of the most visited product per user
if (project == 'dp' or project == 'he'):
    url_name = 'url2'
elif (project == 'pp' or project == 'ba'):
    url_name = 'url3'

In [None]:
df_pageviews_product = df_pageviews[df_pageviews['pageview_product'] == True][['user_id', url_name]] 

In [None]:
if (project == 'dp' or project == 'he' or project == 'ba'):
    users_top_product = df_pageviews_product.groupby(['user_id'])[url_name].apply(lambda x: pd.Series.mode(x)[0]).to_frame()
elif (project == 'pp'):
    users_top_product = df_pageviews_product.groupby(['user_id'])[url_name].agg(pd.Series.mode).to_frame()

In [None]:
users_top_product.rename(columns={url_name: 'top_product_u'}, inplace=True)

df_users = pd.merge(
    df_users,
    users_top_product,
    on='user_id',
    how='left'
)

In [None]:
users_n_top_product = df_pageviews_product.groupby(['user_id', url_name])[url_name].count().to_frame()
users_n_top_product.rename(columns={url_name: 'n_top_product_u'}, inplace=True)
users_n_top_product = users_n_top_product.sort_values(by=['n_top_product_u'], ascending=False)

df_users = pd.merge(
    df_users,
    users_n_top_product,
    on='user_id',
    how='left'
)

df_users.drop_duplicates('session_id', keep='first', inplace=True)
df_users = df_users.reset_index(drop=True)

In [None]:
users_n_unique_product = df_pageviews_product.groupby(['user_id'])[url_name].nunique().to_frame()
users_n_unique_product.rename(columns={url_name: 'n_unique_product_u'}, inplace=True)

df_users = pd.merge(
    df_users,
    users_n_unique_product,
    on='user_id',
    how='left'
)

In [None]:
# number of visits of the most visited category per user
if (project == 'dp' or project == 'he'):
    url_name = 'url1'
elif (project == 'pp' or project == 'ba'): # pp doesn't have categories
    url_name = 'url2'

In [None]:
df_pageviews_category = df_pageviews[df_pageviews['pageview_category'] == True][['user_id', url_name]]

In [None]:
users_top_category = df_pageviews_category.groupby(['user_id'])[url_name].apply(lambda x: pd.Series.mode(x)[0]).to_frame()
users_top_category.rename(columns={url_name: 'top_category_u'}, inplace=True)

df_users = pd.merge(
    df_users,
    users_top_category,
    on='user_id',
    how='left'
)

In [None]:
users_n_top_category = df_pageviews_category.groupby(['user_id', url_name])[url_name].count().to_frame()
users_n_top_category.rename(columns={url_name: 'n_top_category_u'}, inplace=True)
users_n_top_category = users_n_top_category.sort_values(by=['n_top_category_u'], ascending=False)

df_users = pd.merge(
    df_users,
    users_n_top_category,
    on='user_id',
    how='left'
)

df_users.drop_duplicates('session_id', keep='first', inplace=True)
df_users = df_users.reset_index(drop=True)

In [None]:
users_n_unique_category = df_pageviews_category.groupby(['user_id'])[url_name].nunique().to_frame()
users_n_unique_category.rename(columns={url_name: 'n_unique_category_u'}, inplace=True)

df_users = pd.merge(
    df_users,
    users_n_unique_category,
    on='user_id',
    how='left'
)

In [None]:
# number of add to cart clicks per session
if (project == 'he'):
    temp = pd.DataFrame(df_click[(df_click['args.text'] == "Pridať do košíka") | (df_click['args.text'] == "Do košíka") |
                                 (df_click['args.text'] == "Add to basket") | (df_click['args.text'] == "To basket") |
                                 (df_click['args.text'] == "Přidat do košíku") | (df_click['args.text'] == "Do košíku")]["session_id"].value_counts()) 
    temp.rename(columns={"session_id": "n_add_to_cart"}, inplace=True) 

if (project == 'pp'):
    temp = pd.DataFrame(df_click[(df_click['args.text'] == "Pridať do košíka") | (df_click['args.text'] == "Add to cart")]["session_id"].value_counts()) 
    temp.rename(columns={"session_id": "n_add_to_cart"}, inplace=True) 

if (project == 'ba'):
    temp = pd.DataFrame(df_click[(df_click['args.text'] == "Vložiť do košíka") | (df_click['args.text'] == "Do košíka")]["session_id"].value_counts()) 
    temp.rename(columns={"session_id": "n_add_to_cart"}, inplace=True)

df_sessions = pd.merge(df_sessions, temp, left_on='session_id', right_index=True, how='left')

df_users = pd.merge(
    df_users,
    df_sessions[['session_id','n_add_to_cart']],
    on='session_id',
    how='left'
)

df_users['n_add_to_cart'] = df_users['n_add_to_cart'].fillna(0)

In [None]:
# number of remove from cart clicks per session
if (project == 'he'):
    temp = pd.DataFrame(df_click[(df_click['args.text'] == "Odstrániť z košíka") | (df_click['args.text'] == "Odstranit z košíku")]["session_id"].value_counts()) 
    temp.rename(columns={"session_id": "n_remove_from_cart"}, inplace=True) 

if (project == 'pp'):
    temp = pd.DataFrame(df_click[(df_click['args.text'] == "Vymazať") | (df_click['args.text'] == "Delete")]["session_id"].value_counts())
    temp.rename(columns={"session_id": "n_remove_from_cart"}, inplace=True) 

if (project == 'ba'):
    temp = pd.DataFrame(df_click[df_click['args.text'] == "Odstrániť"]["session_id"].value_counts())
    temp.rename(columns={"session_id": "n_remove_from_cart"}, inplace=True) 

df_sessions = pd.merge(df_sessions, temp, left_on='session_id', right_index=True, how='left') 

df_users = pd.merge(
    df_users,
    df_sessions[['session_id','n_remove_from_cart']],
    on='session_id',
    how='left'
)

df_users['n_remove_from_cart'] = df_users['n_remove_from_cart'].fillna(0)

In [None]:
# number of next page clicks - only He, Ba - PP doesn't have next pages
pages = list(range(0, 500))
pages = [str(x) for x in pages]

if (project == 'he'):
    temp = pd.DataFrame(df_click[(df_click['args.text'] == "Ďalšie produkty") | (df_click['args.text'] == "Ďalšia strana") |
                                (df_click['args.text'] == "Další produkty") | (df_click['args.text'] == "Další strana")]["session_id"].value_counts())
    temp.rename(columns={"session_id": "n_next_page"}, inplace=True) 


if (project == 'ba'):
    temp = pd.DataFrame(df_click[(df_click['args.text'] == "nasledujúca strana") | 
                                 (df_click['args.text'].isin(pages))]["session_id"].value_counts())
    temp.rename(columns={"session_id": "n_next_page"}, inplace=True) 

if (project == 'he') or (project == 'ba'):
    df_sessions = pd.merge(df_sessions, temp, left_on='session_id', right_index=True, how='left')

    df_users = pd.merge(
        df_users,
        df_sessions[['session_id','n_next_page']],
        on='session_id',
        how='left'
    )

    df_users['n_next_page'] = df_users['n_next_page'].fillna(0)

In [None]:
# transformation NaN to 0 and float to int
numeric_values = [
    'n_pageviews', 'n_input', 'n_load',
    'n_click', 'n_mouse_click',  'n_rage_click',
    'n_mouse_move', 'n_scroll_move', 'n_wild_mouse', 
    'n_scrandom',
    'n_events',
    'n_product', 'n_non_product', 'n_category',
    'n_filter', 'n_search',
    'n_cart', 'n_buy',
    'effective_duration', 'duration',
    'n_sessions',
    'hour',
    'city_type',
    'load_time',
    'n_top_product_u', 'n_unique_product_u',
    'n_top_category_u', 'n_unique_category_u',
    'n_add_to_cart', 'n_remove_from_cart', 
#     'n_next_page'
]

for column in numeric_values:
    df_users[column] = df_users[column].fillna(0)
    df_users[column] = df_users[column].astype(int)

In [None]:
# transformation NaN to '-' 
string_values = [
    'country', 'city', 'continent', 'region',
    'device.type', 'browser.name', 'os.name',
    'referrer1', 'referrer2', 'referrer3',
    'day_name', 
    'day',
    'buy_hour',
    'buy_day',
    'top_product_u', 'top_category_u',
]

for column in string_values:
    df_users[column].fillna('-', inplace = True)

In [None]:
len(df_users['user_id'].unique())

In [None]:
# export CSV 
# example filename: csv_he + df_users_metrics_ + 1 
df_users.to_csv(folder + '/df_users_metrics_' + str(selection_method) + '.csv', index=False, encoding='utf-8-sig')
df_sessions.to_csv(folder + '/df_sessions_metrics_' + str(selection_method) + '.csv', index=False, encoding='utf-8-sig')
df_pageviews.to_csv(folder + '/df_pageviews_metrics_' + str(selection_method) + '.csv', index=False, encoding='utf-8-sig')

# UNCHANGED
# df_inactivities
# df_input
# df_load
# df_click
# df_mouse_click
# df_rage_click
# df_mouse_move
# df_scroll_move
# df_wild_mouse
# df_scrandom