In [15]:
import numpy as np 
import pandas as pd
import collections # counter
import time
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE, ADASYN # oversampling
from copy import *
from statistics import mean

from sklearn import preprocessing
from sklearn.impute import SimpleImputer # handling missing data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler # encoding categorical data, feature scaling
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split # splitting training and testing data
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.datasets import make_classification
from sklearn.utils import column_or_1d
from sklearn.model_selection import KFold

In [16]:
# Directory holding the input/output CSVs for this run.
# Other variants seen so far: 'csv_he', 'csv_pp_500'.
folder = 'csv_b_500'
# Numeric suffix embedded in the CSV file names (alternative value: 1).
selection_method = 2

In [17]:
# Load the per-user table from '<folder>/df_users_transform_<selection_method>.csv'.
df_users = pd.read_csv(f'{folder}/df_users_transform_{selection_method}.csv')

In [18]:
# Total number of rows in the loaded table.
df_users.shape[0]

904

In [19]:
# Rows labelled repeat_buyer == 1.
df_users.loc[df_users['repeat_buyer'] == 1].shape[0]

186

In [20]:
# Rows labelled repeat_buyer == 0.
df_users.loc[df_users['repeat_buyer'] == 0].shape[0]

718

In [21]:
# Class balance: count of repeat_buyer == 1 rows divided by count of repeat_buyer == 0 rows.
(
    df_users.loc[df_users['repeat_buyer'] == 1].shape[0]
    / df_users.loc[df_users['repeat_buyer'] == 0].shape[0]
)

0.2590529247910863

In [None]:
# Name of the target column.
labeled_column = 'repeat_buyer'

# Every feature column considered by this notebook, in the original order.
useful_columns = [
    # event counters, one "_buy" / "_no_buy" pair per event kind
    'n_pageviews_buy', 'n_pageviews_no_buy', 'n_input_buy', 'n_input_no_buy',
    'n_click_buy', 'n_click_no_buy', 'n_mouse_click_buy', 'n_mouse_click_no_buy',
    'n_mouse_move_buy', 'n_mouse_move_no_buy', 'n_scroll_move_buy', 'n_scroll_move_no_buy',
    'n_scrandom_buy', 'n_scrandom_no_buy', 'n_events_buy', 'n_events_no_buy',
    # page-type counters
    'n_product_buy', 'n_product_no_buy', 'n_non_product_buy', 'n_non_product_no_buy',
    'n_category_buy', 'n_category_no_buy', 'n_filter_buy', 'n_filter_no_buy',
    'n_search_buy', 'n_search_no_buy', 'n_cart_buy', 'n_cart_no_buy',
    'n_add_to_cart_buy', 'n_add_to_cart_no_buy',
    'n_remove_from_cart_buy', 'n_remove_from_cart_no_buy',
    # disabled: 'n_next_page_buy', 'n_next_page_no_buy',
    # durations and load times
    'effective_duration_buy', 'effective_duration_no_buy',
    'pv_product_mean_eff_duration_buy', 'pv_product_mean_eff_duration_no_buy',
    'user_mean_eff_duration',
    'load_time_buy', 'load_time_no_buy',
    # session counters
    'num_of_sessions', 'n_sessions_after_1_buy', 'n_sessions',
    # per-user product / category summaries
    'top_product_u', 'n_top_product_u', 'n_unique_product_u',
    'top_category_u', 'n_top_category_u', 'n_unique_category_u',
    # location, client and referrer
    'country', 'city', 'city_type', 'continent', 'region',
    'device.type', 'browser.name', 'os.name',
    'referrer1',
    # time / day features
    'buy_time_1', 'time_1', 'buy_time_2', 'time_2', 'buy_time_3', 'time_3',
    'buy_day_1', 'day_1', 'buy_day_2', 'day_2',
    'mean_price',
    # disabled: 'mean_rate',
]

# Subset of useful_columns treated as numeric. The order is significant:
# it determines the row/column order of the correlation matrix.
numeric_columns = [
    'n_pageviews_buy', 'n_pageviews_no_buy', 'n_input_buy', 'n_input_no_buy',
    'n_click_buy', 'n_click_no_buy', 'n_mouse_click_buy', 'n_mouse_click_no_buy',
    'n_mouse_move_buy', 'n_mouse_move_no_buy', 'n_scroll_move_buy', 'n_scroll_move_no_buy',
    'n_scrandom_buy', 'n_scrandom_no_buy', 'n_events_buy', 'n_events_no_buy',
    'n_product_buy', 'n_product_no_buy', 'n_non_product_buy', 'n_non_product_no_buy',
    'n_category_buy', 'n_category_no_buy', 'n_filter_buy', 'n_filter_no_buy',
    'n_search_buy', 'n_search_no_buy', 'n_cart_buy', 'n_cart_no_buy',
    'n_add_to_cart_buy', 'n_add_to_cart_no_buy',
    'n_remove_from_cart_buy', 'n_remove_from_cart_no_buy',
    # disabled: 'n_next_page_buy', 'n_next_page_no_buy',
    'effective_duration_buy', 'effective_duration_no_buy',
    'pv_product_mean_eff_duration_buy', 'pv_product_mean_eff_duration_no_buy',
    'user_mean_eff_duration',
    'num_of_sessions', 'n_sessions_after_1_buy', 'n_sessions',
    'load_time_buy', 'load_time_no_buy',
    'n_top_product_u', 'n_unique_product_u',
    'n_top_category_u', 'n_unique_category_u',
    'mean_price',
    # disabled: 'mean_rate',
    'buy_time_1', 'time_1', 'buy_time_2', 'time_2', 'buy_time_3', 'time_3',
    'buy_day_1', 'day_1', 'buy_day_2', 'day_2',
]

# Columns dropped from the frame later in the notebook
# ('continent' and 'country' are currently kept).
removed_columns = [
    'region', 'city',
    'top_product_u', 'top_category_u',
    'buy_time_1', 'buy_time_2', 'buy_time_3',
    'buy_day_1', 'buy_day_2',
]

# The numeric members of removed_columns.
removed_numeric_columns = [
    'buy_time_1', 'buy_time_2', 'buy_time_3',
    'buy_day_1', 'buy_day_2',
]

# Whatever is not listed as numeric is treated as categorical.
categorical_columns = [name for name in useful_columns if name not in numeric_columns]
removed_categorical_columns = [
    name for name in removed_columns if name not in removed_numeric_columns
]

In [None]:
# Pearson correlation between the retained numeric features and the label.
# The comprehension already yields a fresh list, so no extra copy() is needed.
corr_arr = [c for c in numeric_columns if c not in removed_numeric_columns]
corr_arr.append(labeled_column)

correlation_num = df_users[corr_arr].corr(method = 'pearson')
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0.
# An explicit format string renders two decimals on old and new pandas alike.
correlation_num.style.background_gradient(cmap='coolwarm', axis=None).format('{:.2f}')

In [None]:
# Print the number of distinct values found in each categorical column.
for c in categorical_columns:
    n_distinct = len(df_users[c].unique())
    print(f'{c}: {n_distinct}')

In [None]:
# Discard the excluded columns: categorical ones first, then numeric ones.
df_users = df_users.drop(columns=removed_categorical_columns)
df_users = df_users.drop(columns=removed_numeric_columns)

In [None]:
# Plot styling: dark-grid theme with a custom three-colour cycle.
colors = ["#50D4F0", "#AEF06C", "#FACE63"]
sns.set(style="darkgrid")
# set_palette resolves its argument through color_palette itself.
sns.set_palette(colors)

def boxplot_visualization(x, y, data=None):
    """Draw a box plot of column ``y`` grouped by column ``x`` on a new figure.

    Parameters
    ----------
    x : str
        Column whose values define the groups on the horizontal axis.
    y : str
        Column whose values are plotted; the vertical axis is log-scaled.
    data : pandas.DataFrame, optional
        Frame holding both columns. When omitted, the notebook-level
        ``df_users`` is looked up at call time, so existing two-argument
        calls behave as before.
    """
    if data is None:
        data = df_users
    _fig, axes = plt.subplots()
    # x and y must be keywords: seaborn 0.11 deprecated positional x/y and
    # 0.12+ rejects them. Passing ax= ties the plot to the figure made here.
    sns.boxplot(x=x, y=y, data=data, ax=axes)
    axes.set_yscale('log')

In [None]:
# One log-scaled box plot per feature, split by the repeat_buyer label.
# The features are walked in three batches; `columns` ends up bound to the
# last batch, exactly as when the batches were assigned one after another.
for columns in (
    # interaction-event counters
    [
        'n_pageviews_buy', 'n_pageviews_no_buy',
        'n_input_buy', 'n_input_no_buy',
        'n_click_buy', 'n_click_no_buy',
        'n_mouse_move_buy', 'n_mouse_move_no_buy',
        'n_mouse_click_buy', 'n_mouse_click_no_buy',
        'n_scroll_move_buy', 'n_scroll_move_no_buy',
        'n_events_buy', 'n_events_no_buy',
    ],
    # page-type counters
    [
        'n_product_buy', 'n_product_no_buy',
        'n_non_product_buy', 'n_non_product_no_buy',
        'n_category_buy', 'n_category_no_buy',
        'n_filter_buy', 'n_filter_no_buy',
        'n_search_buy', 'n_search_no_buy',
        'n_cart_buy', 'n_cart_no_buy',
        'n_add_to_cart_buy', 'n_add_to_cart_no_buy',
        # disabled: 'n_next_page_buy', 'n_next_page_no_buy'
    ],
    # durations, load times and session counts
    [
        'effective_duration_buy', 'effective_duration_no_buy',
        'pv_product_mean_eff_duration_buy', 'pv_product_mean_eff_duration_no_buy',
        'load_time_buy', 'load_time_no_buy',
        'n_sessions_after_1_buy', 'num_of_sessions',
    ],
):
    for c in columns:
        boxplot_visualization('repeat_buyer', c)

In [None]:
# Two figures: the time_* columns side by side, then the day_* columns.
# pd.melt reshapes each selection to long form ("variable", "value") so every
# original column becomes one box; the value axis is log-scaled.
for selected in (['time_1','time_2','time_3'], ['day_1','day_2']):
    fig, axes = plt.subplots()
    long_form = pd.melt(df_users[selected])
    axes = sns.boxplot(x="variable", y="value", data=long_form)
    axes.set_yscale('log')

In [None]:
# Drop the two session-count columns from the frame before exporting.
removed = [
    'n_sessions_after_1_buy',
    'num_of_sessions',
]
df_users = df_users.drop(columns=removed)

In [None]:
# Write the reduced table to '<folder>/df_users_columns_<selection_method>.csv'
# without the index, encoded as UTF-8 with a BOM.
df_users.to_csv(
    f'{folder}/df_users_columns_{selection_method}.csv',
    index=False,
    encoding='utf-8-sig',
)