In [1]:
# Setting auto reloading for imported modules
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from auxiliary_functions import PreprocessingUtils

In [3]:
prep_utils = PreprocessingUtils()

**Feature Engineering**

In [5]:
df_features = pd.read_csv('../data/clean_data/clean_sessions_with_cr.csv')
df_features.head()

Unnamed: 0,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_screen_resolution,device_browser,geo_country,geo_city,CR
0,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,360x720,Chrome,Russia,Zlatoust,0
1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,Android,Samsung,385x854,Samsung Internet,Russia,Moscow,0
2,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,360x720,Chrome,Russia,Krasnoyarsk,0
3,kjsLglQLzykiRbcDiGcD,cpc,(not set),NOBKLgtuvqYWkXQHeYWM,(not set),mobile,Android,Xiaomi,393x786,Chrome,Russia,Moscow,0
4,kjsLglQLzykiRbcDiGcD,cpc,(not set),(not set),(not set),mobile,Android,Xiaomi,393x786,Chrome,Russia,Moscow,0


In [5]:
df_features['is_organic'] = df_features['utm_medium'].apply(lambda x: 1 if x in ('organic', 'referral', '(none)') else 0)

In [6]:
social_media_sources = ['QxAxdyPLuQMEcrdZWdWb', 'MvfHsxITijuriZxsqZqt', 'ISrKoXQCxqqYvAZICvjs', 'IZEXUFLARCUMynmHNBGo', 'PlbkrSYoHuZBWfYjYnfw', 'gVRrcxiDQubJiljoTbGm']
df_features['is_social_media_ad'] = df_features['utm_source'].apply(lambda x: 1 if x in social_media_sources else 0)

In [8]:
df_features['in_app_browser'] = df_features.device_browser.apply(lambda x: 1 if x == 'safari (in-app)' or '.' in x else 0)

In [9]:
top_browsers = ['chrome', 'safari', 'firefox']
df_features['is_top_browser'] = df_features.device_browser.apply(lambda x: 1 if x in top_browsers else 0)

In [10]:
popular_brands = ['samsung', 'apple', 'xiaomi', 'huawei']
df_features['is_popular_brand'] = df_features.device_brand.apply(lambda x: 1 if x in popular_brands else 0)

In [11]:
df_features['screen_width'] = df_features.device_screen_resolution.apply(lambda x: x.split('x')[0]).astype('int')
df_features['screen_height'] = df_features.device_screen_resolution.apply(lambda x: x.split('x')[1]).astype('int')
df_features = df_features.drop(columns=['device_screen_resolution'], axis=1)

In [12]:
df_features.head()

Unnamed: 0,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_browser,geo_country,geo_city,CR,is_organic,is_social_media_ad,in_app_browser,is_top_browser,is_popular_brand,screen_width,screen_height
0,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,Chrome,Russia,Zlatoust,0,0,0,0,0,0,360,720
1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,Android,Samsung,Samsung Internet,Russia,Moscow,0,0,1,0,0,0,385,854
2,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,Chrome,Russia,Krasnoyarsk,0,0,0,0,0,0,360,720
3,kjsLglQLzykiRbcDiGcD,cpc,(not set),NOBKLgtuvqYWkXQHeYWM,(not set),mobile,Android,Xiaomi,Chrome,Russia,Moscow,0,0,0,0,0,0,393,786
4,kjsLglQLzykiRbcDiGcD,cpc,(not set),(not set),(not set),mobile,Android,Xiaomi,Chrome,Russia,Moscow,0,0,0,0,0,0,393,786


Categorical data transformation

In [13]:
categorical_features = df_features.select_dtypes(include=['object']).columns
for feature in categorical_features:
    print(f'Column {feature} have {df_features[feature].nunique()} unique values')

Column utm_source have 281 unique values
Column utm_medium have 55 unique values
Column utm_campaign have 407 unique values
Column utm_adcontent have 281 unique values
Column utm_keyword have 1193 unique values
Column device_category have 3 unique values
Column device_os have 13 unique values
Column device_brand have 200 unique values
Column device_browser have 55 unique values
Column geo_country have 159 unique values
Column geo_city have 2389 unique values


Using OneHotEncoder for all of these values will cause high dimensionality of the dataset and will cause poor performance, hence for some features with high cardinality target encoding will be used.

In [14]:
high_cardinality_features = []
low_cardinality_features = []

for feature in categorical_features:
    if df_features[feature].nunique() > 400:
        high_cardinality_features.append(feature)
    else:
        low_cardinality_features.append(feature)

print(f'High cardinality features are: {high_cardinality_features}')
print(f'Low/Medium cardinality features are: {low_cardinality_features}')

High cardinality features are: ['utm_campaign', 'utm_keyword', 'geo_city']
Low/Medium cardinality features are: ['utm_source', 'utm_medium', 'utm_adcontent', 'device_category', 'device_os', 'device_brand', 'device_browser', 'geo_country']


In [None]:
df_features = prep_utils.categorical_feature_ohe(df=df_features, column='utm_source')

In [None]:
df_features = prep_utils.categorical_feature_ohe(df=df_features, column='utm_medium')

In [None]:
df_features = prep_utils.categorical_feature_ohe(df=df_features, column='utm_adcontent')

In [None]:
df_features = prep_utils.categorical_feature_ohe(df=df_features, column='device_category')

In [None]:
df_features = prep_utils.categorical_feature_ohe(df=df_features, column='device_os')

In [None]:
df_features = prep_utils.categorical_feature_ohe(df=df_features, column='device_brand')

In [None]:
df_features = prep_utils.categorical_feature_ohe(df=df_features, column='device_browser')

In [None]:
df_features = prep_utils.categorical_feature_ohe(df=df_features, column='geo_country')

In [None]:
df_features.head()

In [None]:
for feature in high_cardinality_features:
    df_features = prep_utils.categorical_feature_te(df=df_features, column=feature)

In [None]:
df_features.head()

Numerical data transformation

In [None]:
numeric_features = ['screen_width', 'screen_height']

for feature in numeric_features:
    prep_utils.numerical_feature_std(df=df_features, column=feature)

In [None]:
df_features.head()

In [None]:
df_features = df_features.drop(columns=numeric_features, axis=1)

In [None]:
# df_features.to_csv('data/clean_data/df_sessions_w_feats.csv', index=False)