In [None]:
# Setting auto reloading for imported modules
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from auxiliary_functions import PreprocessingUtils

In [None]:
# Single helper instance reused by the encoding and scaling cells further down.
prep_utils = PreprocessingUtils()

**Feature Engineering**

In [None]:
# Load the sessions table from the clean-data folder; the cells below add to and transform this frame.
df_features = pd.read_csv('../data/clean_data/clean_sessions_with_cr.csv')
# Show the first rows as the cell output.
df_features.head()

In [None]:
# Flag sessions whose medium is one of the non-paid values (1 = organic, 0 = anything else).
organic_mediums = ['organic', 'referral', '(none)']
df_features['is_organic'] = df_features['utm_medium'].isin(organic_mediums).astype('int64')

In [None]:
# Source identifiers treated as social-media advertising.
social_media_sources = [
    'QxAxdyPLuQMEcrdZWdWb',
    'MvfHsxITijuriZxsqZqt',
    'ISrKoXQCxqqYvAZICvjs',
    'IZEXUFLARCUMynmHNBGo',
    'PlbkrSYoHuZBWfYjYnfw',
    'gVRrcxiDQubJiljoTbGm',
]
# 1 when the session's source is in the list above, 0 otherwise.
df_features['is_social_media_ad'] = (
    df_features['utm_source'].isin(social_media_sources).astype('int64')
)

In [None]:
# Flag in-app browsers: either the literal 'safari (in-app)' value or any browser
# name containing a dot (presumably package-style identifiers — verify against the data).
# Vectorised string ops replace the row-wise lambda; `na=False` makes a missing
# browser value count as "not in-app" instead of raising TypeError on `'.' in nan`.
browser_names = df_features['device_browser']
df_features['in_app_browser'] = (
    browser_names.eq('safari (in-app)')
    | browser_names.str.contains('.', regex=False, na=False)
).astype('int64')

In [None]:
# Browsers counted as "top"; the flag is 1 for an exact match and 0 otherwise.
top_browsers = ['chrome', 'safari', 'firefox']
df_features['is_top_browser'] = (
    df_features['device_browser'].isin(top_browsers).astype('int64')
)

In [None]:
# Device brands counted as "popular"; the flag is 1 for an exact match and 0 otherwise.
popular_brands = ['samsung', 'apple', 'xiaomi', 'huawei']
df_features['is_popular_brand'] = (
    df_features['device_brand'].isin(popular_brands).astype('int64')
)

In [None]:
# Split '<width>x<height>' once into two columns, store both as integers,
# then discard the original text column.
resolution_parts = df_features['device_screen_resolution'].str.split('x', expand=True)
df_features['screen_width'] = resolution_parts[0].astype('int')
df_features['screen_height'] = resolution_parts[1].astype('int')
df_features = df_features.drop(columns=['device_screen_resolution'])

In [None]:
# Preview the frame after the flag columns and the screen width/height columns were added.
df_features.head()

Categorical data transformation

In [None]:
# Collect every object-dtype column and report how many distinct values each one holds.
categorical_features = df_features.select_dtypes(include=['object']).columns
distinct_counts = df_features[categorical_features].nunique()
for feature, n_distinct in distinct_counts.items():
    print(f'Column {feature} have {n_distinct} unique values')

One-hot encoding every categorical feature would greatly increase the dimensionality of the dataset and hurt model performance, so features with high cardinality are target-encoded instead.

In [None]:
# Partition the categorical columns by number of distinct values:
# more than 400 goes to the high-cardinality list, everything else to the low/medium list.
distinct_per_feature = {
    name: df_features[name].nunique() for name in categorical_features
}
high_cardinality_features = [
    name for name in categorical_features if distinct_per_feature[name] > 400
]
low_cardinality_features = [
    name for name in categorical_features if distinct_per_feature[name] <= 400
]

print(f'High cardinality features are: {high_cardinality_features}')
print(f'Low/Medium cardinality features are: {low_cardinality_features}')

In [None]:
# Columns passed to the one-hot-encoding helper, in the order they are processed.
# One loop replaces eight copy-pasted cells that differed only in the column name.
ohe_columns = [
    'utm_source',
    'utm_medium',
    'utm_adcontent',
    'device_category',
    'device_os',
    'device_brand',
    'device_browser',
    'geo_country',
]

for ohe_column in ohe_columns:
    # The helper returns the transformed frame, which is carried into the next iteration.
    df_features = prep_utils.categorical_feature_ohe(df=df_features, column=ohe_column)

In [None]:
# Preview the frame after the one-hot-encoding helper calls.
df_features.head()

In [None]:
# Run the target-encoding helper on every column classified as high-cardinality,
# carrying the returned frame forward between iterations.
# NOTE(review): the target column used by the helper is not visible here — confirm in auxiliary_functions.
for feature in high_cardinality_features:
    df_features = prep_utils.categorical_feature_te(df=df_features, column=feature)

In [None]:
# Preview the frame after the high-cardinality columns went through the target-encoding helper.
df_features.head()

Numerical data transformation

In [None]:
# Numeric columns handed to the standardisation helper.
numeric_features = ['screen_width', 'screen_height']

# The one-hot and target-encoding helper calls in this notebook reassign the
# returned frame; this call previously discarded it. If the helper returns a new
# frame, the scaled columns would be lost. Adopt the result only when it is a
# DataFrame, so a helper that mutates in place (returning None) still works.
# NOTE(review): confirm the helper's return contract in auxiliary_functions.
for feature in numeric_features:
    standardised = prep_utils.numerical_feature_std(df=df_features, column=feature)
    if isinstance(standardised, pd.DataFrame):
        df_features = standardised

In [None]:
# Preview the frame after the standardisation helper ran, before the raw numeric columns are dropped.
df_features.head()

In [None]:
# Remove the raw numeric columns listed in `numeric_features`.
# `columns=` already selects the axis, so no separate `axis` argument is needed.
df_features = df_features.drop(columns=numeric_features)

In [None]:
# Persist the engineered feature table next to the input file.
# The input is read from '../data/clean_data/', so the output uses the same
# parent-relative folder. The previous 'data/clean_data/' path resolved inside the
# notebook's own directory, and to_csv fails if that folder does not exist.
df_features.to_csv('../data/clean_data/df_sessions_w_feats.csv', index=False)