In [None]:
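# Optional (development only): auto-reload imported modules so that edits to
# local modules are picked up without restarting the kernel.
# Uncomment the two magics below to enable it.
# %load_ext autoreload
# %autoreload 2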

In [None]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
from matplotlib.ticker import FormatStrFormatter
from auxiliary_functions import PreprocessingUtils

In [None]:
# Instantiate the project's preprocessing helper once; later cells call its
# reporting methods (print_basic_stats, check_duplicates,
# missing_values_percentage) on the loaded frames.
prep_utils = PreprocessingUtils()

In [None]:
# Load the sessions CSV. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, which avoids mixed-type inference within a column.
ga_sessions_path = (
    'data/skillbox_diploma_main_dataset_sberautopodpiska/ga_sessions.csv'
)
sessions_df = pd.read_csv(filepath_or_buffer=ga_sessions_path, low_memory=False)
# Preview the first five rows (the default for head()).
sessions_df.head(n=5)

In [None]:
# Load the hits CSV. low_memory=False makes pandas parse the file in one pass
# instead of in chunks, which avoids mixed-type inference within a column.
ga_hits_path = (
    'data/ga_hits-002.csv'
)
hits_df = pd.read_csv(filepath_or_buffer=ga_hits_path, low_memory=False)
# Preview the first five rows (the default for head()).
hits_df.head(n=5)

## Data preprocessing

In [None]:
# Report basic stats for the sessions frame via the project helper.
# NOTE(review): level='shape' presumably limits the report to the frame's
# dimensions — confirm in auxiliary_functions.
prep_utils.print_basic_stats(sessions_df, level='shape')

In [None]:
# Report basic stats for the hits frame via the project helper.
# NOTE(review): level='shape' presumably limits the report to the frame's
# dimensions — confirm in auxiliary_functions.
prep_utils.print_basic_stats(hits_df, level='shape')

In [None]:
# Summary statistics for every sessions column; include='all' makes describe()
# cover non-numeric columns as well as numeric ones.
sessions_df.describe(include='all')

In [None]:
# Summary statistics for every hits column; include='all' makes describe()
# cover non-numeric columns as well as numeric ones.
hits_df.describe(include='all')

In [None]:
# Run the project's duplicate check on both raw frames.
# NOTE(review): what check_duplicates prints or returns is defined in
# auxiliary_functions and is not visible here; only the second call's return
# value (if any) would be displayed as the cell output.
prep_utils.check_duplicates(sessions_df)
prep_utils.check_duplicates(hits_df)

In [None]:
# Nullity matrix for the sessions frame: shows where values are missing.
# The trailing semicolon suppresses the repr of the returned object.
msno.matrix(sessions_df);

In [None]:
# Nullity matrix for the hits frame: shows where values are missing.
# The trailing semicolon suppresses the repr of the returned object.
msno.matrix(hits_df);

### Handling missing values

#### Sessions dataframe

In [None]:
# Work on an independent deep copy so the raw `sessions_df` stays untouched
# while missing values are handled in the cells below.
sessions_df_clean = sessions_df.copy(deep=True)

In [None]:
# Preview the first five rows of the working copy.
sessions_df_clean.head(n=5)

In [None]:
# Baseline missing-value report before any cleaning.
# NOTE(review): presumably per-column percentages, judging by the helper's
# name — confirm in auxiliary_functions.
prep_utils.missing_values_percentage(sessions_df_clean)

In [None]:
# Rows where device_model is actually populated.
sessions_df_clean.loc[sessions_df_clean['device_model'].notna()]

In [None]:
# Frequency of each non-null device_model value.
sessions_df_clean['device_model'].value_counts()

In [None]:
# Drop device_model from the working copy.
# `columns=` already selects the axis, so the redundant `axis=1` is removed.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the column is already gone.
sessions_df_clean = sessions_df_clean.drop(columns=['device_model'], errors='ignore')
# Re-check the missing-value report after the drop.
prep_utils.missing_values_percentage(sessions_df_clean)

In [None]:
# Inspect the first 20 sessions that have no utm_source.
sessions_df_clean.loc[sessions_df_clean['utm_source'].isna()].head(n=20)

In [None]:
# Rows with no utm_source whose visit_number equals 1.
sessions_df_clean.loc[
    sessions_df_clean['utm_source'].isna()
    & sessions_df_clean['visit_number'].eq(1)
]

In [None]:
# For comparison: the first 10 sessions that do have a utm_source.
sessions_df_clean.loc[sessions_df_clean['utm_source'].notna()].head(n=10)

In [None]:
# Replace missing utm_source values with the '(not set)' placeholder, then
# re-check the missing-value report.
sessions_df_clean['utm_source'] = sessions_df_clean['utm_source'].fillna(value='(not set)')
prep_utils.missing_values_percentage(sessions_df_clean)

In [None]:
# Inspect the first five sessions whose device_os is missing.
sessions_df_clean.loc[sessions_df_clean['device_os'].isna()].head(n=5)

In [None]:
# Frequency of each non-null device_os value.
sessions_df_clean['device_os'].value_counts()

In [None]:
# Count device brands among the rows whose device_os is missing, collected as
# a {brand: count} dict; these counts guide the brand-based imputation below.
brands_w_missing_os = (
    sessions_df_clean
    .loc[sessions_df_clean['device_os'].isna(), 'device_brand']
    .value_counts()
    .to_dict()
)
brands_w_missing_os

In [None]:
# Impute 'iOS' for Apple-branded rows whose device_os is missing.
# NOTE(review): assumes every Apple device without a recorded OS runs iOS —
# confirm this holds for non-mobile Apple devices.
sessions_df_clean.loc[
    sessions_df_clean['device_os'].isna()
    & sessions_df_clean['device_brand'].eq('Apple'),
    'device_os'
] = 'iOS'

In [None]:
# Brands treated as Android vendors for the purpose of OS imputation.
android_based = ['Samsung', 'Xiaomi', 'Huawei', 'Realme']
# Impute 'Android' for rows of those brands whose device_os is missing.
sessions_df_clean.loc[
    sessions_df_clean['device_os'].isna()
    & sessions_df_clean['device_brand'].isin(android_based),
    'device_os'
] = 'Android'

In [None]:
# Any device_os still missing after the brand-based imputation gets the
# '(not set)' placeholder.
sessions_df_clean['device_os'] = sessions_df_clean['device_os'].fillna(value='(not set)')

In [None]:
# Re-run the missing-value report after device_os has been filled.
# NOTE(review): the report's exact format is defined in auxiliary_functions.
prep_utils.missing_values_percentage(sessions_df_clean)

In [None]:
# Inspect the first 20 sessions whose utm_keyword is missing.
sessions_df_clean.loc[sessions_df_clean['utm_keyword'].isna()].head(n=20)

In [None]:
# For comparison: the first 20 sessions that do have a utm_keyword.
sessions_df_clean.loc[sessions_df_clean['utm_keyword'].notna()].head(n=20)

In [None]:
# Frequency of every utm_keyword value as a {value: count} dict;
# dropna=False keeps the count of missing values in the result.
utm_keyword_values = (
    sessions_df_clean['utm_keyword']
    .value_counts(dropna=False)
    .to_dict()
)
utm_keyword_values

In [None]:
# Replace missing utm_keyword values with the '(not set)' placeholder, then
# re-check the missing-value report.
sessions_df_clean['utm_keyword'] = sessions_df_clean['utm_keyword'].fillna(value='(not set)')
prep_utils.missing_values_percentage(sessions_df_clean)

In [None]:
# Inspect the first 20 sessions whose utm_adcontent is missing.
sessions_df_clean.loc[sessions_df_clean['utm_adcontent'].isna()].head(n=20)

In [None]:
# Replace missing utm_adcontent and utm_campaign values with the '(not set)'
# placeholder, then re-check the missing-value report.
sessions_df_clean['utm_adcontent'] = sessions_df_clean['utm_adcontent'].fillna(value='(not set)')
sessions_df_clean['utm_campaign'] = sessions_df_clean['utm_campaign'].fillna(value='(not set)')
prep_utils.missing_values_percentage(sessions_df_clean)

In [None]:
# Inspect the first 20 sessions whose device_brand is missing.
sessions_df_clean.loc[sessions_df_clean['device_brand'].isna()].head(n=20)

In [None]:
# Replace missing device_brand values with the '(not set)' placeholder, then
# re-check the missing-value report.
sessions_df_clean['device_brand'] = sessions_df_clean['device_brand'].fillna(value='(not set)')
prep_utils.missing_values_percentage(sessions_df_clean)