In [1]:
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.merge(
        pd.read_csv('../data/ga_sessions.csv', dtype={'client_id': str}),
        pd.read_csv('../data/ga_hits.csv'),
        on='session_id'
    )
df.head()

Unnamed: 0,session_id,client_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,...,hit_date,hit_time,hit_number,hit_type,hit_referer,hit_page_path,event_category,event_action,event_label,event_value
0,9055434745589932991.1637753792.1637753792,2108382700.1637757,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,...,2021-11-24,3665.0,3,event,,podpiska.sberauto.com/,sub_page_view,sub_landing,,
1,9055434745589932991.1637753792.1637753792,2108382700.1637757,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,...,2021-11-24,46592.0,4,event,,podpiska.sberauto.com/,sub_button_click,sub_view_cars_click,vodKSlUobUWTVlgsJqdI,
2,905544597018549464.1636867290.1636867290,210838531.16368672,2021-11-14,08:21:30,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,...,2021-11-14,921.0,3,event,,podpiska.sberauto.com/,sub_page_view,sub_landing,,
3,9055446045651783499.1640648526.1640648526,2108385331.164065,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,...,2021-12-28,84155.0,10,event,,sberauto.com/cars?utm_source_initial=sbol&utm_...,search_form,search_form_region,KWTCzSIXzoqUWjfUQMgP,
4,9055446045651783499.1640648526.1640648526,2108385331.164065,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,...,2021-12-28,102979.0,13,event,,sberauto.com/cars?utm_source_initial=sbol&utm_...,search_form,search_form_region,CBqnBQaKoQUyWJhLcxxN,


# Data Cleaning

In [3]:
actions = {'sub_car_claim_click', 'sub_car_claim_submit_click', 'sub_open_dialog_click',
               'sub_custom_question_submit_click', 'sub_call_number_click', 'sub_callback_submit_click',
               'sub_submit_success', 'sub_car_request_submit_click'}

df_prepared = df.filter(regex='^(utm|device|geo)_.+').copy()
df_prepared['target_action'] = df.event_action.apply(lambda event_action: int(event_action in actions))

print(df_prepared.shape)
df_prepared.head()

(15685219, 14)


Unnamed: 0,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
0,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Zlatoust,0
1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Zlatoust,0
2,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,Android,Samsung,,385x854,Samsung Internet,Russia,Moscow,0
3,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Krasnoyarsk,0
4,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Krasnoyarsk,0


In [4]:
df_prepared.drop_duplicates(inplace=True)
df_prepared.shape

(418657, 14)

In [5]:
df_prepared.isna().sum()

utm_source                      41
utm_medium                       0
utm_campaign                 45310
utm_adcontent                80028
utm_keyword                 214089
device_category                  0
device_os                   211680
device_brand                 91397
device_model                408998
device_screen_resolution         0
device_browser                   0
geo_country                      0
geo_city                         0
target_action                    0
dtype: int64

In [6]:
for column in df_prepared:
    mode = df_prepared[column].mode()[0]
    df_prepared[column] = df_prepared[column].fillna(mode)
df_prepared.isna().sum()

utm_source                  0
utm_medium                  0
utm_campaign                0
utm_adcontent               0
utm_keyword                 0
device_category             0
device_os                   0
device_brand                0
device_model                0
device_screen_resolution    0
device_browser              0
geo_country                 0
geo_city                    0
target_action               0
dtype: int64

# Feature Engineering

In [7]:
target_action = df_prepared.target_action
df_prepared.drop(columns='target_action', inplace=True)

ohe = OneHotEncoder(sparse_output=False)
ohe.fit(df_prepared)

df_prepared = pd.DataFrame(ohe.transform(df_prepared), columns=ohe.get_feature_names_out(), index=df_prepared.index)
df_prepared['target_action'] = target_action
df_prepared.head()

Unnamed: 0,utm_source_AHgbtEjTEsiUMJouiDYS,utm_source_ANoZJgYuPrWNkAAchryx,utm_source_ArbfvYgWhqxkzywKqpQf,utm_source_AuJjYKxJakEqFnCmyFtz,utm_source_BAZCuyHZnaPrMGOMrcCQ,utm_source_BHcvLfOaCWvWTykYqHVe,utm_source_BKeImrJuRDZcHiSSTdzm,utm_source_BellrslNBZQZaIxVFGXJ,utm_source_BmzdZLeVUBKtYeegQdDw,utm_source_BqQyRtXZyotBYQPhnHTC,...,geo_city_Zuhres,geo_city_Zurich,geo_city_Zvenigorod,geo_city_Zvenyhorodka,geo_city_Zwickau,geo_city_Zwolle,geo_city_Тарасовка,geo_city_Тимофеевка,geo_city_Хомутово,target_action
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


# Modeling

In [8]:
x = df_prepared.drop(columns='target_action')
y = df_prepared.target_action
lr = LogisticRegression(random_state=42, solver='liblinear').fit(x, y)

print(cross_val_score(lr, x, y, cv=5))
print(roc_auc_score(y, lr.predict_proba(x)[:, 1]))

[0.94052453 0.94156356 0.94161063 0.94157481 0.94159869]
0.78728858297155
