In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

In [2]:
df = pd.merge(
        pd.read_csv('../data/ga_sessions.csv', dtype={'client_id': str}),
        pd.read_csv('../data/ga_hits.csv'),
        on='session_id'
    ).set_index('client_id')
df.head()

Unnamed: 0_level_0,session_id,visit_date,visit_time,visit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,...,hit_date,hit_time,hit_number,hit_type,hit_referer,hit_page_path,event_category,event_action,event_label,event_value
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2108382700.1637757,9055434745589932991.1637753792.1637753792,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,...,2021-11-24,3665.0,3,event,,podpiska.sberauto.com/,sub_page_view,sub_landing,,
2108382700.1637757,9055434745589932991.1637753792.1637753792,2021-11-24,14:36:32,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,...,2021-11-24,46592.0,4,event,,podpiska.sberauto.com/,sub_button_click,sub_view_cars_click,vodKSlUobUWTVlgsJqdI,
210838531.16368672,905544597018549464.1636867290.1636867290,2021-11-14,08:21:30,1,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,...,2021-11-14,921.0,3,event,,podpiska.sberauto.com/,sub_page_view,sub_landing,,
2108385331.164065,9055446045651783499.1640648526.1640648526,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,...,2021-12-28,84155.0,10,event,,sberauto.com/cars?utm_source_initial=sbol&utm_...,search_form,search_form_region,KWTCzSIXzoqUWjfUQMgP,
2108385331.164065,9055446045651783499.1640648526.1640648526,2021-12-28,02:42:06,1,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,...,2021-12-28,102979.0,13,event,,sberauto.com/cars?utm_source_initial=sbol&utm_...,search_form,search_form_region,CBqnBQaKoQUyWJhLcxxN,


# Data Cleaning

In [3]:
actions = {'sub_car_claim_click', 'sub_car_claim_submit_click', 'sub_open_dialog_click',
               'sub_custom_question_submit_click', 'sub_call_number_click', 'sub_callback_submit_click',
               'sub_submit_success', 'sub_car_request_submit_click'}
df = pd.concat([
    df.hit_number,
    df.filter(regex='^(utm|device|geo)_.+'),
    df.event_action.apply(lambda event_action: int(event_action in actions)).rename('target_action')
], axis=1)

print(df.shape)
df.head()

(15685219, 15)


Unnamed: 0_level_0,hit_number,utm_source,utm_medium,utm_campaign,utm_adcontent,utm_keyword,device_category,device_os,device_brand,device_model,device_screen_resolution,device_browser,geo_country,geo_city,target_action
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2108382700.1637757,3,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Zlatoust,0
2108382700.1637757,4,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Zlatoust,0
210838531.16368672,3,MvfHsxITijuriZxsqZqt,cpm,FTjNLDyTrXaWYgZymFkV,xhoenQgDQsgfEPYNPwKO,IGUCNvHlhfHpROGclCit,mobile,Android,Samsung,,385x854,Samsung Internet,Russia,Moscow,0
2108385331.164065,10,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Krasnoyarsk,0
2108385331.164065,13,ZpYIoDJMcFzVoPFsHGJL,banner,LEoPHuyFvzoNfnzGgfcd,vCIpmpaGBnIQhyYNkXqp,puhZPIYqKXeFPaUviSjo,mobile,Android,Huawei,,360x720,Chrome,Russia,Krasnoyarsk,0


In [4]:
def outliers_upper_boundary(s: pd.Series) -> float:
    q25 = s.quantile(0.25)
    q75 = s.quantile(0.75)
    iqr = q75 - q25

    return q75 + 1.5 * iqr

In [5]:
boundary = outliers_upper_boundary(df.hit_number)
df = df.drop(index=df.hit_number.loc[lambda x: x > boundary].index).drop(columns='hit_number')
df.shape

(11896245, 14)

In [6]:
df.isna().sum()

utm_source                       307
utm_medium                         0
utm_campaign                 1545705
utm_adcontent                2080144
utm_keyword                  6875486
device_category                    0
device_os                    6834051
device_brand                 2543385
device_model                11796973
device_screen_resolution           0
device_browser                     0
geo_country                        0
geo_city                           0
target_action                      0
dtype: int64

In [7]:
for column in df:
    mode = df[column].mode()[0]
    df[column] = df[column].fillna(mode)
df.isna().sum()

utm_source                  0
utm_medium                  0
utm_campaign                0
utm_adcontent               0
utm_keyword                 0
device_category             0
device_os                   0
device_brand                0
device_model                0
device_screen_resolution    0
device_browser              0
geo_country                 0
geo_city                    0
target_action               0
dtype: int64

In [8]:
df.drop_duplicates(inplace=True)
df.shape

(379513, 14)

# Feature Engineering

In [9]:
x = df.drop(columns='target_action')
y = df.target_action

ohe = OneHotEncoder()
ohe.fit(x)
x = ohe.transform(x)
x.shape

(379513, 9894)

# Modeling

In [10]:
models = [
    LogisticRegression(random_state=42, solver='liblinear'),
    RandomForestClassifier(random_state=42, min_samples_leaf=10, max_depth=2000),
    MLPClassifier(random_state=42, early_stopping=True, batch_size=1000)
]

In [11]:
best_model = None
best_accuracy = 0
best_roc_auc = 0

for model in models:
    name = type(model).__name__
    accuracy = cross_val_score(model, x, y, cv=5)
    model.fit(x, y)
    roc_auc = roc_auc_score(y, model.predict_proba(x)[:, 1])

    print(f'model: {name}, accuracy: {accuracy}, roc_auc: {roc_auc}')
    if roc_auc > best_roc_auc:
        best_model = name
        best_accuracy = accuracy.mean()
        best_roc_auc = roc_auc

f'Best model: {best_model}, accuracy: {best_accuracy}, roc_auc: {best_roc_auc}'

model: LogisticRegression, accuracy: [0.94817069 0.94950134 0.94951451 0.9495402  0.94952702], roc_auc: 0.7977705774288784
model: RandomForestClassifier, accuracy: [0.94954086 0.94954086 0.94954086 0.94955337 0.94955337], roc_auc: 0.752526924087954
model: MLPClassifier, accuracy: [0.94690592 0.94954086 0.94954086 0.94955337 0.94955337], roc_auc: 0.7650348383808375


'Best model: LogisticRegression, accuracy: 0.9492507517943117, roc_auc: 0.7977705774288784'