In [44]:
import gc
import sys
import os
import warnings

# Put the parent directory on the import path (presumably where the project
# packages `modules` and `fe_modules` imported further down live).
sys.path.append(os.path.abspath("../"))

# Restrict OpenBLAS to a single thread; set before the numeric libraries are imported.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# NOTE(review): this hides every warning for the whole session, deprecations included.
warnings.filterwarnings('ignore')

In [45]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
from tqdm.auto import tqdm
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

In [46]:
import importlib

import modules
import fe_modules

# Re-execute the two project packages so that edits to them can be picked up
# without restarting the kernel.
# NOTE(review): importlib.reload() re-runs only the module object it is given;
# submodules that are already imported (e.g. modules.memory_utils) are presumably
# not refreshed by these two calls -- confirm, and reload them explicitly if needed.
importlib.reload(modules)
importlib.reload(fe_modules)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat
from modules.sanity_checks import null_check
from fe_modules.text_manipulation import get_domain
from fe_modules.datetime_features import get_timestamp, \
                                         get_relative_time, \
                                         part_of_day_to_hour, \
                                         add_hour_to_date, \
                                         get_year, \
                                         get_month, \
                                         get_day, \
                                         get_day_of_year, \
                                         get_day_of_week, \
                                         get_holiday_name
from fe_modules.preprocessing import clean_os_type
from fe_modules.missing import map_prices
from fe_modules.geo_features import map_cities, dist_to_large_cities

In [47]:
def my_reset(*varnames):
    """
    Clear the IPython namespace while keeping a chosen set of globals.

    The requested globals are copied aside, the ``%reset`` line magic is run,
    and the saved objects are written back into ``globals()``.

    Parameters
    ----------
    *varnames : str
        Names of the global variables to keep. ``my_reset`` itself is always
        kept, whether or not it is listed.

    Raises
    ------
    KeyError
        If one of ``varnames`` is not a global. This happens before the reset
        is triggered, so nothing is cleared in that case.

    Notes
    -----
    ``%reset`` is invoked without ``-f``, so IPython asks for confirmation.
    Everything that is not listed (imports, constants, helper functions) is
    removed, so the cells defining them have to be re-run afterwards.
    """
    namespace = globals()
    to_save = {name: namespace[name] for name in varnames}
    to_save['my_reset'] = my_reset  # always keep this function
    del namespace
    # `InteractiveShell.magic` is deprecated; `run_line_magic` is the supported
    # API for invoking a line magic from code.
    get_ipython().run_line_magic("reset", "")
    globals().update(to_save)

In [53]:
# Root directory of the input data (presumably; confirm against the loading cell).
DATA_PATH = '../data/'
# Root directory for the intermediate `stages/*.parquet.gzip` checkpoints.
SEQ2SEQ_DATA_PATH = '../seq2seq_data/'

# Presumably the random seed for a train/test split; unused in the visible cells.
SPLIT_SEED = 42

# Load data

In [6]:
# Load the raw competition data (a parquet directory) and downcast it right away.
# The path used to be built from LOCAL_DATA_PATH, a name that is never defined in
# this notebook (NameError on a fresh kernel); DATA_PATH from the config cell is
# used instead.
# NOTE(review): confirm that DATA_PATH is where competition_data_final_pqt lives.
df = pandas_reduce_mem_usage(
    pd.read_parquet(os.path.join(DATA_PATH, 'competition_data_final_pqt/'))
)
df.head()

Memory usage of dataframe is 29562.33 MB


  0%|          | 0/12 [00:00<?, ?it/s]

Memory usage after optimization is: 24943.21 MB
Decreased by 15.6%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098


# Feature engineering

## Cleaning

In [7]:
# Apply the project's OS-type cleaning helper (presumably normalises
# `cpe_model_os_type`; defined in fe_modules.preprocessing).
df = df.pipe(clean_os_type)
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098


## Fill missing

In [8]:
# Run the price-mapping helper (presumably fills missing `price` values), then
# downcast dtypes.
df = df.pipe(map_prices).pipe(pandas_reduce_mem_usage)
df.head()

Memory usage of dataframe is 27406.74 MB


  0%|          | 0/12 [00:00<?, ?it/s]

Memory usage after optimization is: 27406.74 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098


## Feature generation

### Time Features

In [10]:
# Add the calendar columns (`year`, `month`, `day` show up in the preview) and
# downcast. `df` is rebound after every step so the previous frame can be freed.
df = df.pipe(get_year)
df = df.pipe(get_month)
df = df.pipe(get_day)
df = df.pipe(pandas_reduce_mem_usage)
df.head()

Memory usage of dataframe is 34797.32 MB


  0%|          | 0/15 [00:00<?, ?it/s]

Memory usage after optimization is: 28638.50 MB
Decreased by 17.7%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,year,month,day
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,2022,6,15
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,2022,6,19
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,2022,6,12
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,2022,5,16
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,2022,5,30


In [11]:
# Reset the IPython namespace, keeping only `df` (my_reset always keeps itself).
# Imports and constants are wiped as well, so their cells must be re-run afterwards.
my_reset("df")

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [16]:
# Replace the listed columns with integer category codes (see the preview below).
df = df.pipe(
    pandas_string_to_cat,
    [
        "region_name",
        "year",
        "cpe_manufacturer_name",
        "cpe_type_cd",
        "cpe_model_os_type",
    ],
)
df.head()

Memory usage of dataframe is 28638.50 MB
Memory usage of dataframe is 27.97 GB


  0%|          | 0/5 [00:00<?, ?it/s]

Memory usage after optimization is: 19708.22 MB
Decreased by 31.2%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,year,month,day
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,1,45098,1,6,15
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2022-06-19,morning,1,45098,1,6,19
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,1,45098,1,6,12
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,1,45098,1,5,16
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,1,45098,1,5,30


In [18]:
# Add `day_of_year` and `day_of_week` (both visible in the preview), then downcast.
df = df.pipe(get_day_of_year)
df = df.pipe(get_day_of_week)
df = df.pipe(pandas_reduce_mem_usage)
df.head()

Memory usage of dataframe is 24635.27 MB


  0%|          | 0/17 [00:00<?, ?it/s]

Memory usage after optimization is: 20632.04 MB
Decreased by 16.2%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,year,month,day,day_of_year,day_of_week
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,1,45098,1,6,15,166,2
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2022-06-19,morning,1,45098,1,6,19,170,6
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,1,45098,1,6,12,163,6
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,1,45098,1,5,16,136,0
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,1,45098,1,5,30,150,0


In [19]:
# Attach the `holiday` column (holiday name per row, as shown in the preview),
# then downcast.
df = df.pipe(get_holiday_name)
df = df.pipe(pandas_reduce_mem_usage)
df.head()

Memory usage of dataframe is 23095.57 MB


  0%|          | 0/18 [00:00<?, ?it/s]

Memory usage after optimization is: 23095.57 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,year,month,day,day_of_year,day_of_week,holiday
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,1,45098,1,6,15,166,2,Не праздник
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2022-06-19,morning,1,45098,1,6,19,170,6,Не праздник
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,1,45098,1,6,12,163,6,День России
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,1,45098,1,5,16,136,0,Не праздник
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,1,45098,1,5,30,150,0,Не праздник


In [21]:
# Clear everything from the interactive namespace except `df` and my_reset itself;
# the import and config cells have to be executed again after this.
my_reset("df")

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [27]:
# Turn the holiday names into integer category codes.
df = df.pipe(pandas_string_to_cat, ["holiday"])

df.head()

Memory usage of dataframe is 23095.57 MB
Memory usage of dataframe is 22.55 GB


  0%|          | 0/1 [00:00<?, ?it/s]

Memory usage after optimization is: 20939.98 MB
Decreased by 9.3%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,year,month,day,day_of_year,day_of_week,holiday
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,1,45098,1,6,15,166,2,5
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2022-06-19,morning,1,45098,1,6,19,170,6,5
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,1,45098,1,6,12,163,6,1
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,1,45098,1,5,16,136,0,5
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,1,45098,1,5,30,150,0,5


In [30]:
# Checkpoint: write the frame to the stage-1 file as gzip-compressed parquet.
df.to_parquet(f"{SEQ2SEQ_DATA_PATH}/stages/stage_1.parquet.gzip", compression='gzip')

In [6]:
# Resume from the stage-1 checkpoint and run the dtype downcasting on it again.
df = pd.read_parquet(f"{SEQ2SEQ_DATA_PATH}/stages/stage_1.parquet.gzip").pipe(
    pandas_reduce_mem_usage
)
df.head()

Memory usage of dataframe is 20939.98 MB


  0%|          | 0/18 [00:00<?, ?it/s]

Memory usage after optimization is: 20939.98 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,year,month,day,day_of_year,day_of_week,holiday
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,1,45098,1,6,15,166,2,5
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2022-06-19,morning,1,45098,1,6,19,170,6,5
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,1,45098,1,6,12,163,6,1
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,1,45098,1,5,16,136,0,5
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,1,45098,1,5,30,150,0,5


In [7]:
# Map `part_of_day` to an `hour` offset column (timedeltas in the preview).
df = df.pipe(part_of_day_to_hour)
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,year,month,day,day_of_year,day_of_week,holiday,hour
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,1,45098,1,6,15,166,2,5,0 days 09:00:00
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2022-06-19,morning,1,45098,1,6,19,170,6,5,0 days 09:00:00
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,1,45098,1,6,12,163,6,1,0 days 15:00:00
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,1,45098,1,5,16,136,0,5,0 days 15:00:00
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,1,45098,1,5,30,150,0,5,0 days 15:00:00


In [8]:
# Build the `datetime` column (date plus hour offset, per the preview).
df = df.pipe(add_hour_to_date)
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,year,month,day,day_of_year,day_of_week,holiday,hour,datetime
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,1,45098,1,6,15,166,2,5,0 days 09:00:00,2022-06-15 09:00:00
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2022-06-19,morning,1,45098,1,6,19,170,6,5,0 days 09:00:00,2022-06-19 09:00:00
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,1,45098,1,6,12,163,6,1,0 days 15:00:00,2022-06-12 15:00:00
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,1,45098,1,5,16,136,0,5,0 days 15:00:00,2022-05-16 15:00:00
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,1,45098,1,5,30,150,0,5,0 days 15:00:00,2022-05-30 15:00:00


In [9]:
# `date` and `hour` are already combined in `datetime`, so remove them and ask
# the garbage collector to reclaim the memory.
df = df.drop(columns=["date", "hour"])
gc.collect()
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,year,month,day,day_of_year,day_of_week,holiday,datetime
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,morning,1,45098,1,6,15,166,2,5,2022-06-15 09:00:00
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,morning,1,45098,1,6,19,170,6,5,2022-06-19 09:00:00
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,day,1,45098,1,6,12,163,6,1,2022-06-12 15:00:00
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,day,1,45098,1,5,16,136,0,5,2022-05-16 15:00:00
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,day,1,45098,1,5,30,150,0,5,2022-05-30 15:00:00


In [10]:
# Drop every global except `df` (and my_reset) via IPython's %reset; imported
# helpers disappear too and must be re-imported before the next cell runs.
my_reset("df")

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [16]:
# Add the numeric `timestamp` column and downcast only that column.
# NOTE(review): the preview shows `timestamp` as rounded float values after the
# downcast -- check that the remaining precision is sufficient downstream.
df = df.pipe(get_timestamp).pipe(pandas_reduce_mem_usage, columns=["timestamp"])
df.head()

Memory usage of dataframe is 23403.51 MB


  0%|          | 0/1 [00:00<?, ?it/s]

Memory usage after optimization is: 22171.74 MB
Decreased by 5.3%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,year,month,day,day_of_year,day_of_week,holiday,datetime,timestamp
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,morning,1,45098,1,6,15,166,2,5,2022-06-15 09:00:00,165528352.0
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,morning,1,45098,1,6,19,170,6,5,2022-06-19 09:00:00,165562912.0
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,day,1,45098,1,6,12,163,6,1,2022-06-12 15:00:00,165504608.0
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,day,1,45098,1,5,16,136,0,5,2022-05-16 15:00:00,165271328.0
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,day,1,45098,1,5,30,150,0,5,2022-05-30 15:00:00,165392288.0


In [17]:
# Add `relative_timestamp`, discard the now-unneeded `datetime` column and
# downcast only the new column.
df = (
    df.pipe(get_relative_time)
    .drop(columns=["datetime"])
    .pipe(pandas_reduce_mem_usage, columns=["relative_timestamp"])
)
df.head()

Memory usage of dataframe is 4.75 MB


  0%|          | 0/1 [00:00<?, ?it/s]

Memory usage after optimization is: 4.75 MB
Decreased by 0.0%
Memory usage of dataframe is 23403.51 MB


  0%|          | 0/1 [00:00<?, ?it/s]

Memory usage after optimization is: 23403.51 MB
Decreased by 0.0%
Memory usage of dataframe is 20939.98 MB


  0%|          | 0/1 [00:00<?, ?it/s]

Memory usage after optimization is: 20939.98 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,year,month,day,day_of_year,day_of_week,holiday,timestamp,relative_timestamp
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,morning,1,45098,1,6,15,166,2,5,165528352.0,388.799988
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,morning,1,45098,1,6,19,170,6,5,165562912.0,43.200001
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,day,1,45098,1,6,12,163,6,1,165504608.0,626.23999
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,day,1,45098,1,5,16,136,0,5,165271328.0,2959.040039
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,day,1,45098,1,5,30,150,0,5,165392288.0,1749.439941


In [18]:
# Encode `part_of_day` as integer category codes.
df = df.pipe(pandas_string_to_cat, ["part_of_day"])
df.head()

Memory usage of dataframe is 20939.98 MB
Memory usage of dataframe is 20.45 GB


  0%|          | 0/1 [00:00<?, ?it/s]

Memory usage after optimization is: 18784.39 MB
Decreased by 10.3%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,year,month,day,day_of_year,day_of_week,holiday,timestamp,relative_timestamp
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2,1,45098,1,6,15,166,2,5,165528352.0,388.799988
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2,1,45098,1,6,19,170,6,5,165562912.0,43.200001
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,0,1,45098,1,6,12,163,6,1,165504608.0,626.23999
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,0,1,45098,1,5,16,136,0,5,165271328.0,2959.040039
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,0,1,45098,1,5,30,150,0,5,165392288.0,1749.439941


### Geo Features

In [19]:
# Attach per-city attributes; the preview shows `capital_marker`, `timezone`,
# `geo_lat`, `geo_lon` and `population` being added.
df = df.pipe(map_cities)
df.head()

Memory usage of dataframe is 0.05 MB


  0%|          | 0/6 [00:00<?, ?it/s]

Memory usage after optimization is: 0.03 MB
Decreased by 47.8%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,day_of_year,day_of_week,holiday,timestamp,relative_timestamp,capital_marker,timezone,geo_lat,geo_lon,population
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2,1,...,166,2,5,165528352.0,388.799988,2.0,3,45.040161,38.975964,744933
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2,1,...,170,6,5,165562912.0,43.200001,2.0,3,45.040161,38.975964,744933
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,0,1,...,163,6,1,165504608.0,626.23999,2.0,3,45.040161,38.975964,744933
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,0,1,...,136,0,5,165271328.0,2959.040039,2.0,3,45.040161,38.975964,744933
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,0,1,...,150,0,5,165392288.0,1749.439941,2.0,3,45.040161,38.975964,744933


In [20]:
# Wipe the interactive namespace but preserve `df` (my_reset keeps itself);
# re-run the import/config cells before continuing.
my_reset("df")

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [27]:
# Add the `dist_to_*` columns for the five large cities listed in the preview,
# then downcast.
df = df.pipe(dist_to_large_cities).pipe(pandas_reduce_mem_usage)
df.head()

  0%|          | 0/5 [00:00<?, ?it/s]

Memory usage of dataframe is 0.06 MB


  0%|          | 0/8 [00:00<?, ?it/s]

Memory usage after optimization is: 0.04 MB
Decreased by 31.2%
Memory usage of dataframe is 30737.14 MB


  0%|          | 0/29 [00:00<?, ?it/s]

Memory usage after optimization is: 30737.14 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,capital_marker,timezone,geo_lat,geo_lon,population,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2,1,...,2.0,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2,1,...,2.0,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,0,1,...,2.0,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,0,1,...,2.0,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,0,1,...,2.0,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391


In [28]:
# Encode the device model and city names as integer category codes.
df = df.pipe(pandas_string_to_cat, ["cpe_model_name", "city_name"])
df.head()

Memory usage of dataframe is 30737.14 MB
Memory usage of dataframe is 30.02 GB


  0%|          | 0/2 [00:00<?, ?it/s]

Memory usage after optimization is: 26973.41 MB
Decreased by 12.2%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,capital_marker,timezone,geo_lat,geo_lon,population,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok
0,21,409,1,589,ad.adriver.ru,2,1,20368.0,2,1,...,2.0,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391
1,21,409,1,589,apple.com,2,1,20368.0,2,1,...,2.0,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391
2,21,409,1,589,avatars.mds.yandex.net,2,1,20368.0,0,1,...,2.0,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391
3,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,0,1,...,2.0,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391
4,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,0,1,...,2.0,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391


### Text features

In [29]:
# Derive the `domain` column from `url_host` (the preview shows the last label,
# e.g. "ru", "com", "net").
df = df.pipe(get_domain)
df.head()

100%|██████████████████████████████| 199683/199683 [00:00<00:00, 1735959.88it/s]


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,timezone,geo_lat,geo_lon,population,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain
0,21,409,1,589,ad.adriver.ru,2,1,20368.0,2,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,ru
1,21,409,1,589,apple.com,2,1,20368.0,2,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,com
2,21,409,1,589,avatars.mds.yandex.net,2,1,20368.0,0,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,net
3,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,0,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,net
4,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,0,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,net


In [30]:
# Run %reset through my_reset, retaining just `df` and my_reset; all imports and
# constants are removed and need to be restored by re-running their cells.
my_reset("df")

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [37]:
# Encode `domain` as integer category codes.
df = df.pipe(pandas_string_to_cat, ["domain"])
df.head()

Memory usage of dataframe is 29482.56 MB
Memory usage of dataframe is 28.79 GB


  0%|          | 0/1 [00:00<?, ?it/s]

Memory usage after optimization is: 27600.70 MB
Decreased by 6.4%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,timezone,geo_lat,geo_lon,population,dist_to_Moscow,dist_to_SaintP,dist_to_Novosibirsk,dist_to_Ekaterinburg,dist_to_Vladivostok,domain
0,21,409,1,589,ad.adriver.ru,2,1,20368.0,2,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,761
1,21,409,1,589,apple.com,2,1,20368.0,2,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,549
2,21,409,1,589,avatars.mds.yandex.net,2,1,20368.0,0,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,712
3,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,0,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,712
4,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,0,1,...,3,45.040161,38.975964,744933,1195.817871,1755.62085,3275.137451,1992.558472,6999.525391,712


In [40]:
# Load the per-host zero-shot class scores, downcast them and drop the `tod` column.
zero_shot = pd.read_csv("../external_data/sites_zero_shot.csv", index_col=0)
zero_shot = pandas_reduce_mem_usage(zero_shot).drop(columns=["tod"])
# Cut the first seven characters off every host so it can match `df["url_host"]`
# (presumably an "http://" prefix -- TODO confirm against the CSV).
zero_shot["url_host"] = zero_shot["url_host"].map(lambda host: host[7:])
zero_shot.head()

Memory usage of dataframe is 1.31 MB


  0%|          | 0/17 [00:00<?, ?it/s]

Memory usage after optimization is: 0.76 MB
Decreased by 41.7%


Unnamed: 0,url_host,Сайты для женщин,Сайты для мужчин,Сайт для взрослых людей,Сайт для молодежи,Сайт для пенсионеров,Блог,Интернет-магазин,Интернет-форум,Информационный сайт,Корпоративный сайт,Поисковая система,Порно-сайт,Почтовый сервис,Сайт-сервис,Социальная сеть
0,googleads.g.doubleclick.net,0.466345,0.533655,0.541565,0.194651,0.263783,0.028243,0.056895,0.048744,0.062024,0.464972,0.193379,0.031416,0.018421,0.047987,0.04792
1,yandex.ru,0.493599,0.506401,0.397753,0.282165,0.320082,0.100165,0.069372,0.076529,0.082483,0.06275,0.270802,0.059558,0.068429,0.127557,0.082355
2,i.ytimg.com,0.50159,0.49841,0.49674,0.22133,0.28193,0.10359,0.081871,0.068195,0.093617,0.091981,0.209684,0.077605,0.076367,0.123756,0.073332
3,vk.com,0.432394,0.567605,0.753898,0.168512,0.07759,0.000687,0.000575,0.001223,0.000625,0.001397,0.001529,0.00036,0.00042,0.003874,0.989309
4,avatars.mds.yandex.net,0.359512,0.640488,0.434831,0.308187,0.256982,0.08717,0.082425,0.087657,0.096718,0.113347,0.138035,0.067645,0.076701,0.168411,0.081891


In [41]:
# Left-join the zero-shot scores onto the main frame by host; hosts without a
# score row end up with NaN in the score columns.
df = pd.merge(df, zero_shot, how="left", on="url_host")
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,Блог,Интернет-магазин,Интернет-форум,Информационный сайт,Корпоративный сайт,Поисковая система,Порно-сайт,Почтовый сервис,Сайт-сервис,Социальная сеть
0,21,409,1,589,ad.adriver.ru,2,1,20368.0,2,1,...,0.02206,0.010498,0.021562,0.127385,0.121726,0.079564,0.008757,0.008911,0.584294,0.015242
1,21,409,1,589,apple.com,2,1,20368.0,2,1,...,0.024201,0.734256,0.02749,0.03459,0.055687,0.025109,0.016231,0.016883,0.040856,0.024697
2,21,409,1,589,avatars.mds.yandex.net,2,1,20368.0,0,1,...,0.08717,0.082425,0.087657,0.096718,0.113347,0.138035,0.067645,0.076701,0.168411,0.081891
3,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,0,1,...,0.028243,0.056895,0.048744,0.062024,0.464972,0.193379,0.031416,0.018421,0.047987,0.04792
4,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,0,1,...,0.028243,0.056895,0.048744,0.062024,0.464972,0.193379,0.031416,0.018421,0.047987,0.04792


In [42]:
# Fraction of missing values in each column.
df.isna().sum() / len(df)

region_name                0.000000
city_name                  0.000000
cpe_manufacturer_name      0.000000
cpe_model_name             0.000000
url_host                   0.000000
cpe_type_cd                0.000000
cpe_model_os_type          0.000000
price                      0.000000
part_of_day                0.000000
request_cnt                0.000000
user_id                    0.000000
year                       0.000000
month                      0.000000
day                        0.000000
day_of_year                0.000000
day_of_week                0.000000
holiday                    0.000000
timestamp                  0.000000
relative_timestamp         0.000000
capital_marker             0.000000
timezone                   0.000000
geo_lat                    0.000000
geo_lon                    0.000000
population                 0.000000
dist_to_Moscow             0.000000
dist_to_SaintP             0.000000
dist_to_Novosibirsk        0.000000
dist_to_Ekaterinburg       0

In [49]:
# Replace every remaining NaN with the sentinel value -1.
df = df.fillna(value=-1)
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,Блог,Интернет-магазин,Интернет-форум,Информационный сайт,Корпоративный сайт,Поисковая система,Порно-сайт,Почтовый сервис,Сайт-сервис,Социальная сеть
0,21,409,1,589,ad.adriver.ru,2,1,20368.0,2,1,...,0.02206,0.010498,0.021562,0.127385,0.121726,0.079564,0.008757,0.008911,0.584294,0.015242
1,21,409,1,589,apple.com,2,1,20368.0,2,1,...,0.024201,0.734256,0.02749,0.03459,0.055687,0.025109,0.016231,0.016883,0.040856,0.024697
2,21,409,1,589,avatars.mds.yandex.net,2,1,20368.0,0,1,...,0.08717,0.082425,0.087657,0.096718,0.113347,0.138035,0.067645,0.076701,0.168411,0.081891
3,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,0,1,...,0.028243,0.056895,0.048744,0.062024,0.464972,0.193379,0.031416,0.018421,0.047987,0.04792
4,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,0,1,...,0.028243,0.056895,0.048744,0.062024,0.464972,0.193379,0.031416,0.018421,0.047987,0.04792


In [50]:
# Encode `url_host` as integer category codes. This runs after the merge above,
# which joins on the original host strings.
df = df.pipe(pandas_string_to_cat, ["url_host"])
df.head()

Memory usage of dataframe is 46419.35 MB
Memory usage of dataframe is 45.33 GB


  0%|          | 0/1 [00:00<?, ?it/s]

Memory usage after optimization is: 45164.78 MB
Decreased by 2.7%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,Блог,Интернет-магазин,Интернет-форум,Информационный сайт,Корпоративный сайт,Поисковая система,Порно-сайт,Почтовый сервис,Сайт-сервис,Социальная сеть
0,21,409,1,589,5788,2,1,20368.0,2,1,...,0.02206,0.010498,0.021562,0.127385,0.121726,0.079564,0.008757,0.008911,0.584294,0.015242
1,21,409,1,589,12900,2,1,20368.0,2,1,...,0.024201,0.734256,0.02749,0.03459,0.055687,0.025109,0.016231,0.016883,0.040856,0.024697
2,21,409,1,589,17626,2,1,20368.0,0,1,...,0.08717,0.082425,0.087657,0.096718,0.113347,0.138035,0.067645,0.076701,0.168411,0.081891
3,21,409,1,589,59366,2,1,20368.0,0,1,...,0.028243,0.056895,0.048744,0.062024,0.464972,0.193379,0.031416,0.018421,0.047987,0.04792
4,21,409,1,589,59366,2,1,20368.0,0,1,...,0.028243,0.056895,0.048744,0.062024,0.464972,0.193379,0.031416,0.018421,0.047987,0.04792


In [51]:
# Final namespace reset that keeps only `df` and my_reset; the config cell has to
# be re-run afterwards so that SEQ2SEQ_DATA_PATH exists for the save below.
my_reset("df")

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [54]:
# Checkpoint: write the fully featured frame to the stage-2 file (gzip parquet).
df.to_parquet(f"{SEQ2SEQ_DATA_PATH}/stages/stage_2.parquet.gzip", compression='gzip')