### Import libraries

In [1]:
import numpy as np
import pandas as pd
import random
import torch

In [2]:
import sys
import os

# Put the parent directory on the import path; the project-local imports below
# (fe_modules, modules) are presumably resolved from there.
sys.path.append(os.path.abspath("../"))
from fe_modules.aggregates import get_agg_count, get_agg_sum, get_top_n_mode, \
                                  get_agg_mean, get_agg_max, get_agg_min, \
                                  get_agg_median, get_agg_std, get_agg_nunique, \
                                  get_price_of_all_cpes
from fe_modules.cyclical_features import generate_cyclical_features
from fe_modules.datetime_features import get_year, get_month, get_day, \
                                         get_timestamp, get_day_of_year, \
                                         get_day_of_week, get_holiday_name, \
                                         part_of_day_to_hour, add_hour_to_date, \
                                         get_relative_time, mean_first_visit, \
                                         mean_last_visit
from fe_modules.encoders import TargetEncoderWrapper
from fe_modules.feature_merges import add_cat_features
from fe_modules.geo_features import map_cities, geo_dist, dist_to_large_cities, \
                                    map_grid, MapGridTransformer, process_utc
from fe_modules.missing import map_prices
from fe_modules.preprocessing import clean_os_type
from fe_modules.text_manipulation import get_domain
from fe_modules.time_lags import generate_time_lags

from fe_modules.user_fe import UserFE

from modules.memory_utils import pandas_reduce_mem_usage

In [3]:
# One seed shared by every random number generator seeded in this notebook.
random_state = 69

random.seed(random_state)  # Python's built-in `random` module
np.random.seed(random_state)  # NumPy's legacy global generator
torch.manual_seed(random_state)  # PyTorch default generator
torch.cuda.manual_seed_all(random_state)  # PyTorch generators on all CUDA devices

### Read Data

In [4]:
# Load the `ten_millions.parquet` sample and pass it through
# pandas_reduce_mem_usage, which reports memory usage before and after.
# Alternative source (one shard of the original competition data):
#   context_data/competition_data_final_pqt/part-00000-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet
df = pandas_reduce_mem_usage(pd.read_parquet("../ten_millions.parquet"))
df

Memory usage of dataframe is 772.48 MB


  0%|          | 0/12 [00:00<?, ?it/s]

Memory usage after optimization is: 772.48 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098
...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799


In [5]:
# Build the UserFE helper from the raw frame; the following cells add
# aggregate columns to `userfe.df` through its methods.
userfe = UserFE(df)

Memory usage of dataframe is 0.45 MB


  0%|          | 0/7 [00:00<?, ?it/s]

Memory usage after optimization is: 0.45 MB
Decreased by 0.0%


### Feature Engineering

In [6]:
# Apply map_prices (it appears to fill null `price` values; see the warning
# in this cell's output), then re-run the memory optimizer on the result.
df = pandas_reduce_mem_usage(
    map_prices(df)
)
df

Memory usage of dataframe is 848.77 MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["price"][df["price"].isnull()] = df["missing_price"]


  0%|          | 0/12 [00:00<?, ?it/s]

Memory usage after optimization is: 848.77 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098
...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799


In [7]:
# Aggregate the mean of `price` from the event frame into `userfe.df` as the
# `mean_price` column, then optimize userfe's memory and preview the result.
userfe.get_agg_mean(df, target_col="price", alias="mean_price")
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 0.56 MB


  0%|          | 0/8 [00:00<?, ?it/s]

Memory usage after optimization is: 0.56 MB
Decreased by 0.0%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,date,part_of_day,user_id,mean_price
0,Apple,iPhone 7,smartphone,iOS,2022-06-15,morning,45098,20368.0
1,Xiaomi,Redmi 5 Plus,smartphone,Android,2021-08-03,evening,117132,4990.0
2,Samsung,Galaxy S20+,smartphone,Android,2021-09-26,night,79395,74259.0
3,Xiaomi,Poco X3 Pro,smartphone,Android,2022-06-13,day,91294,23876.0
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,2021-07-08,evening,161323,20465.0


In [8]:
# Count `url_host` entries from the event frame into `userfe.df` as the
# `actions_number` column, then optimize userfe's memory and preview the result.
userfe.get_agg_count(df, target_col="url_host", alias="actions_number")
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 0.63 MB


  0%|          | 0/9 [00:00<?, ?it/s]

Memory usage after optimization is: 0.58 MB
Decreased by 8.3%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,date,part_of_day,user_id,mean_price,actions_number
0,Apple,iPhone 7,smartphone,iOS,2022-06-15,morning,45098,20368.0,1550
1,Xiaomi,Redmi 5 Plus,smartphone,Android,2021-08-03,evening,117132,4990.0,722
2,Samsung,Galaxy S20+,smartphone,Android,2021-09-26,night,79395,74259.0,1635
3,Xiaomi,Poco X3 Pro,smartphone,Android,2022-06-13,day,91294,23876.0,1570
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,2021-07-08,evening,161323,20465.0,313


In [9]:
# Sum `request_cnt` from the event frame into `userfe.df` as the `request_sum`
# column, then optimize userfe's memory and preview the result.
userfe.get_agg_sum(df, target_col="request_cnt", alias="request_sum")
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 0.65 MB


  0%|          | 0/10 [00:00<?, ?it/s]

Memory usage after optimization is: 0.59 MB
Decreased by 8.1%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,date,part_of_day,user_id,mean_price,actions_number,request_sum
0,Apple,iPhone 7,smartphone,iOS,2022-06-15,morning,45098,20368.0,1550,2261
1,Xiaomi,Redmi 5 Plus,smartphone,Android,2021-08-03,evening,117132,4990.0,722,1286
2,Samsung,Galaxy S20+,smartphone,Android,2021-09-26,night,79395,74259.0,1635,3306
3,Xiaomi,Poco X3 Pro,smartphone,Android,2022-06-13,day,91294,23876.0,1570,2437
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,2021-07-08,evening,161323,20465.0,313,441


In [10]:
# Add the top-3 `url_host` modes to `userfe.df`; with n=3 the output shows the
# new columns `user_id_mode_0` .. `user_id_mode_2`. Then optimize and preview.
userfe.get_top_n_mode(df, target_col="url_host", n=3)
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 0.84 MB


  0%|          | 0/13 [00:00<?, ?it/s]

Memory usage after optimization is: 0.84 MB
Decreased by 0.0%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,date,part_of_day,user_id,mean_price,actions_number,request_sum,user_id_mode_0,user_id_mode_1,user_id_mode_2
0,Apple,iPhone 7,smartphone,iOS,2022-06-15,morning,45098,20368.0,1550,2261,apple.com,i.ytimg.com,googleads.g.doubleclick.net
1,Xiaomi,Redmi 5 Plus,smartphone,Android,2021-08-03,evening,117132,4990.0,722,1286,vk.com,sun9-27.userapi.com,sun9-32.userapi.com
2,Samsung,Galaxy S20+,smartphone,Android,2021-09-26,night,79395,74259.0,1635,3306,i.ytimg.com,yandex.ru,googleads.g.doubleclick.net
3,Xiaomi,Poco X3 Pro,smartphone,Android,2022-06-13,day,91294,23876.0,1570,2437,i.ytimg.com,googleads.g.doubleclick.net,yandex.ru
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,2021-07-08,evening,161323,20465.0,313,441,yandex.ru,googleads.g.doubleclick.net,i.ytimg.com


In [6]:
# Run clean_os_type over the frame, then re-run the memory optimizer.
df = pandas_reduce_mem_usage(
    clean_os_type(df)
)
df

Memory usage of dataframe is 848.77 MB


  0%|          | 0/12 [00:00<?, ?it/s]

Memory usage after optimization is: 848.77 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098
...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799


In [7]:
# Add a count aggregate over `url_host`; with the default alias the new column
# shows up in the output as `user_id_group_count`. Then optimize memory.
df = pandas_reduce_mem_usage(
    get_agg_count(df, target_col="url_host")
)
df

Memory usage of dataframe is 925.06 MB


  0%|          | 0/13 [00:00<?, ?it/s]

Memory usage after optimization is: 867.84 MB
Decreased by 6.2%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,user_id_group_count
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,1550
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,1550
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,1550
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,1550
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,1550
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,1381
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,1381
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,1381
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,1381


In [8]:
# Add the sum of `request_cnt` grouped by `user_id` as `user_id_request_sum`,
# then re-run the memory optimizer.
df = pandas_reduce_mem_usage(
    get_agg_sum(
        df,
        agg_col="user_id",
        target_col="request_cnt",
        alias="user_id_request_sum",
    )
)
df

Memory usage of dataframe is 944.14 MB


  0%|          | 0/14 [00:00<?, ?it/s]

Memory usage after optimization is: 886.92 MB
Decreased by 6.1%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,user_id_group_count,user_id_request_sum
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,1550,2261
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,1550,2261
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,1550,2261
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,1550,2261
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,1550,2261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,1381,3008
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,1381,3008
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,1381,3008
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,1381,3008


#### Checkpoint 1

In [9]:
# Checkpoint: persist the frame built so far; the next cell reloads this file.
df.to_parquet("../checkpoint_1.parquet")

In [4]:
# Resume from checkpoint 1: read the saved frame and pass it through the
# memory optimizer.
df = pd.read_parquet("../checkpoint_1.parquet").pipe(pandas_reduce_mem_usage)
df

Memory usage of dataframe is 886.92 MB


  0%|          | 0/14 [00:00<?, ?it/s]

Memory usage after optimization is: 886.92 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,user_id_group_count,user_id_request_sum
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,1550,2261
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,1550,2261
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,1550,2261
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,1550,2261
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,1550,2261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,1381,3008
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,1381,3008
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,1381,3008
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,1381,3008


In [5]:
# Add the mean of `price` as `user_id_mean_price`, then re-run the memory
# optimizer.
df = pandas_reduce_mem_usage(
    get_agg_mean(df, target_col="price", alias="user_id_mean_price")
)
df

Memory usage of dataframe is 925.06 MB


  0%|          | 0/15 [00:00<?, ?it/s]

Memory usage after optimization is: 925.06 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,user_id_group_count,user_id_request_sum,user_id_mean_price
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,1550,2261,20368.0
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,1550,2261,20368.0
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,1550,2261,20368.0
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,1550,2261,20368.0
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,1550,2261,20368.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,1381,3008,12544.0
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,1381,3008,12544.0
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,1381,3008,12544.0
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,1381,3008,12544.0


In [6]:
# Add the top-3 `url_host` modes; the output shows them as `user_id_mode_0`
# .. `user_id_mode_2`. Then re-run the memory optimizer.
df = pandas_reduce_mem_usage(
    get_top_n_mode(df, target_col="url_host", n=3)
)
df

Memory usage of dataframe is 1192.09 MB


  0%|          | 0/18 [00:00<?, ?it/s]

Memory usage after optimization is: 1192.09 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,user_id_group_count,user_id_request_sum,user_id_mean_price,user_id_mode_0,user_id_mode_1,user_id_mode_2
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net


In [7]:
# Map `part_of_day` to an integer hour stored in `hour_int` (the output shows
# e.g. morning -> 9, day -> 15, night -> 3), then re-run the memory optimizer.
df = pandas_reduce_mem_usage(
    part_of_day_to_hour(df, return_dtype="int", alias="hour_int")
)
df

Memory usage of dataframe is 1268.39 MB


  0%|          | 0/19 [00:00<?, ?it/s]

Memory usage after optimization is: 1201.63 MB
Decreased by 5.3%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,user_id_group_count,user_id_request_sum,user_id_mean_price,user_id_mode_0,user_id_mode_1,user_id_mode_2,hour_int
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,15
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3


In [8]:
%%time
# Apply mean_first_visit (the output shows a new `mean_fv` column), then re-run
# the memory optimizer.
df = pandas_reduce_mem_usage(
    mean_first_visit(df)
)
df

Memory usage of dataframe is 1277.92 MB


  0%|          | 0/20 [00:00<?, ?it/s]

Memory usage after optimization is: 1239.78 MB
Decreased by 3.0%
CPU times: total: 28.6 s
Wall time: 28.9 s


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,user_id_group_count,user_id_request_sum,user_id_mean_price,user_id_mode_0,user_id_mode_1,user_id_mode_2,hour_int,mean_fv
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9,5.334194
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9,5.334194
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,15,4.759594
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594


In [9]:
%%time
# Apply mean_last_visit (the output shows a new `mean_lv` column), then re-run
# the memory optimizer.
df = pandas_reduce_mem_usage(
    mean_last_visit(df)
)
df

Memory usage of dataframe is 1316.07 MB


  0%|          | 0/21 [00:00<?, ?it/s]

Memory usage after optimization is: 1277.92 MB
Decreased by 2.9%
CPU times: total: 29.1 s
Wall time: 29.2 s


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,user_id,user_id_group_count,user_id_request_sum,user_id_mean_price,user_id_mode_0,user_id_mode_1,user_id_mode_2,hour_int,mean_fv,mean_lv
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9,5.334194,20.593548
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9,5.334194,20.593548
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194,20.593548
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194,20.593548
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194,20.593548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594,21.000000
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,15,4.759594,21.000000
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594,21.000000
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594,21.000000


#### Checkpoint 2

In [10]:
# Checkpoint: persist the frame built so far; the next cell reloads this file.
df.to_parquet("../checkpoint_2.parquet")

In [4]:
# Resume from checkpoint 2: read the saved frame and pass it through the
# memory optimizer.
df = pd.read_parquet("../checkpoint_2.parquet").pipe(pandas_reduce_mem_usage)
df

Memory usage of dataframe is 1277.92 MB


  0%|          | 0/21 [00:00<?, ?it/s]

Memory usage after optimization is: 1239.78 MB
Decreased by 3.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,user_id,user_id_group_count,user_id_request_sum,user_id_mean_price,user_id_mode_0,user_id_mode_1,user_id_mode_2,hour_int,mean_fv,mean_lv
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9,5.334194,20.593548
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9,5.334194,20.593548
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194,20.593548
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194,20.593548
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194,20.593548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594,21.000000
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,15,4.759594,21.000000
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594,21.000000
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594,21.000000
