### Import libraries

In [1]:
import numpy as np
import pandas as pd
import random
import torch

In [2]:
import sys
import os

# Put the parent directory on the import path so the project-local packages
# imported below (`fe_modules`, `modules`) can be resolved.
sys.path.append(os.path.abspath("../"))
from fe_modules.aggregates import get_agg_count, get_agg_sum, get_top_n_mode, \
                                  get_agg_mean, get_agg_max, get_agg_min, \
                                  get_agg_median, get_agg_std, get_agg_nunique, \
                                  get_price_of_all_cpes, get_ratio_part_of_day
from fe_modules.cyclical_features import generate_cyclical_features
from fe_modules.datetime_features import get_year, get_month, get_day, \
                                         get_timestamp, get_day_of_year, \
                                         get_day_of_week, get_holiday_name, \
                                         part_of_day_to_hour, add_hour_to_date, \
                                         get_relative_time, mean_first_visit, \
                                         mean_last_visit
from fe_modules.encoders import TargetEncoderWrapper
from fe_modules.feature_merges import add_cat_features
from fe_modules.geo_features import map_cities, geo_dist, dist_to_large_cities, \
                                    map_grid, MapGridTransformer, process_utc
from fe_modules.missing import map_prices
from fe_modules.preprocessing import clean_os_type
from fe_modules.text_manipulation import get_domain
from fe_modules.time_lags import generate_time_lags

from fe_modules.user_fe import UserFE

from modules.memory_utils import pandas_reduce_mem_usage

In [3]:
# One seed value is fed to every random number generator used here:
# Python's `random`, NumPy's global generator, and PyTorch (CPU and all CUDA devices).
random_state = 69

# The original line ended with a stray comma, which turned the statement into
# a one-element tuple expression; the call is now a plain statement.
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)
torch.cuda.manual_seed_all(random_state)

### Read Data

In [4]:
# Load the event log; pandas_reduce_mem_usage prints the frame's memory
# footprint before and after its optimisation pass (see the stored output).
# A previously commented-out variant read this shard instead:
#   context_data/competition_data_final_pqt/part-00000-aba60f69-2b63-4cc1-95ca-542598094698-c000.snappy.parquet
# The path below is a plain string: the original used an f-string with no placeholders.
df = pandas_reduce_mem_usage(
    pd.read_parquet("../10m.parquet")
)
df

Memory usage of dataframe is 963.21 MB


  0%|          | 0/14 [00:00<?, ?it/s]

Memory usage after optimization is: 963.21 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,age,is_male
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,,
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,,
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,,
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,,
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,,
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,,
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,,
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,,


In [5]:
# Convert the `is_male` label to float32, with NaN standing for "unknown".
# NOTE(review): filling NaN with NaN looks like a no-op; presumably it is here to
# turn None into NaN before the cast below — confirm before removing.
df["is_male"] = df["is_male"].fillna(value=np.nan)
# Rows holding the literal string "NA" are treated as missing as well.
df.loc[df["is_male"] == "NA", "is_male"] = np.nan
df["is_male"] = df["is_male"].astype(np.float32)
df

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,age,is_male
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,,
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,,
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,,
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,,
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,,
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,,
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,,
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,,


In [6]:
# Build the user-level feature container from the event-level frame.
# NOTE(review): later `userfe.df.head()` outputs suggest it keeps one row per user
# with the device columns and the age / is_male targets — confirm in fe_modules.user_fe.
userfe = UserFE(df)

Memory usage of dataframe is 0.38 MB


  0%|          | 0/7 [00:00<?, ?it/s]

Memory usage after optimization is: 0.38 MB
Decreased by 0.0%


### Feature Engineering

In [7]:
# Fill in missing prices with the project helper, then rerun the memory pass.
# Each step rebinds `df` on its own line, exactly as before.
df = df.pipe(map_prices)
df = df.pipe(pandas_reduce_mem_usage)
df

Memory usage of dataframe is 925.06 MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["price"][df["price"].isnull()] = df["missing_price"]


  0%|          | 0/14 [00:00<?, ?it/s]

Memory usage after optimization is: 925.06 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,age,is_male
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,,
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,,
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,,
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,,
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,,
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,,
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,,
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,,


In [8]:
# Mean of `price` per user, added to the user-level frame as `mean_price`.
userfe.get_agg(df, target_col="price", agg_name="mean", alias="mean_price")
# Rerun the memory pass on the user-level frame after adding the column.
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 0.49 MB


  0%|          | 0/8 [00:00<?, ?it/s]

Memory usage after optimization is: 0.49 MB
Decreased by 0.0%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,user_id,age,is_male,mean_price
0,Apple,iPhone 7,smartphone,iOS,45098,,,20368.0
1,Xiaomi,Redmi 5 Plus,smartphone,Android,117132,,,4990.0
2,Samsung,Galaxy S20+,smartphone,Android,79395,35.0,1.0,74259.0
3,Xiaomi,Poco X3 Pro,smartphone,Android,91294,35.0,1.0,23876.0
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,161323,39.0,0.0,20465.0


In [9]:
# Count of `url_host` entries per user, added as `actions_number`.
userfe.get_agg(df, target_col="url_host", agg_name="count", alias="actions_number")
# Rerun the memory pass on the user-level frame after adding the column.
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 0.56 MB


  0%|          | 0/9 [00:00<?, ?it/s]

Memory usage after optimization is: 0.51 MB
Decreased by 9.4%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,user_id,age,is_male,mean_price,actions_number
0,Apple,iPhone 7,smartphone,iOS,45098,,,20368.0,1550
1,Xiaomi,Redmi 5 Plus,smartphone,Android,117132,,,4990.0,722
2,Samsung,Galaxy S20+,smartphone,Android,79395,35.0,1.0,74259.0,1635
3,Xiaomi,Poco X3 Pro,smartphone,Android,91294,35.0,1.0,23876.0,1570
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,161323,39.0,0.0,20465.0,313


In [10]:
# Sum of `request_cnt` per user, added as `request_sum`.
userfe.get_agg(df, target_col="request_cnt", agg_name="sum", alias="request_sum")
# Rerun the memory pass on the user-level frame after adding the column.
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 0.58 MB


  0%|          | 0/10 [00:00<?, ?it/s]

Memory usage after optimization is: 0.52 MB
Decreased by 9.1%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,user_id,age,is_male,mean_price,actions_number,request_sum
0,Apple,iPhone 7,smartphone,iOS,45098,,,20368.0,1550,2261
1,Xiaomi,Redmi 5 Plus,smartphone,Android,117132,,,4990.0,722,1286
2,Samsung,Galaxy S20+,smartphone,Android,79395,35.0,1.0,74259.0,1635,3306
3,Xiaomi,Poco X3 Pro,smartphone,Android,91294,35.0,1.0,23876.0,1570,2437
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,161323,39.0,0.0,20465.0,313,441


In [11]:
# Top-3 `url_host` values per user; the output shows them as
# `url_host_mode_0` .. `url_host_mode_2`.
userfe.get_top_n_mode(df, target_col="url_host", n=3)
# Rerun the memory pass on the user-level frame after adding the columns.
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 0.77 MB


  0%|          | 0/13 [00:00<?, ?it/s]

Memory usage after optimization is: 0.77 MB
Decreased by 0.0%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,user_id,age,is_male,mean_price,actions_number,request_sum,url_host_mode_0,url_host_mode_1,url_host_mode_2
0,Apple,iPhone 7,smartphone,iOS,45098,,,20368.0,1550,2261,apple.com,i.ytimg.com,googleads.g.doubleclick.net
1,Xiaomi,Redmi 5 Plus,smartphone,Android,117132,,,4990.0,722,1286,vk.com,sun9-27.userapi.com,sun9-32.userapi.com
2,Samsung,Galaxy S20+,smartphone,Android,79395,35.0,1.0,74259.0,1635,3306,i.ytimg.com,yandex.ru,googleads.g.doubleclick.net
3,Xiaomi,Poco X3 Pro,smartphone,Android,91294,35.0,1.0,23876.0,1570,2437,i.ytimg.com,googleads.g.doubleclick.net,yandex.ru
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,161323,39.0,0.0,20465.0,313,441,yandex.ru,googleads.g.doubleclick.net,i.ytimg.com


In [12]:
# Adds the columns `morning`, `day`, `evening`, `night`; in the rows displayed
# below the four values of each user sum to 1.
# NOTE(review): whether the ratio is over rows or over request counts is not
# visible here — confirm in fe_modules.
userfe.get_ratio_part_of_day(df)
# Rerun the memory pass on the user-level frame after adding the columns.
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 1.05 MB


  0%|          | 0/17 [00:00<?, ?it/s]

Memory usage after optimization is: 1.05 MB
Decreased by 0.0%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,user_id,age,is_male,mean_price,actions_number,request_sum,url_host_mode_0,url_host_mode_1,url_host_mode_2,morning,day,evening,night
0,Apple,iPhone 7,smartphone,iOS,45098,,,20368.0,1550,2261,apple.com,i.ytimg.com,googleads.g.doubleclick.net,0.254839,0.349677,0.287742,0.107742
1,Xiaomi,Redmi 5 Plus,smartphone,Android,117132,,,4990.0,722,1286,vk.com,sun9-27.userapi.com,sun9-32.userapi.com,0.227147,0.412742,0.357341,0.00277
2,Samsung,Galaxy S20+,smartphone,Android,79395,35.0,1.0,74259.0,1635,3306,i.ytimg.com,yandex.ru,googleads.g.doubleclick.net,0.211621,0.35841,0.320489,0.10948
3,Xiaomi,Poco X3 Pro,smartphone,Android,91294,35.0,1.0,23876.0,1570,2437,i.ytimg.com,googleads.g.doubleclick.net,yandex.ru,0.215287,0.350955,0.328025,0.105732
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,161323,39.0,0.0,20465.0,313,441,yandex.ru,googleads.g.doubleclick.net,i.ytimg.com,0.5623,0.207668,0.204473,0.025559


In [14]:
# Top-3 `region_name` values per user (`region_name_mode_0` .. `region_name_mode_2`).
# Users with fewer than three regions show blank trailing slots in the output.
userfe.get_top_n_mode(df, target_col="region_name", n=3)
# Rerun the memory pass on the user-level frame after adding the columns.
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 1.26 MB


  0%|          | 0/20 [00:00<?, ?it/s]

Memory usage after optimization is: 1.26 MB
Decreased by 0.0%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,user_id,age,is_male,mean_price,actions_number,request_sum,url_host_mode_0,url_host_mode_1,url_host_mode_2,morning,day,evening,night,region_name_mode_0,region_name_mode_1,region_name_mode_2
0,Apple,iPhone 7,smartphone,iOS,45098,,,20368.0,1550,2261,apple.com,i.ytimg.com,googleads.g.doubleclick.net,0.254839,0.349677,0.287742,0.107742,Краснодарский край,Ставропольский край,Республика Адыгея
1,Xiaomi,Redmi 5 Plus,smartphone,Android,117132,,,4990.0,722,1286,vk.com,sun9-27.userapi.com,sun9-32.userapi.com,0.227147,0.412742,0.357341,0.00277,Санкт-Петербург,<blank>,<blank>
2,Samsung,Galaxy S20+,smartphone,Android,79395,35.0,1.0,74259.0,1635,3306,i.ytimg.com,yandex.ru,googleads.g.doubleclick.net,0.211621,0.35841,0.320489,0.10948,Санкт-Петербург,Калининградская область,<blank>
3,Xiaomi,Poco X3 Pro,smartphone,Android,91294,35.0,1.0,23876.0,1570,2437,i.ytimg.com,googleads.g.doubleclick.net,yandex.ru,0.215287,0.350955,0.328025,0.105732,Краснодарский край,<blank>,<blank>
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,161323,39.0,0.0,20465.0,313,441,yandex.ru,googleads.g.doubleclick.net,i.ytimg.com,0.5623,0.207668,0.204473,0.025559,Тюменская область,Свердловская область,<blank>


In [15]:
# Top-3 `city_name` values per user (`city_name_mode_0` .. `city_name_mode_2`).
# Users with fewer than three cities show blank trailing slots in the output.
userfe.get_top_n_mode(df, target_col="city_name", n=3)
# Rerun the memory pass on the user-level frame after adding the columns.
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 1.47 MB


  0%|          | 0/23 [00:00<?, ?it/s]

Memory usage after optimization is: 1.47 MB
Decreased by 0.0%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,user_id,age,is_male,mean_price,actions_number,request_sum,...,morning,day,evening,night,region_name_mode_0,region_name_mode_1,region_name_mode_2,city_name_mode_0,city_name_mode_1,city_name_mode_2
0,Apple,iPhone 7,smartphone,iOS,45098,,,20368.0,1550,2261,...,0.254839,0.349677,0.287742,0.107742,Краснодарский край,Ставропольский край,Республика Адыгея,Краснодар,Ставрополь,Адыгейск
1,Xiaomi,Redmi 5 Plus,smartphone,Android,117132,,,4990.0,722,1286,...,0.227147,0.412742,0.357341,0.00277,Санкт-Петербург,<blank>,<blank>,Санкт-Петербург,<blank>,<blank>
2,Samsung,Galaxy S20+,smartphone,Android,79395,35.0,1.0,74259.0,1635,3306,...,0.211621,0.35841,0.320489,0.10948,Санкт-Петербург,Калининградская область,<blank>,Санкт-Петербург,Калининград,<blank>
3,Xiaomi,Poco X3 Pro,smartphone,Android,91294,35.0,1.0,23876.0,1570,2437,...,0.215287,0.350955,0.328025,0.105732,Краснодарский край,<blank>,<blank>,Новороссийск,<blank>,<blank>
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,161323,39.0,0.0,20465.0,313,441,...,0.5623,0.207668,0.204473,0.025559,Тюменская область,Свердловская область,<blank>,Тюмень,Екатеринбург,<blank>


In [16]:
# Mean of `request_cnt` per user, added as `mean_request_cnt`.
userfe.get_agg(df, target_col="request_cnt", agg_name="mean", alias="mean_request_cnt")
# Rerun the memory pass on the user-level frame after adding the column.
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 1.54 MB


  0%|          | 0/24 [00:00<?, ?it/s]

Memory usage after optimization is: 1.54 MB
Decreased by 0.0%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,user_id,age,is_male,mean_price,actions_number,request_sum,...,day,evening,night,region_name_mode_0,region_name_mode_1,region_name_mode_2,city_name_mode_0,city_name_mode_1,city_name_mode_2,mean_request_cnt
0,Apple,iPhone 7,smartphone,iOS,45098,,,20368.0,1550,2261,...,0.349677,0.287742,0.107742,Краснодарский край,Ставропольский край,Республика Адыгея,Краснодар,Ставрополь,Адыгейск,1.45871
1,Xiaomi,Redmi 5 Plus,smartphone,Android,117132,,,4990.0,722,1286,...,0.412742,0.357341,0.00277,Санкт-Петербург,<blank>,<blank>,Санкт-Петербург,<blank>,<blank>,1.781163
2,Samsung,Galaxy S20+,smartphone,Android,79395,35.0,1.0,74259.0,1635,3306,...,0.35841,0.320489,0.10948,Санкт-Петербург,Калининградская область,<blank>,Санкт-Петербург,Калининград,<blank>,2.022018
3,Xiaomi,Poco X3 Pro,smartphone,Android,91294,35.0,1.0,23876.0,1570,2437,...,0.350955,0.328025,0.105732,Краснодарский край,<blank>,<blank>,Новороссийск,<blank>,<blank>,1.552229
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,161323,39.0,0.0,20465.0,313,441,...,0.207668,0.204473,0.025559,Тюменская область,Свердловская область,<blank>,Тюмень,Екатеринбург,<blank>,1.408946


In [17]:
# Derive the time columns one helper at a time; per the output this adds
# `hour`, `datetime`, `timestamp` and `relative_timestamp` to the event frame.
# Each step rebinds `df` before the next one runs, exactly as before.
df = df.pipe(part_of_day_to_hour)
df = df.pipe(add_hour_to_date)
df = df.pipe(get_timestamp)
df = df.pipe(get_relative_time)
df = df.pipe(pandas_reduce_mem_usage)
df

  df[alias] = pd.DatetimeIndex(df[date_col]).astype(int) / scaler


Memory usage of dataframe is 0.14 MB


  0%|          | 0/1 [00:00<?, ?it/s]

Memory usage after optimization is: 0.14 MB
Decreased by 0.0%
Memory usage of dataframe is 1230.24 MB


  0%|          | 0/1 [00:00<?, ?it/s]

Memory usage after optimization is: 1230.24 MB
Decreased by 0.0%
Memory usage of dataframe is 1230.24 MB


  0%|          | 0/18 [00:00<?, ?it/s]

Memory usage after optimization is: 1230.24 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,age,is_male,hour,datetime,timestamp,relative_timestamp
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,,,0 days 09:00:00,2022-06-15 09:00:00,165528360.0,388.8
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,,,0 days 09:00:00,2022-06-19 09:00:00,165562920.0,43.2
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,,,0 days 15:00:00,2022-06-12 15:00:00,165504600.0,626.4
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,,,0 days 15:00:00,2022-05-16 15:00:00,165271320.0,2959.2
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,,,0 days 15:00:00,2022-05-30 15:00:00,165392280.0,1749.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,,,0 days 03:00:00,2021-06-30 03:00:00,162502200.0,3088.8
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,,,0 days 15:00:00,2021-07-10 15:00:00,162592920.0,2181.6
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,,,0 days 03:00:00,2021-07-31 03:00:00,162770040.0,410.4
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,,,0 days 03:00:00,2021-06-26 03:00:00,162467640.0,3434.4


In [18]:
# Adds a `timespan` column derived from the `datetime` column of the event frame.
# The source line echoed in the output shows it is computed from a per-user
# `date_max` value minus another DatetimeIndex value; units are not visible
# here — TODO confirm in fe_modules.user_fe.
userfe.get_timespan(df, date_col="datetime")
# Rerun the memory pass on the user-level frame after adding the column.
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 1.61 MB


  self.df[alias] = (pd.DatetimeIndex(self.df["date_max"]).astype(int) - pd.DatetimeIndex(


  0%|          | 0/25 [00:00<?, ?it/s]

Memory usage after optimization is: 1.61 MB
Decreased by 0.0%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,user_id,age,is_male,mean_price,actions_number,request_sum,...,evening,night,region_name_mode_0,region_name_mode_1,region_name_mode_2,city_name_mode_0,city_name_mode_1,city_name_mode_2,mean_request_cnt,timespan
0,Apple,iPhone 7,smartphone,iOS,45098,,,20368.0,1550,2261,...,0.287742,0.107742,Краснодарский край,Ставропольский край,Республика Адыгея,Краснодар,Ставрополь,Адыгейск,1.45871,100872.0
1,Xiaomi,Redmi 5 Plus,smartphone,Android,117132,,,4990.0,722,1286,...,0.357341,0.00277,Санкт-Петербург,<blank>,<blank>,Санкт-Петербург,<blank>,<blank>,1.781163,61776.0
2,Samsung,Galaxy S20+,smartphone,Android,79395,35.0,1.0,74259.0,1635,3306,...,0.320489,0.10948,Санкт-Петербург,Калининградская область,<blank>,Санкт-Петербург,Калининград,<blank>,2.022018,43200.0
3,Xiaomi,Poco X3 Pro,smartphone,Android,91294,35.0,1.0,23876.0,1570,2437,...,0.328025,0.105732,Краснодарский край,<blank>,<blank>,Новороссийск,<blank>,<blank>,1.552229,100656.0
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,161323,39.0,0.0,20465.0,313,441,...,0.204473,0.025559,Тюменская область,Свердловская область,<blank>,Тюмень,Екатеринбург,<blank>,1.408946,15120.0


In [19]:
# Adds `ratio_request_timespan`; in the rows displayed below it matches
# `request_sum / timespan` (e.g. 2261 / 100872 ≈ 0.022415).
userfe.get_ratio_request_timespan()
# Rerun the memory pass on the user-level frame after adding the column.
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 1.68 MB


  0%|          | 0/26 [00:00<?, ?it/s]

Memory usage after optimization is: 1.68 MB
Decreased by 0.0%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,user_id,age,is_male,mean_price,actions_number,request_sum,...,night,region_name_mode_0,region_name_mode_1,region_name_mode_2,city_name_mode_0,city_name_mode_1,city_name_mode_2,mean_request_cnt,timespan,ratio_request_timespan
0,Apple,iPhone 7,smartphone,iOS,45098,,,20368.0,1550,2261,...,0.107742,Краснодарский край,Ставропольский край,Республика Адыгея,Краснодар,Ставрополь,Адыгейск,1.45871,100872.0,0.022415
1,Xiaomi,Redmi 5 Plus,smartphone,Android,117132,,,4990.0,722,1286,...,0.00277,Санкт-Петербург,<blank>,<blank>,Санкт-Петербург,<blank>,<blank>,1.781163,61776.0,0.020817
2,Samsung,Galaxy S20+,smartphone,Android,79395,35.0,1.0,74259.0,1635,3306,...,0.10948,Санкт-Петербург,Калининградская область,<blank>,Санкт-Петербург,Калининград,<blank>,2.022018,43200.0,0.076528
3,Xiaomi,Poco X3 Pro,smartphone,Android,91294,35.0,1.0,23876.0,1570,2437,...,0.105732,Краснодарский край,<blank>,<blank>,Новороссийск,<blank>,<blank>,1.552229,100656.0,0.024211
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,161323,39.0,0.0,20465.0,313,441,...,0.025559,Тюменская область,Свердловская область,<blank>,Тюмень,Екатеринбург,<blank>,1.408946,15120.0,0.029167


In [20]:
# Number of distinct `region_name` values per user, added as `nunique_region_name`.
userfe.get_agg(df, target_col="region_name", agg_name="nunique", alias="nunique_region_name")
# Rerun the memory pass on the user-level frame after adding the column.
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 1.75 MB


  0%|          | 0/27 [00:00<?, ?it/s]

Memory usage after optimization is: 1.69 MB
Decreased by 3.5%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,user_id,age,is_male,mean_price,actions_number,request_sum,...,region_name_mode_0,region_name_mode_1,region_name_mode_2,city_name_mode_0,city_name_mode_1,city_name_mode_2,mean_request_cnt,timespan,ratio_request_timespan,nunique_region_name
0,Apple,iPhone 7,smartphone,iOS,45098,,,20368.0,1550,2261,...,Краснодарский край,Ставропольский край,Республика Адыгея,Краснодар,Ставрополь,Адыгейск,1.45871,100872.0,0.022415,3
1,Xiaomi,Redmi 5 Plus,smartphone,Android,117132,,,4990.0,722,1286,...,Санкт-Петербург,<blank>,<blank>,Санкт-Петербург,<blank>,<blank>,1.781163,61776.0,0.020817,1
2,Samsung,Galaxy S20+,smartphone,Android,79395,35.0,1.0,74259.0,1635,3306,...,Санкт-Петербург,Калининградская область,<blank>,Санкт-Петербург,Калининград,<blank>,2.022018,43200.0,0.076528,2
3,Xiaomi,Poco X3 Pro,smartphone,Android,91294,35.0,1.0,23876.0,1570,2437,...,Краснодарский край,<blank>,<blank>,Новороссийск,<blank>,<blank>,1.552229,100656.0,0.024211,1
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,161323,39.0,0.0,20465.0,313,441,...,Тюменская область,Свердловская область,<blank>,Тюмень,Екатеринбург,<blank>,1.408946,15120.0,0.029167,2


In [21]:
# Number of distinct `city_name` values per user, added as `nunique_city_name`.
userfe.get_agg(df, target_col="city_name", agg_name="nunique", alias="nunique_city_name")
# Rerun the memory pass on the user-level frame after adding the column.
userfe.pandas_reduce_mem_usage()
userfe.df.head()

Memory usage of dataframe is 1.76 MB


  0%|          | 0/28 [00:00<?, ?it/s]

Memory usage after optimization is: 1.69 MB
Decreased by 3.5%


Unnamed: 0,cpe_manufacturer_name,cpe_model_name,cpe_type_cd,cpe_model_os_type,user_id,age,is_male,mean_price,actions_number,request_sum,...,region_name_mode_1,region_name_mode_2,city_name_mode_0,city_name_mode_1,city_name_mode_2,mean_request_cnt,timespan,ratio_request_timespan,nunique_region_name,nunique_city_name
0,Apple,iPhone 7,smartphone,iOS,45098,,,20368.0,1550,2261,...,Ставропольский край,Республика Адыгея,Краснодар,Ставрополь,Адыгейск,1.45871,100872.0,0.022415,3,5
1,Xiaomi,Redmi 5 Plus,smartphone,Android,117132,,,4990.0,722,1286,...,<blank>,<blank>,Санкт-Петербург,<blank>,<blank>,1.781163,61776.0,0.020817,1,1
2,Samsung,Galaxy S20+,smartphone,Android,79395,35.0,1.0,74259.0,1635,3306,...,Калининградская область,<blank>,Санкт-Петербург,Калининград,<blank>,2.022018,43200.0,0.076528,2,2
3,Xiaomi,Poco X3 Pro,smartphone,Android,91294,35.0,1.0,23876.0,1570,2437,...,<blank>,<blank>,Новороссийск,<blank>,<blank>,1.552229,100656.0,0.024211,1,1
4,Xiaomi,Redmi Note 8 Pro,smartphone,Android,161323,39.0,0.0,20465.0,313,441,...,Свердловская область,<blank>,Тюмень,Екатеринбург,<blank>,1.408946,15120.0,0.029167,2,2


In [22]:
# Pairwise correlation (pandas default method) of the numeric user-level features.
# numeric_only=True is passed explicitly because the frame also holds string
# columns (device names, top hosts / regions / cities). Relying on the implicit
# default produced the warning stored under this cell, and on pandas >= 2.0 the
# default no longer drops non-numeric columns, so the call would fail there.
userfe.df.corr(numeric_only=True)

  userfe.df.corr()


Unnamed: 0,age,is_male,mean_price,actions_number,request_sum,morning,day,evening,night,mean_request_cnt,timespan,ratio_request_timespan,nunique_region_name,nunique_city_name
age,1.0,-0.115143,-0.252101,-0.105286,-0.101791,0.185748,0.053993,-0.151677,-0.105853,-0.038662,-0.021835,-0.117887,-0.014221,-0.047653
is_male,-0.115143,1.0,-0.026682,0.019453,0.016694,0.017593,-0.064264,-0.042245,0.109827,-0.027617,0.018662,-0.002222,0.07462,0.090546
mean_price,-0.252101,-0.026682,1.0,0.098581,0.078263,-0.150833,0.07322,0.094768,-0.026713,-0.021608,0.079696,0.037508,0.126072,0.133555
actions_number,-0.105286,0.019453,0.098581,1.0,0.98118,-0.011776,-0.128876,0.049792,0.116854,0.209663,0.606432,0.550275,0.107878,0.144796
request_sum,-0.101791,0.016694,0.078263,0.98118,1.0,-0.014566,-0.14123,0.044825,0.14168,0.326319,0.555665,0.606007,0.087956,0.120852
morning,0.185748,0.017593,-0.150833,-0.011776,-0.014566,1.0,-0.099312,-0.584248,-0.362327,-0.020471,0.016091,-0.055027,-0.042251,-0.022502
day,0.053993,-0.064264,0.07322,-0.128876,-0.14123,-0.099312,1.0,-0.488151,-0.558664,-0.159809,0.06377,-0.217604,-0.001531,0.004665
evening,-0.151677,-0.042245,0.094768,0.049792,0.044825,-0.584248,-0.488151,1.0,0.112118,0.007265,-0.021868,0.078126,0.022914,0.011073
night,-0.105853,0.109827,-0.026713,0.116854,0.14168,-0.362327,-0.558664,0.112118,1.0,0.217045,-0.073314,0.246119,0.024648,0.007572
mean_request_cnt,-0.038662,-0.027617,-0.021608,0.209663,0.326319,-0.020471,-0.159809,0.007265,0.217045,1.0,-0.074458,0.53666,-0.06965,-0.065373


In [6]:
# Run the project's OS-type cleanup on the event frame, then the memory pass.
# NOTE(review): presumably this normalises `cpe_model_os_type` values — confirm
# in fe_modules.preprocessing.
df = df.pipe(clean_os_type)
df = df.pipe(pandas_reduce_mem_usage)
df

Memory usage of dataframe is 848.77 MB


  0%|          | 0/12 [00:00<?, ?it/s]

Memory usage after optimization is: 848.77 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098
...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799


In [7]:
# Attach a count over `url_host` to every event row; the output shows the new
# column as `user_id_group_count`, repeated on each row of the same user.
df = df.pipe(get_agg_count, target_col="url_host")
df = df.pipe(pandas_reduce_mem_usage)
df

Memory usage of dataframe is 925.06 MB


  0%|          | 0/13 [00:00<?, ?it/s]

Memory usage after optimization is: 867.84 MB
Decreased by 6.2%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,user_id_group_count
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,1550
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,1550
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,1550
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,1550
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,1550
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,1381
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,1381
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,1381
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,1381


In [8]:
# Attach the sum of `request_cnt`, grouped by `user_id`, to every event row
# as `user_id_request_sum`.
df = df.pipe(
    get_agg_sum,
    agg_col="user_id",
    target_col="request_cnt",
    alias="user_id_request_sum",
)
df = df.pipe(pandas_reduce_mem_usage)
df

Memory usage of dataframe is 944.14 MB


  0%|          | 0/14 [00:00<?, ?it/s]

Memory usage after optimization is: 886.92 MB
Decreased by 6.1%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,user_id_group_count,user_id_request_sum
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,1550,2261
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,1550,2261
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,1550,2261
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,1550,2261
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,1550,2261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,1381,3008
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,1381,3008
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,1381,3008
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,1381,3008


#### Checkpoint 1

In [9]:
# Persist the event-level frame; the next cell reloads it from this file.
df.to_parquet("../checkpoint_1.parquet")

In [4]:
# Reload the frame saved at checkpoint 1 and rerun the memory pass on it.
df = pd.read_parquet("../checkpoint_1.parquet").pipe(pandas_reduce_mem_usage)
df

Memory usage of dataframe is 886.92 MB


  0%|          | 0/14 [00:00<?, ?it/s]

Memory usage after optimization is: 886.92 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,user_id_group_count,user_id_request_sum
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,1550,2261
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,1550,2261
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,1550,2261
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,1550,2261
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,1550,2261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,1381,3008
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,1381,3008
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,1381,3008
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,1381,3008


In [5]:
# Add the `user_id_mean_price` column: a mean over `price`.
# NOTE(review): the grouping key is chosen inside get_agg_mean - presumably
# `user_id`, judging by the alias; confirm in fe_modules.aggregates.
df = df.pipe(get_agg_mean, target_col="price", alias="user_id_mean_price")
# Re-run the memory-reduction helper now that a column has been added.
df = df.pipe(pandas_reduce_mem_usage)
df

Memory usage of dataframe is 925.06 MB


  0%|          | 0/15 [00:00<?, ?it/s]

Memory usage after optimization is: 925.06 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,user_id_group_count,user_id_request_sum,user_id_mean_price
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,1550,2261,20368.0
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,1550,2261,20368.0
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,1550,2261,20368.0
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,1550,2261,20368.0
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,1550,2261,20368.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,1381,3008,12544.0
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,1381,3008,12544.0
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,1381,3008,12544.0
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,1381,3008,12544.0


In [6]:
# Attach the top-3 mode values of `url_host`; in the recorded run they appear
# as the columns user_id_mode_0 .. user_id_mode_2.
# NOTE(review): presumably computed per user_id - confirm in fe_modules.aggregates.
df = df.pipe(get_top_n_mode, target_col="url_host", n=3)
# Re-run the memory-reduction helper now that columns have been added.
df = df.pipe(pandas_reduce_mem_usage)
df

Memory usage of dataframe is 1192.09 MB


  0%|          | 0/18 [00:00<?, ?it/s]

Memory usage after optimization is: 1192.09 MB
Decreased by 0.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,user_id_group_count,user_id_request_sum,user_id_mean_price,user_id_mode_0,user_id_mode_1,user_id_mode_2
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net


In [7]:
# Derive the integer `hour_int` column via part_of_day_to_hour; the recorded
# output shows e.g. morning -> 9, day -> 15, night -> 3.
df = df.pipe(part_of_day_to_hour, return_dtype="int", alias="hour_int")
# Re-run the memory-reduction helper now that a column has been added.
df = df.pipe(pandas_reduce_mem_usage)
df

Memory usage of dataframe is 1268.39 MB


  0%|          | 0/19 [00:00<?, ?it/s]

Memory usage after optimization is: 1201.63 MB
Decreased by 5.3%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,user_id_group_count,user_id_request_sum,user_id_mean_price,user_id_mode_0,user_id_mode_1,user_id_mode_2,hour_int
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,15
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3


In [8]:
%%time
df = mean_first_visit(df)
df = pandas_reduce_mem_usage(df)
df

Memory usage of dataframe is 1277.92 MB


  0%|          | 0/20 [00:00<?, ?it/s]

Memory usage after optimization is: 1239.78 MB
Decreased by 3.0%
CPU times: total: 28.6 s
Wall time: 28.9 s


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,user_id_group_count,user_id_request_sum,user_id_mean_price,user_id_mode_0,user_id_mode_1,user_id_mode_2,hour_int,mean_fv
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9,5.334194
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9,5.334194
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,15,4.759594
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,1,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594


In [9]:
%%time
df = mean_last_visit(df)
df = pandas_reduce_mem_usage(df)
df

Memory usage of dataframe is 1316.07 MB


  0%|          | 0/21 [00:00<?, ?it/s]

Memory usage after optimization is: 1277.92 MB
Decreased by 2.9%
CPU times: total: 29.1 s
Wall time: 29.2 s


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,user_id,user_id_group_count,user_id_request_sum,user_id_mean_price,user_id_mode_0,user_id_mode_1,user_id_mode_2,hour_int,mean_fv,mean_lv
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9,5.334194,20.593548
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9,5.334194,20.593548
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194,20.593548
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194,20.593548
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194,20.593548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594,21.000000
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,15,4.759594,21.000000
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594,21.000000
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594,21.000000


#### Checkpoint 2

In [10]:
# Checkpoint 2: persist the frame with the features added since checkpoint 1;
# the next cell reloads this same file.
df.to_parquet("../checkpoint_2.parquet")

In [4]:
# Reload the checkpoint-2 frame and pass it straight through the project's
# memory-reduction helper; the trailing expression displays the result.
df = pd.read_parquet("../checkpoint_2.parquet").pipe(pandas_reduce_mem_usage)
df

Memory usage of dataframe is 1277.92 MB


  0%|          | 0/21 [00:00<?, ?it/s]

Memory usage after optimization is: 1239.78 MB
Decreased by 3.0%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,user_id,user_id_group_count,user_id_request_sum,user_id_mean_price,user_id_mode_0,user_id_mode_1,user_id_mode_2,hour_int,mean_fv,mean_lv
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9,5.334194,20.593548
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,9,5.334194,20.593548
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194,20.593548
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194,20.593548
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,...,45098,1550,2261,20368.0,apple.com,i.ytimg.com,googleads.g.doubleclick.net,15,5.334194,20.593548
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,googleads.g.doubleclick.net,smartphone,Android,12544.0,2021-06-30,night,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594,21.000000
9999996,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-10,day,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,15,4.759594,21.000000
9999997,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,vk.com,smartphone,Android,12544.0,2021-07-31,night,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594,21.000000
9999998,Ростовская область,Таганрог,Xiaomi,Redmi Note 8T,yandex.ru,smartphone,Android,12544.0,2021-06-26,night,...,23799,1381,3008,12544.0,ad.mail.ru,yandex.ru,yastatic.net,3,4.759594,21.000000
