In [2]:
import gc
import sys
import os
import warnings

# Make the project's helper packages importable; each path is resolved against
# the current working directory and appended in this order.
for package_dir in ("modules/", "fe_modules/", "seq2seq_modules/"):
    sys.path.append(os.path.abspath(package_dir))

# Restrict OpenBLAS to one thread and silence every Python warning for the session.
os.environ.update({'OPENBLAS_NUM_THREADS': '1'})
warnings.filterwarnings(action='ignore')

In [3]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

In [4]:
import importlib

import modules
import fe_modules

# Re-execute the two project packages so on-disk edits are picked up without
# restarting the kernel.
# NOTE(review): importlib.reload() re-runs only the module object it is given;
# submodules such as modules.memory_utils are presumably NOT refreshed by these
# two calls — confirm, and reload the submodules explicitly if that is the intent.
importlib.reload(modules)
importlib.reload(fe_modules)

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat, my_reset, get_suitable_for_parquet
from modules.sanity_checks import null_check
from fe_modules.text_manipulation import get_domain
from fe_modules.datetime_features import get_timestamp, \
                                         get_relative_time, \
                                         part_of_day_to_hour, \
                                         add_hour_to_date, \
                                         get_year, \
                                         get_month, \
                                         get_day, \
                                         get_day_of_year, \
                                         get_day_of_week, \
                                         get_holiday_name
from fe_modules.preprocessing import clean_os_type
from fe_modules.missing import map_prices
from fe_modules.geo_features import map_cities, dist_to_large_cities

# Load data

In [4]:
# Folder containing the competition data, relative to the working directory.
LOCAL_DATA_PATH = "./data/"
# Seed constant; not referenced by any cell visible in this notebook section.
SPLIT_SEED = 42

In [5]:
# Read the raw competition parquet data and immediately run the memory reducer
# on it (the helper prints usage before and after).
competition_data_dir = f'{LOCAL_DATA_PATH}competition_data_final_pqt/'
df = pandas_reduce_mem_usage(pd.read_parquet(competition_data_dir))
df.head()

Memory usage of dataframe is 29562.33 MB


100%|███████████████████████████████████████████| 12/12 [00:03<00:00,  3.06it/s]

Memory usage after optimization is: 24943.21 MB
Decreased by 15.6%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098


# Feature engineering

## Cleaning

In [7]:
# NOTE(review): the recorded output of this cell is
# "TypeError: Series.map() got an unexpected keyword argument 'inplace'", so the
# call raised and the assignment below never took effect in that run.
# pandas' Series.map() has no `inplace` parameter; the fix presumably belongs in
# fe_modules.preprocessing.clean_os_type (assign the mapped Series back instead
# of passing inplace) — confirm against that module.
df = clean_os_type(df)
df.head()

TypeError: Series.map() got an unexpected keyword argument 'inplace'

## Feature generation

In [7]:
# Run the price-mapping helper, then re-run the memory reducer on its result.
df = map_prices(df)
df = pandas_reduce_mem_usage(df)
df.head()

Memory usage of dataframe is 27406.74 MB


100%|███████████████████████████████████████████| 12/12 [00:01<00:00, 10.31it/s]

Memory usage after optimization is: 27406.74 MB
Decreased by 0.0%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098


In [8]:
# Add the `year`, `month` and `day` columns (one helper per column), then shrink
# the frame's memory footprint again.
for add_date_part in (get_year, get_month, get_day):
    df = add_date_part(df)
df = pandas_reduce_mem_usage(df)
df.head()

Memory usage of dataframe is 34797.32 MB


100%|███████████████████████████████████████████| 15/15 [00:02<00:00,  6.75it/s]

Memory usage after optimization is: 28638.50 MB
Decreased by 17.7%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,year,month,day
0,Краснодарский край,Краснодар,Apple,iPhone 7,ad.adriver.ru,smartphone,iOS,20368.0,2022-06-15,morning,1,45098,2022,6,15
1,Краснодарский край,Краснодар,Apple,iPhone 7,apple.com,smartphone,iOS,20368.0,2022-06-19,morning,1,45098,2022,6,19
2,Краснодарский край,Краснодар,Apple,iPhone 7,avatars.mds.yandex.net,smartphone,iOS,20368.0,2022-06-12,day,1,45098,2022,6,12
3,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-16,day,1,45098,2022,5,16
4,Краснодарский край,Краснодар,Apple,iPhone 7,googleads.g.doubleclick.net,smartphone,iOS,20368.0,2022-05-30,day,1,45098,2022,5,30


In [9]:
# Columns to replace with integer codes (see the displayed frame and dtypes below).
stage_1_categorical_columns = [
    "region_name",
    "year",
    "cpe_manufacturer_name",
    "cpe_type_cd",
    "cpe_model_os_type",
]
df = pandas_string_to_cat(df, stage_1_categorical_columns)
df.head()

Memory usage of dataframe is 28638.50 MB
Memory usage of dataframe is 27.97 GB


100%|█████████████████████████████████████████████| 5/5 [00:55<00:00, 11.16s/it]

Memory usage after optimization is: 19708.22 MB
Decreased by 31.2%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,year,month,day
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,1,45098,1,6,15
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2022-06-19,morning,1,45098,1,6,19
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,1,45098,1,6,12
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,1,45098,1,5,16
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,1,45098,1,5,30


In [10]:
# Inspect per-column dtypes after the encoding and memory-reduction steps above.
df.dtypes

region_name                uint8
city_name                 object
cpe_manufacturer_name      uint8
cpe_model_name            object
url_host                  object
cpe_type_cd                uint8
cpe_model_os_type          uint8
price                    float32
date                      object
part_of_day               object
request_cnt                 int8
user_id                    int32
year                       uint8
month                       int8
day                         int8
dtype: object

In [11]:
# Checkpoint: write the current frame to disk as gzip-compressed parquet.
stage_1_path = "seq2seq_data/stages/stage_1.parquet.gzip"
df.to_parquet(stage_1_path, compression='gzip')

In [4]:
# Resume from the stage-1 checkpoint and pass it through the memory reducer.
stage_1_path = "seq2seq_data/stages/stage_1.parquet.gzip"
df = pandas_reduce_mem_usage(pd.read_parquet(stage_1_path))

Memory usage of dataframe is 19708.22 MB


100%|███████████████████████████████████████████| 15/15 [00:02<00:00,  6.57it/s]

Memory usage after optimization is: 19708.22 MB
Decreased by 0.0%





In [5]:
# Add the `day_of_year` and `day_of_week` columns, then shrink memory again.
for add_calendar_feature in (get_day_of_year, get_day_of_week):
    df = add_calendar_feature(df)
df = pandas_reduce_mem_usage(df)
df.head()

Memory usage of dataframe is 24635.27 MB


100%|███████████████████████████████████████████| 17/17 [00:02<00:00,  8.06it/s]

Memory usage after optimization is: 20632.04 MB
Decreased by 16.2%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,year,month,day,day_of_year,day_of_week
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,1,45098,1,6,15,166,2
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2022-06-19,morning,1,45098,1,6,19,170,6
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,1,45098,1,6,12,163,6
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,1,45098,1,5,16,136,0
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,1,45098,1,5,30,150,0


In [6]:
# Attach the `holiday` name column and re-run the memory reducer in one step.
df = pandas_reduce_mem_usage(get_holiday_name(df))
df.head()

Memory usage of dataframe is 23095.57 MB


100%|███████████████████████████████████████████| 18/18 [00:01<00:00, 10.70it/s]

Memory usage after optimization is: 23095.57 MB
Decreased by 0.0%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,year,month,day,day_of_year,day_of_week,holiday
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,1,45098,1,6,15,166,2,Не праздник
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2022-06-19,morning,1,45098,1,6,19,170,6,Не праздник
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,1,45098,1,6,12,163,6,День России
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,1,45098,1,5,16,136,0,Не праздник
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,1,45098,1,5,30,150,0,Не праздник


In [7]:
# Replace the textual `holiday` column with integer codes.
holiday_columns = ["holiday"]
df = pandas_string_to_cat(df, holiday_columns)

df.head()

Memory usage of dataframe is 23095.57 MB
Memory usage of dataframe is 22.55 GB


100%|█████████████████████████████████████████████| 1/1 [00:11<00:00, 11.93s/it]

Memory usage after optimization is: 20939.98 MB
Decreased by 9.3%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,request_cnt,user_id,year,month,day,day_of_year,day_of_week,holiday
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,1,45098,1,6,15,166,2,5
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2022-06-19,morning,1,45098,1,6,19,170,6,5
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,1,45098,1,6,12,163,6,1
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,1,45098,1,5,16,136,0,5
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,1,45098,1,5,30,150,0,5


In [9]:
# Checkpoint: write the current frame to disk as gzip-compressed parquet.
stage_2_path = "seq2seq_data/stages/stage_2.parquet.gzip"
df.to_parquet(stage_2_path, compression='gzip')

In [4]:
# Resume from the stage-2 checkpoint and pass it through the memory reducer.
stage_2_path = "seq2seq_data/stages/stage_2.parquet.gzip"
df = pandas_reduce_mem_usage(pd.read_parquet(stage_2_path))

Memory usage of dataframe is 20939.98 MB


100%|███████████████████████████████████████████| 18/18 [00:02<00:00,  6.58it/s]

Memory usage after optimization is: 20939.98 MB
Decreased by 0.0%





In [5]:
# Enrich rows with city-level attributes; the displayed frame gains
# capital_marker, timezone, geo_lat, geo_lon and population.
df = df.pipe(map_cities)
df.head()

Memory usage of dataframe is 0.05 MB


100%|███████████████████████████████████████████| 6/6 [00:00<00:00, 5337.40it/s]


Memory usage after optimization is: 0.03 MB
Decreased by 47.8%


Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,month,day,day_of_year,day_of_week,holiday,capital_marker,timezone,geo_lat,geo_lon,population
0,21,Краснодар,1,iPhone 7,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,...,6,15,166,2,5,2.0,3,45.040161,38.975964,744933
1,21,Краснодар,1,iPhone 7,apple.com,2,1,20368.0,2022-06-19,morning,...,6,19,170,6,5,2.0,3,45.040161,38.975964,744933
2,21,Краснодар,1,iPhone 7,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,...,6,12,163,6,1,2.0,3,45.040161,38.975964,744933
3,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,...,5,16,136,0,5,2.0,3,45.040161,38.975964,744933
4,21,Краснодар,1,iPhone 7,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,...,5,30,150,0,5,2.0,3,45.040161,38.975964,744933


In [6]:
# Replace the device-model and city name strings with integer codes.
model_and_city_columns = ["cpe_model_name", "city_name"]
df = pandas_string_to_cat(df, model_and_city_columns)
df.head()

Memory usage of dataframe is 26659.76 MB
Memory usage of dataframe is 26.03 GB


100%|█████████████████████████████████████████████| 2/2 [00:22<00:00, 11.39s/it]

Memory usage after optimization is: 22896.03 MB
Decreased by 14.1%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,month,day,day_of_year,day_of_week,holiday,capital_marker,timezone,geo_lat,geo_lon,population
0,21,409,1,589,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,...,6,15,166,2,5,2.0,3,45.040161,38.975964,744933
1,21,409,1,589,apple.com,2,1,20368.0,2022-06-19,morning,...,6,19,170,6,5,2.0,3,45.040161,38.975964,744933
2,21,409,1,589,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,...,6,12,163,6,1,2.0,3,45.040161,38.975964,744933
3,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,...,5,16,136,0,5,2.0,3,45.040161,38.975964,744933
4,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,...,5,30,150,0,5,2.0,3,45.040161,38.975964,744933


In [7]:
# Add the `domain` column derived from `url_host` (e.g. "ru", "com", "net" in
# the displayed rows).
df = df.pipe(get_domain)
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,day,day_of_year,day_of_week,holiday,capital_marker,timezone,geo_lat,geo_lon,population,domain
0,21,409,1,589,ad.adriver.ru,2,1,20368.0,2022-06-15,morning,...,15,166,2,5,2.0,3,45.040161,38.975964,744933,ru
1,21,409,1,589,apple.com,2,1,20368.0,2022-06-19,morning,...,19,170,6,5,2.0,3,45.040161,38.975964,744933,com
2,21,409,1,589,avatars.mds.yandex.net,2,1,20368.0,2022-06-12,day,...,12,163,6,1,2.0,3,45.040161,38.975964,744933,net
3,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,2022-05-16,day,...,16,136,0,5,2.0,3,45.040161,38.975964,744933,net
4,21,409,1,589,googleads.g.doubleclick.net,2,1,20368.0,2022-05-30,day,...,30,150,0,5,2.0,3,45.040161,38.975964,744933,net


In [8]:
# Replace the `domain` and `url_host` strings with integer codes.
url_columns = ["domain", "url_host"]
df = pandas_string_to_cat(df, url_columns)
df.head()

Memory usage of dataframe is 25405.19 MB
Memory usage of dataframe is 24.81 GB


100%|█████████████████████████████████████████████| 2/2 [00:23<00:00, 11.96s/it]

Memory usage after optimization is: 22268.74 MB
Decreased by 12.3%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,day,day_of_year,day_of_week,holiday,capital_marker,timezone,geo_lat,geo_lon,population,domain
0,21,409,1,589,5788,2,1,20368.0,2022-06-15,morning,...,15,166,2,5,2.0,3,45.040161,38.975964,744933,761
1,21,409,1,589,12900,2,1,20368.0,2022-06-19,morning,...,19,170,6,5,2.0,3,45.040161,38.975964,744933,549
2,21,409,1,589,17626,2,1,20368.0,2022-06-12,day,...,12,163,6,1,2.0,3,45.040161,38.975964,744933,712
3,21,409,1,589,59366,2,1,20368.0,2022-05-16,day,...,16,136,0,5,2.0,3,45.040161,38.975964,744933,712
4,21,409,1,589,59366,2,1,20368.0,2022-05-30,day,...,30,150,0,5,2.0,3,45.040161,38.975964,744933,712


In [9]:
# Add the numeric `timestamp` column, then re-run the memory reducer.
df = get_timestamp(df)
df = pandas_reduce_mem_usage(df)
df.head()

Memory usage of dataframe is 24777.90 MB


100%|███████████████████████████████████████████| 25/25 [00:05<00:00,  4.66it/s]

Memory usage after optimization is: 23523.32 MB
Decreased by 5.1%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,day_of_year,day_of_week,holiday,capital_marker,timezone,geo_lat,geo_lon,population,domain,timestamp
0,21,409,1,589,5788,2,1,20368.0,2022-06-15,morning,...,166,2,5,2.0,3,45.040161,38.975964,744933,761,165525120.0
1,21,409,1,589,12900,2,1,20368.0,2022-06-19,morning,...,170,6,5,2.0,3,45.040161,38.975964,744933,549,165559680.0
2,21,409,1,589,17626,2,1,20368.0,2022-06-12,day,...,163,6,1,2.0,3,45.040161,38.975964,744933,712,165499200.0
3,21,409,1,589,59366,2,1,20368.0,2022-05-16,day,...,136,0,5,2.0,3,45.040161,38.975964,744933,712,165265920.0
4,21,409,1,589,59366,2,1,20368.0,2022-05-30,day,...,150,0,5,2.0,3,45.040161,38.975964,744933,712,165386880.0


In [10]:
# Checkpoint: write the current frame to disk as gzip-compressed parquet.
stage_3_path = "seq2seq_data/stages/stage_3.parquet.gzip"
df.to_parquet(stage_3_path, compression='gzip')

In [5]:
# Resume from the stage-3 checkpoint and pass it through the memory reducer.
stage_3_path = "seq2seq_data/stages/stage_3.parquet.gzip"
df = pandas_reduce_mem_usage(pd.read_parquet(stage_3_path))

Memory usage of dataframe is 23523.32 MB


100%|███████████████████████████████████████████| 25/25 [00:07<00:00,  3.14it/s]

Memory usage after optimization is: 23523.32 MB
Decreased by 0.0%





In [6]:
# Translate `part_of_day` into an `hour` offset column (shown as timedeltas such
# as "0 days 09:00:00" in the output).
df = df.pipe(part_of_day_to_hour)
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,day_of_week,holiday,capital_marker,timezone,geo_lat,geo_lon,population,domain,timestamp,hour
0,21,409,1,589,5788,2,1,20368.0,2022-06-15,morning,...,2,5,2.0,3,45.040161,38.975964,744933,761,165525120.0,0 days 09:00:00
1,21,409,1,589,12900,2,1,20368.0,2022-06-19,morning,...,6,5,2.0,3,45.040161,38.975964,744933,549,165559680.0,0 days 09:00:00
2,21,409,1,589,17626,2,1,20368.0,2022-06-12,day,...,6,1,2.0,3,45.040161,38.975964,744933,712,165499200.0,0 days 15:00:00
3,21,409,1,589,59366,2,1,20368.0,2022-05-16,day,...,0,5,2.0,3,45.040161,38.975964,744933,712,165265920.0,0 days 15:00:00
4,21,409,1,589,59366,2,1,20368.0,2022-05-30,day,...,0,5,2.0,3,45.040161,38.975964,744933,712,165386880.0,0 days 15:00:00


In [7]:
# Build the `datetime` column; in the displayed rows it equals `date` plus `hour`.
df = df.pipe(add_hour_to_date)
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,holiday,capital_marker,timezone,geo_lat,geo_lon,population,domain,timestamp,hour,datetime
0,21,409,1,589,5788,2,1,20368.0,2022-06-15,morning,...,5,2.0,3,45.040161,38.975964,744933,761,165525120.0,0 days 09:00:00,2022-06-15 09:00:00
1,21,409,1,589,12900,2,1,20368.0,2022-06-19,morning,...,5,2.0,3,45.040161,38.975964,744933,549,165559680.0,0 days 09:00:00,2022-06-19 09:00:00
2,21,409,1,589,17626,2,1,20368.0,2022-06-12,day,...,1,2.0,3,45.040161,38.975964,744933,712,165499200.0,0 days 15:00:00,2022-06-12 15:00:00
3,21,409,1,589,59366,2,1,20368.0,2022-05-16,day,...,5,2.0,3,45.040161,38.975964,744933,712,165265920.0,0 days 15:00:00,2022-05-16 15:00:00
4,21,409,1,589,59366,2,1,20368.0,2022-05-30,day,...,5,2.0,3,45.040161,38.975964,744933,712,165386880.0,0 days 15:00:00,2022-05-30 15:00:00


In [8]:
# Add the `relative_date` column, requesting the "timestamp" return dtype.
df = df.pipe(get_relative_time, return_dtype="timestamp")
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,capital_marker,timezone,geo_lat,geo_lon,population,domain,timestamp,hour,datetime,relative_date
0,21,409,1,589,5788,2,1,20368.0,2022-06-15,morning,...,2.0,3,45.040161,38.975964,744933,761,165525120.0,0 days 09:00:00,2022-06-15 09:00:00,3888.0
1,21,409,1,589,12900,2,1,20368.0,2022-06-19,morning,...,2.0,3,45.040161,38.975964,744933,549,165559680.0,0 days 09:00:00,2022-06-19 09:00:00,432.0
2,21,409,1,589,17626,2,1,20368.0,2022-06-12,day,...,2.0,3,45.040161,38.975964,744933,712,165499200.0,0 days 15:00:00,2022-06-12 15:00:00,6264.0
3,21,409,1,589,59366,2,1,20368.0,2022-05-16,day,...,2.0,3,45.040161,38.975964,744933,712,165265920.0,0 days 15:00:00,2022-05-16 15:00:00,29592.0
4,21,409,1,589,59366,2,1,20368.0,2022-05-30,day,...,2.0,3,45.040161,38.975964,744933,712,165386880.0,0 days 15:00:00,2022-05-30 15:00:00,17496.0


In [9]:
# Replace the textual `part_of_day` column with integer codes.
part_of_day_columns = ["part_of_day"]
df = pandas_string_to_cat(df, part_of_day_columns)
df.head()

Memory usage of dataframe is 31050.78 MB
Memory usage of dataframe is 30.32 GB


100%|█████████████████████████████████████████████| 1/1 [00:09<00:00,  9.96s/it]

Memory usage after optimization is: 28855.27 MB
Decreased by 7.1%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,date,part_of_day,...,capital_marker,timezone,geo_lat,geo_lon,population,domain,timestamp,hour,datetime,relative_date
0,21,409,1,589,5788,2,1,20368.0,2022-06-15,2,...,2.0,3,45.040161,38.975964,744933,761,165525120.0,0 days 09:00:00,2022-06-15 09:00:00,3888.0
1,21,409,1,589,12900,2,1,20368.0,2022-06-19,2,...,2.0,3,45.040161,38.975964,744933,549,165559680.0,0 days 09:00:00,2022-06-19 09:00:00,432.0
2,21,409,1,589,17626,2,1,20368.0,2022-06-12,0,...,2.0,3,45.040161,38.975964,744933,712,165499200.0,0 days 15:00:00,2022-06-12 15:00:00,6264.0
3,21,409,1,589,59366,2,1,20368.0,2022-05-16,0,...,2.0,3,45.040161,38.975964,744933,712,165265920.0,0 days 15:00:00,2022-05-16 15:00:00,29592.0
4,21,409,1,589,59366,2,1,20368.0,2022-05-30,0,...,2.0,3,45.040161,38.975964,744933,712,165386880.0,0 days 15:00:00,2022-05-30 15:00:00,17496.0


In [10]:
# Remove the raw and intermediate time columns; `timestamp` and `relative_date`
# remain in the frame.
intermediate_time_columns = ["date", "hour", "datetime"]
df = df.drop(columns=intermediate_time_columns)
df.head()

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,...,day_of_week,holiday,capital_marker,timezone,geo_lat,geo_lon,population,domain,timestamp,relative_date
0,21,409,1,589,5788,2,1,20368.0,2,1,...,2,5,2.0,3,45.040161,38.975964,744933,761,165525120.0,3888.0
1,21,409,1,589,12900,2,1,20368.0,2,1,...,6,5,2.0,3,45.040161,38.975964,744933,549,165559680.0,432.0
2,21,409,1,589,17626,2,1,20368.0,0,1,...,6,1,2.0,3,45.040161,38.975964,744933,712,165499200.0,6264.0
3,21,409,1,589,59366,2,1,20368.0,0,1,...,0,5,2.0,3,45.040161,38.975964,744933,712,165265920.0,29592.0
4,21,409,1,589,59366,2,1,20368.0,0,1,...,0,5,2.0,3,45.040161,38.975964,744933,712,165386880.0,17496.0


## Save

In [11]:
# Final checkpoint: write the fully engineered frame as gzip-compressed parquet.
stage_4_path = "seq2seq_data/stages/stage_4.parquet.gzip"
df.to_parquet(stage_4_path, compression='gzip')

In [21]:
# Report (rows, columns) of the final frame.
# NOTE(review): the recorded output shows 14 columns, which does not match the
# frame assembled by the cells above (their previews list considerably more
# columns); the output presumably comes from a different run — re-execute to confirm.
df.shape

(322899435, 14)