# seq2seq

In [3]:
import gc
import sys
import os
import warnings
from tqdm import tqdm

# Put the project's helper-package directories on the import path
# (resolved against the current working directory).
sys.path.extend(
    os.path.abspath(package_dir)
    for package_dir in ("modules/", "fe_modules/", "seq2seq_modules/")
)

# Limit OpenBLAS to a single thread. Keep this ahead of the numeric-library
# imports in the next cell so the setting is in place when OpenBLAS loads.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Register tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()

In [4]:
import pandas as pd
import numpy as np
import time
import polars as pl
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight

import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn

from transformers import get_constant_schedule

In [5]:
import importlib

import modules
import fe_modules
import seq2seq_modules

# Re-execute each helper package so edits made on disk are picked up without
# restarting the kernel.
# NOTE(review): importlib.reload() on a package re-runs only the package module
# itself; submodules that are already imported are not reloaded, so the
# `from ... import` lines below may still bind stale objects after an edit.
# Consider `%autoreload 2` if that matters — confirm the intended workflow.
for _package in (modules, fe_modules, seq2seq_modules):
    importlib.reload(_package)
del _package  # do not leave the loop variable in the notebook namespace

from modules.memory_utils import pandas_reduce_mem_usage, pandas_string_to_cat, my_reset
from seq2seq_modules.preprocessing import PandasPreprocessor
from seq2seq_modules.models import LSTMModel
from seq2seq_modules.weight_initialization import weights_init_uniform_rule
from seq2seq_modules.loops import cross_validation, single_model_training
from seq2seq_modules.utils import age_bucket
from seq2seq_modules.metrics import AGE_METRIC

## Read and process

In [6]:
# Notebook-wide configuration constants.

# Random seed; presumably feeds the train/validation split further down —
# TODO confirm against the cells that use it.
SPLIT_SEED: int = 42

# Relative directory for local data files. NOTE(review): the parquet load
# below reads from "seq2seq_data/" directly and does not use this constant.
LOCAL_DATA_PATH: str = './data/'

In [7]:
# Column-name groups for the dataset loaded below.

# Identifier column(s).
ids = ["user_id"]

# Columns treated as categorical. In the frame preview these all hold
# integer codes.
cat_features = [
    "region_name", "city_name",
    "cpe_manufacturer_name", "cpe_model_name",
    "url_host",
    "cpe_type_cd", "cpe_model_os_type",
    "part_of_day", "request_cnt",
    "domain",
]

# Columns treated as continuous (floats in the frame preview).
# NOTE(review): the variable name is misspelled ("continous"); it is kept
# unchanged here because later cells may reference it.
continous_features = ["price", "timestamp", "relative_date"]

In [6]:
# Load the prepared dataset and pass it straight through the project's
# memory-reduction helper. The read is deliberately nested inside the call so
# that no extra name keeps the unreduced frame alive (the recorded run reports
# roughly 13.5 GB before reduction).
df = pandas_reduce_mem_usage(pd.read_parquet("seq2seq_data/version_1.parquet.gzip"))
# Last expression of the cell: render the first rows as a sanity check.
df.head()

Memory usage of dataframe is 13549.40 MB


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:03<00:00,  3.91it/s]

Memory usage after optimization is: 12317.64 MB
Decreased by 9.1%





Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,price,part_of_day,request_cnt,user_id,domain,timestamp,relative_date
0,21,409,1,589,5788,2,1,20368.0,2,1,45098,761,165525120.0,3888.0
1,21,409,1,589,12900,2,1,20368.0,2,1,45098,549,165559680.0,432.0
2,21,409,1,589,17626,2,1,20368.0,0,1,45098,712,165499200.0,6264.0
3,21,409,1,589,59366,2,1,20368.0,0,1,45098,712,165265920.0,29592.0
4,21,409,1,589,59366,2,1,20368.0,0,1,45098,712,165386880.0,17496.0
