# Train a PyTorch Regression Model on Mercari Data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd

# Raise the pandas display limits (rows, columns, width) to 1000 each.
for display_option in ("display.max_rows", "display.max_columns", "display.width"):
    pd.set_option(display_option, 1000)

In [3]:
#keras tokenizer
from keras.preprocessing import text
from keras.preprocessing import sequence # for import pad_sequences

In [4]:
#pytorch tokenizer
import torch
import torch.nn as nn
import torchtext

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
from pytorch_lightning.callbacks import ModelSummary, LearningRateMonitor

from sklearn import model_selection

In [5]:
import joblib

In [6]:
from IPython.core.debugger import set_trace

In [7]:
import socket
import re

In [8]:
import os
# Run CUDA kernel launches synchronously.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# cuBLAS only gives reproducible results with a fixed workspace configuration:
# https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility
# The variable must be spelled CUBLAS_WORKSPACE_CONFIG; the previous name
# ("UBLAS_WORKSPACE_CONFIG") was a typo and therefore had no effect.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

In [9]:
import gc
import random
import numpy as np

is_cuda = torch.cuda.is_available()

# The availability flag alone decides which device is used.
device = torch.device('cuda' if is_cuda else 'cpu')

if is_cuda:
    # Report the flag, the current device index, the device count and the name of GPU 0.
    print(
        is_cuda,
        torch.cuda.current_device(),
        torch.cuda.device_count(),
        torch.cuda.get_device_name(0),
        sep="\n",
    )

print('Using device:', device)

SEED = 1234

# Seed every random number generator used in this notebook with the same value.
for seed_function in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_function(SEED)

torch.backends.cudnn.deterministic = True


True
0
1
Quadro RTX 4000
Using device: cuda


In [10]:
import sys
# Add ./prediction/ to the module search path so the imports below can be resolved.
sys.path.append("./prediction/")

# Project modules (presumably located in ./prediction/ - confirm): dataset
# construction, the model definition and shared helpers.
import pytorch_dataset as pytorch_dataset
import pytorch_model as pytorch_model
import utils as utils

In [11]:
# Directory that receives the model checkpoints and the joblib data dump.
model_dump_path = "./model"

In [12]:
# Load the preprocessed training data.
train = pd.read_parquet("./data/processed/train_processed.parquet")
print(f"train len: {len(train)}")

#test = pd.read_parquet("./data/processed/test_processed.parquet")
#print(f"test len: {len(test)}")

#create dummy variable price for test set
#test["price"] = train["price"].min()


# Move 'price' to the last column position, then expose the row identifier as 'id'.
price_values = train.pop('price')
train.insert(len(train.columns), 'price', price_values)
train = train.rename(columns={"train_id": "id"})

#test.insert(len(test.columns)-1, 'price', test.pop('price'))
#test.rename(columns = {"test_id": "id"}, inplace=True)

# Log-scaled price: log(1 + price).
train["price_log"] = np.log1p(train["price"])
#test["price_log"] = np.log1p(test["price"])

train len: 1481661


## Sample From Train

In [13]:
#train = train.sample(frac = 0.1)
#train.shape

# Mercari Price Prediction with PyTorch

In [14]:
import category_encoders as ce
import feature_engine.encoding as fe
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer

from sklearn.pipeline import Pipeline

import sklearn.metrics as metrics

In [15]:
# Target column and a flag recording that it is on a log scale.
target = "price_log"
is_target_log = True

# Feature groups by type; the word-level and character-level inputs use the same two columns.
columns_numerical = ["shipping", "desc_len", "name_len", "is_brand_missing", "is_item_description_missing"]
columns_categorical = ["item_condition_id", "brand_name", "subcategory_1", "subcategory_2", "subcategory_3"]
columns_text = ["name", "item_description"]
columns_char = ["name", "item_description"]

# Numerical encoder: keep only the numerical columns, then standardize them.
numerical_selector = ColumnTransformer(
    [("selector", "passthrough", columns_numerical)],
    remainder="drop",
)
encoder_numerical = Pipeline([
    ("selector", numerical_selector),
    ("normalizer", StandardScaler()),
])

In [16]:
# Cast every numerical feature column that is not already float32.
for column_name in columns_numerical:
    print(column_name)
    if train[column_name].dtype == np.float32:
        continue
    print(f"converting {column_name} to float32")
    train[column_name] = train[column_name].astype(np.float32)
        
        
#for num in columns_numerical:
#    print(num)
#    if test[num].dtype != np.float32:
#        print(f"converting {num} to float32")
#        test[num] = test[num].astype(np.float32)

shipping
converting shipping to float32
desc_len
converting desc_len to float32
name_len
converting name_len to float32
is_brand_missing
converting is_brand_missing to float32
is_item_description_missing
converting is_item_description_missing to float32


## PyTorch Lightning

In [17]:
# Hold out 20% of the rows for validation.
X_train, X_validation, y_train, y_validation = model_selection.train_test_split(
    train,
    train[target].values,
    test_size=0.20,
    random_state=42,
)

# Encoders and column groups handed to the project's dataset builder.
dataset_options = dict(
    encoder_numerical=encoder_numerical,
    categorical_names=columns_categorical,
    text_names=columns_text,
    char_names=columns_char,
    target_name=target,
    encoder_target=PowerTransformer(method="box-cox"),
    verbose=True,
)
dd_train, dd_validation = pytorch_dataset.build_pytorch_dataset(X_train, X_validation, **dataset_options)

target: price_log
train: (1185328, 16)
test: (296333, 16)
train set mode
=> target encoding
=> numerical encoding
=> categorical encoding
=> tokenizing name
==> name vocabulary size 104570 
=> tokenizing item_description
==> item_description vocabulary size 184473 
=> tokenizing chars name
==> name vocabulary size 455 
=> tokenizing chars item_description
==> item_description vocabulary size 995 
target min, max range (-2.7831687470788093, 4.21176858471614)
test set mode
=> target encoding
=> numerical encoding
=> categorical encoding
name vocabulary size 104570
item_description vocabulary size 184473
name vocabulary size 455
item_description vocabulary size 995
target min, max range (-2.7831687470788093, 4.21270471801917)


In [18]:
# Number of samples in the training dataset.
len(dd_train)

1185328

In [19]:
# Number of samples in the validation dataset.
len(dd_validation)

296333

In [20]:
batch_size = 1024

# Both loaders share the batch size and collate function; only the training loader shuffles.
loader_options = dict(batch_size=batch_size, collate_fn=pytorch_dataset.pytorch_collate_fn)
train_loader = DataLoader(dd_train, shuffle=True, **loader_options)
validation_loader = DataLoader(dd_validation, shuffle=False, **loader_options)

In [21]:
# Run garbage collection and release cached GPU memory before building the model.
gc.collect()
torch.cuda.empty_cache()

metric_to_monitor = "rmsle"

# Target handling, optimization and logging.
training_options = dict(
    target_encoder=dd_train.get_encoder_target(),
    is_target_log=is_target_log,
    optimizer="Adam",
    metric_to_monitor=metric_to_monitor,
    loss_function=nn.MSELoss(),
    learning_rate=0.001,
    verbose=True,
)

# Numerical inputs.
numerical_options = dict(
    numerical_input_size=dd_train.get_data_numerical().shape[1],
    numerical_batch_normalization=True,
)

# Categorical embeddings.
categorical_options = dict(
    categorical_embedding_size=dd_train.get_data_categorical_embedding_sizes(),
    categorical_embedding_dropout=0.4,
)

# Word-level text inputs.
text_options = dict(
    text_as_embedding_bag=False,
    text_as_embedding_bag_mode="mean",
    text_vocabulary_size=dd_train.get_text_vocabulary_size(),
    text_embedding_dimension=50,
    text_bidirectional=True,
    text_recurrent_hidden_size=100,
    text_recurrent_layers=2,
    text_rnn="GRU",
)

# Character-level text inputs.
char_options = dict(
    char_vocabulary_size=dd_train.get_char_vocabulary_size(),
    char_embedding_dimension=40,
    char_bidirectional=False,
    char_recurrent_hidden_size=50,
    char_recurrent_layers=1,
    char_rnn="LSTM",
)

# Fully connected layers.
head_options = dict(
    linear_layer_skip_connections=(3, ([1024], [0.3])),
    linear_layers=([512], [0.2]),
    linear_layer_normalization="BatchNorm1d",
    normalization_before_activation=True,
    linear_layer_activation=nn.ReLU(inplace=True),
    final_linear_layer=True,
    final_normalization=False,
)

model = pytorch_model.PytorchModel(
    **training_options,
    **numerical_options,
    **categorical_options,
    **text_options,
    **char_options,
    **head_options,
)
model

"categorical_embedding_dropout":   0.4
"categorical_embedding_size":      [(6, 4), (4522, 178), (12, 6), (115, 23), (866, 71)]
"char_bidirectional":              False
"char_embedding_dimension":        40
"char_recurrent_hidden_size":      50
"char_recurrent_layers":           1
"char_rnn":                        LSTM
"char_vocabulary_size":            {'name': 455, 'item_description': 995}
"final_linear_layer":              True
"final_normalization":             False
"is_target_log":                   True
"learning_rate":                   0.001
"linear_layer_activation":         ReLU(inplace=True)
"linear_layer_normalization":      BatchNorm1d
"linear_layer_skip_connections":   (3, ([1024], [0.3]))
"linear_layers":                   ([512], [0.2])
"loss_function":                   MSELoss()
"metric_to_monitor":               rmsle
"normalization_before_activation": True
"numerical_batch_normalization":   True
"numerical_input_size":            5
"optimizer":                     

PytorchModel(
  (metric): MeanSquaredError()
  (loss_function): MSELoss()
  (embeds): ModuleList(
    (0): Embedding(6, 4, padding_idx=0)
    (1): Embedding(4522, 178, padding_idx=0)
    (2): Embedding(12, 6, padding_idx=0)
    (3): Embedding(115, 23, padding_idx=0)
    (4): Embedding(866, 71, padding_idx=0)
  )
  (categorical_dropout): Dropout(p=0.4, inplace=False)
  (batch_normalization_numerical): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (text_embeddings): ModuleList(
    (0): TextRecurrentLayer(
      (embedding): Embedding(104570, 50, padding_idx=0)
      (rnn): GRU(50, 100, num_layers=2, batch_first=True, bidirectional=True)
      (last_time_step): LastTimeStep()
    )
    (1): TextRecurrentLayer(
      (embedding): Embedding(184473, 50, padding_idx=0)
      (rnn): GRU(50, 100, num_layers=2, batch_first=True, bidirectional=True)
      (last_time_step): LastTimeStep()
    )
  )
  (char_embeddings): ModuleList(
    (0): TextRecurrentLayer(
  

In [22]:
# Stop training when the monitored metric has not improved for `patience`
# consecutive validation checks.
early_stop_callback = EarlyStopping(monitor=metric_to_monitor, min_delta=0.00, patience=5, mode="min", verbose = True)

# Checkpoint on the lowest value of the monitored metric. mode="min" is the
# ModelCheckpoint default and is spelled out to match the early-stopping callback.
checkpoint_filename = "epoch{epoch:02d}-loss{loss:.2f}-val_loss{val_loss:.2f}-rmsle{rmsle:.3f}"
model_checkpoint_callback = ModelCheckpoint(monitor=metric_to_monitor, filename=checkpoint_filename, 
                                            auto_insert_metric_name=False, 
                                            dirpath=model_dump_path, 
                                            save_weights_only = False,
                                            mode = "min",
                                            verbose = True)


epochs = 10
enable_model_summary = False

print(f"epochs: {epochs}")

#limit_train_batches=0.1
# early_stop_callback used to be created but never registered, so early stopping
# never ran; it is now passed to the Trainer together with the checkpoint callback.
trainer = pl.Trainer(precision=16, 
                     auto_select_gpus = True, 
                     gpus=1, 
                     enable_checkpointing = True, 
                     check_val_every_n_epoch  = 1, 
                     max_epochs=epochs, 
                     enable_model_summary = enable_model_summary, 
                     default_root_dir = "./", 
                     enable_progress_bar = True, 
                     deterministic = False, 
                     callbacks=[model_checkpoint_callback, early_stop_callback])
 


epochs: 10


Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


### Fit model in Lightning

In [None]:
# Train the model on the training loader, validating on the validation loader.
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=validation_loader)

## Save Train Dataset

In [24]:
# File name of the best checkpoint, and the same name with its extension removed.
best_checkpoint_path = model_checkpoint_callback.best_model_path
model_checkpoint = os.path.basename(best_checkpoint_path)
model_path, checkpoint_extension = os.path.splitext(model_checkpoint)

In [25]:
# Persist the data splits, the training dataset object and the checkpoint file name.
dump_contents = {
    'train_set': X_train,
    'dd_train': dd_train,
    'validation_set': X_validation,
    'model': model_checkpoint,
}
dump_file = os.path.join(model_dump_path, f"data_dump_{model_path}.dump")
joblib.dump(dump_contents, filename=dump_file)

['./model\\data_dump_epoch07-loss0.24-val_loss0.34-rmsle0.429.dump']

### Predict

In [28]:
# Row count of the full training frame.
train.shape[0]

1481661

In [29]:
# Preview the first five rows.
train.head(n=5)

Unnamed: 0,id,name,item_condition_id,category_name,brand_name,shipping,item_description,desc_len,name_len,subcategory_1,subcategory_2,subcategory_3,is_brand_missing,is_item_description_missing,price,price_log
0,0,mlb cincinnati reds t shirt size xl,3,Men/Tops/T-shirts,missing,1.0,missing,0.0,7.0,men,tops,t-shirts,1.0,1.0,10.0,2.397895
1,1,razer blackwidow chroma keyboard,3,Electronics/Computers & Tablets/Components & P...,razer,0.0,this keyboard is in great condition and works ...,36.0,4.0,electronics,computers & tablets,components & parts,0.0,0.0,52.0,3.970292
2,2,ava-viv blouse,1,Women/Tops & Blouses/Blouse,target,1.0,adorable top with a hint of lace and a key hol...,29.0,2.0,women,tops & blouses,blouse,0.0,0.0,10.0,2.397895
3,3,leather horse statues,1,Home/Home Décor/Home Décor Accents,missing,1.0,new with tags. leather horses. retail for [rm]...,32.0,3.0,home,home décor,home décor accents,1.0,0.0,35.0,3.583519
4,4,24k gold plated rose,1,Women/Jewelry/Necklaces,missing,0.0,complete with certificate of authenticity,5.0,4.0,women,jewelry,necklaces,1.0,0.0,44.0,3.806662


In [31]:
# Distinct brand names in the training frame.
train["brand_name"].unique()

array(['missing', 'razer', 'target', ..., 'astroglide', 'cumberland bay',
       'kids only'], dtype=object)

In [32]:
# Number of distinct brand names.
train["brand_name"].nunique()

4806