In [1]:
import sys
sys.path.append("../../..")

In [3]:
import torch
import numpy as np

from dltranz.trx_encoder import PaddedBatch
from dltranz.seq_encoder.statistics_encoder import StatisticsEncoder


def get_data():
    payload = {'amount': torch.arange(4*10).view(4, 10).float(),
               'event_time': torch.arange(4*10).view(4, 10).float(),
               'mcc_code': torch.arange(4*10).view(4, 10),
               'tr_type': torch.arange(4*10).view(4, 10)
              }
    return PaddedBatch(
                       payload=payload,
                       length=torch.tensor([4, 2, 6, 8])
                      )

def test_shape():
    config = {
        'top_negative_trx': list(range(0, 20)),
        'top_positive_trx': list(range(20, 40)),       
        'category_names': ['mcc_code', 'tr_type'],
              'numeric_values': ['amount'],
              'category_max_size' : {
                  'mcc_code': 200,
                  'tr_type': 100
              }
    }

    eps = 1e-2

    model = StatisticsEncoder(config)

    x = get_data()

    out = model(x)
    
    assert isinstance(out, tuple) and len(out) == 4
    assert (abs(out[0] -  torch.Tensor([[-16.1181],
                                        [-16.1181],
                                        [-16.1181],
                                        [-16.1181]])) < torch.zeros((4, 1)) + eps).all()
    assert out[1].shape == torch.Size([4, 6]) and out[1][1][4] == 0 and out[1][3][1] == 0
    assert out[2].shape == torch.Size([4, 1]) and abs(out[2][0].item() - 3.3030) < eps
    assert out[3].shape == torch.Size([4, 6]) and abs(out[3][2][3].item() - 0.0858) < eps

test_shape()

(tensor([[-16.1181],
        [-16.1181],
        [-16.1181],
        [-16.1181]]), tensor([[nan, inf, inf, inf, 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.]], dtype=torch.float64), tensor([[ 3.3030],
        [11.3132],
        [25.4562],
        [37.4583]], dtype=torch.float64), tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000],
        [0.0043, 0.0116, 0.0315, 0.0858, 0.2331, 0.6337],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000]], dtype=torch.float64))
4
torch.Size([4, 1])
torch.Size([4, 6])
torch.Size([4, 1])
torch.Size([4, 6])


In [7]:
# eps = 1e-7

# prediction = (torch.tensor([[np.log(10 + 1)]]), 
#               torch.tensor([[100., 0., 0., 0., 0., 0.]]),
#               torch.tensor([[0]]),
#               torch.tensor([[0., 100., 0., 0., 0., 0.]]))

# label = np.array([[10,
#                    list([1., 0., 0., 0., 0., 0.]),
#                    0,
#                    list([0., 1., 0., 0., 0., 0.])]])

# loss = DistributionTargetsLoss()
# out = loss(prediction, label)
# assert abs(out.item() - 0.) < eps
# assert type(out) is torch.Tensor

In [17]:
eps = 1e-7

params = {'head_layers':
             {'CombinedTargetHeadFromRnn':
                 {'in_size': 48,
                  'num_distr_classes': 6,
                  'pos': True,
                  'neg': True,
                  'use_gates': True,
                  'pass_samples': True
                 }
             }
         }

prediction = {'neg_sum': torch.tensor([[-1.]]),
              'neg_distribution': torch.tensor([[1., 0., 0., 0., 0., 0.]]),
              'pos_sum': torch.tensor([[ 1.]]),
              'pos_distribution': torch.tensor([[0., 1., 0., 0., 0., 0.]])}


label = {'neg_sum': np.array([[-1.]]),
         'neg_distribution': np.array([[1., 0., 0., 0., 0., 0.]]),
         'pos_sum': np.array([[ 1.]]),
         'pos_distribution': np.array([[0., 1., 0., 0., 0., 0.]])}

loss = DistributionTargetsLoss()
out = loss(prediction, label)

assert abs(out.item() - 10.703149795532227) < eps
assert type(out) is torch.Tensor

In [63]:
import torch
import numpy as np
from dltranz.loss import DistributionTargetsLoss


def test_best_loss():
    eps = 1e-7

    prediction = {'neg_sum': torch.tensor([[np.log(10 + 1)]]), 
                  'neg_distribution': torch.tensor([[100., 0., 0., 0., 0., 0.]]),
                  'pos_sum': torch.tensor([[0]]),
                  'pos_distribution': torch.tensor([[0., 100., 0., 0., 0., 0.]])}

    label = {'neg_sum': np.array([[10]]),
             'neg_distribution': np.array([[1., 0., 0., 0., 0., 0.]]),
             'pos_sum': np.array([[0]]),
             'pos_distribution': np.array([[0., 1., 0., 0., 0., 0.]])}

    loss = DistributionTargetsLoss()
    out = loss(prediction, label)
    assert abs(out.item() - 0.) < eps
    assert type(out) is torch.Tensor


def test_loss_300():
    eps = 1e-7
    
    prediction = {'neg_sum': torch.tensor([[10]]), 
                  'neg_distribution': torch.tensor([[100., 0., 0., 0., 0., 0.]]),
                  'pos_sum': torch.tensor([[0]]),
                  'pos_distribution': torch.tensor([[0., 100., 0., 0., 0., 0.]])}

    label = {'neg_sum': np.array([[0]]),
             'neg_distribution': np.array([[1., 0., 0., 0., 0., 0.]]),
             'pos_sum': np.array([[0]]),
             'pos_distribution': np.array([[0., 1., 0., 0., 0., 0.]])}

    loss = DistributionTargetsLoss()
    out = loss(prediction, label)
    assert abs(out.item() - 300.) < eps
    assert type(out) is torch.Tensor

    
def test_usual_loss_first():
    eps = 1e-7

    prediction = {'neg_sum': torch.tensor([[-1.]]), 
                  'neg_distribution': torch.tensor([[0.1, 0.2, 0.1, 0.1, 0.3, 0.2]]),
                  'pos_sum': torch.tensor([[ 1.]]),
                  'pos_distribution': torch.tensor([[0.1, 0.2, 0.1, 0.1, 0.3, 0.2]])}

    label = {'neg_sum': np.array([[-1.]]),
             'neg_distribution': np.array([[0.1, 0.2, 0.1, 0.1, 0.3, 0.2]]),
             'pos_sum': np.array([[1.]]),
             'pos_distribution': np.array([[0.1, 0.2, 0.1, 0.1, 0.3, 0.2]])}

    loss = DistributionTargetsLoss()
    out = loss(prediction, label)

    assert abs(out.item() - 12.138458251953125) < eps
    assert type(out) is torch.Tensor


def test_usual_loss_second():
    eps = 1e-7
    
    prediction = {'neg_sum': torch.tensor([[-1.]]), 
                  'neg_distribution': torch.tensor([[0.1, 0.2, 0.1, 0.1, 0.3, 0.2]]),
                  'pos_sum': torch.tensor([[ 1.]]),
                  'pos_distribution': torch.tensor([[0.3, 0.5, 0., 0.1, 0.1, 0.0]])}

    label = {'neg_sum': np.array([[-10.]]),
             'neg_distribution': np.array([[0.5, 0.5, 0.0, 0.0, 0.0, 0.0]]),
             'pos_sum': np.array([[8.]]),
             'pos_distribution': np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.5]])}

    loss = DistributionTargetsLoss()
    out = loss(prediction, label)

    assert abs(out.item() - 38.563011169433594) < eps
    assert type(out) is torch.Tensor

    
def test_one_class():
    eps = 1e-7

    prediction = {'neg_sum': torch.tensor([[-1.]]), 
                  'neg_distribution': torch.tensor([[1., 0., 0., 0., 0., 0.]]),
                  'pos_sum': torch.tensor([[ 1.]]),
                  'pos_distribution': torch.tensor([[0., 1., 0., 0., 0., 0.]])}

    label = {'neg_sum': np.array([[-1.]]),
             'neg_distribution': np.array([[1., 0., 0., 0., 0., 0.]]),
             'pos_sum': np.array([[1.]]),
             'pos_distribution': np.array([[0., 1., 0., 0., 0., 0.]])}

    loss = DistributionTargetsLoss()
    out = loss(prediction, label)

    assert abs(out.item() - 10.703149795532227) < eps
    assert type(out) is torch.Tensor

In [64]:
test_one_class()
test_usual_loss_second()
test_usual_loss_first()
test_loss_300()
test_best_loss()

In [4]:
from dltranz.custom_layers import DropoutEncoder, Squeeze, CatLayer, MLP, TabularRowEncoder, CombinedTargetHeadFromRnn
distr_target_head_config = {
    "in_size": 48,
    "num_distr_classes": 6
}

distr_target_head = CombinedTargetHeadFromRnn(**distr_target_head_config)
x = torch.rand(64, 48)
x = (x, np.ones((64,)), np.ones((64,)))
y = distr_target_head(x)
assert type(y) == tuple and len(y) == 4
assert y[0].shape == y[2].shape == (64, 1) and y[1].shape == y[3].shape == (64, 6)

In [120]:
import torch
import numpy as np

from dltranz.trx_encoder import PaddedBatch
from dltranz.seq_encoder.rnn_encoder import RnnSeqEncoderDistributionTarget


def get_data():
    payload = {'amount': torch.arange(4*10).view(4, 10).float(),
               'event_time': torch.arange(4*10).view(4, 10).float(),
               'mcc_code': torch.arange(4*10).view(4, 10),
               'tr_type': torch.arange(4*10).view(4, 10)
              }
    return PaddedBatch(
                       payload=payload,
                       length=torch.tensor([4, 2, 6, 8])
                      )


def test_shape():
    eps = 1e-5
    
    params = {
        'trx_encoder' : {
            'norm_embeddings': False,
            'embeddings_noise': 0.003,
            'embeddings': {
                'mcc_code': {
                    'in': 200,
                    'out': 48
                },
                'tr_type': {
                    'in': 100,
                    'out': 24
                }
            },
            'numeric_values': {
                'amount': 'identity'
            },
        },
        'rnn': {
            'hidden_size': 48,
            'type': 'gru',
            'bidir': False,
            'trainable_starter': 'static',
        }
    }
    
    
    model = RnnSeqEncoderDistributionTarget(params, True)

    x = get_data()

    out = model(x)
    assert type(out) == tuple and len(out) == 3
    assert type(out[0]) == torch.Tensor and out[0].shape == torch.Size([4, 48])
    assert (out[1] - np.array([-16.118095, -16.118095, -16.118095, -16.118095]) < np.zeros((1, 4)) + eps).all()
    assert (out[2] - np.array([3.302955, 11.313237, 25.456194, 37.45834])< np.zeros((1, 4)) + eps).all()

In [11]:
import torch
import numpy as np

from dltranz.trx_encoder import PaddedBatch
from dltranz.seq_encoder.rnn_encoder import RnnSeqEncoder


def get_data():
    payload = {'amount': torch.arange(4*10).view(4, 10).float(),
               'event_time': torch.arange(4*10).view(4, 10).float(),
               'mcc_code': torch.arange(4*10).view(4, 10),
               'tr_type': torch.arange(4*10).view(4, 10)
              }
    return PaddedBatch(
                       payload=payload,
                       length=torch.tensor([4, 2, 6, 8])
                      )

def test_shape():
    eps = 1e-5
    
    params = {
        'trx_encoder' : {
            'norm_embeddings': False,
            'embeddings_noise': 0.003,
            'embeddings': {
                'mcc_code': {
                    'in': 200,
                    'out': 48
                },
                'tr_type': {
                    'in': 100,
                    'out': 24
                }
            },
            'numeric_values': {
                'amount': 'identity'
            },
        },
        'rnn': {
            'hidden_size': 48,
            'type': 'gru',
            'bidir': False,
            'trainable_starter': 'static'
        },
        'head_layers': {
            'CombinedTargetHeadFromRnn': {
                'in_size': 48,
                'num_distr_classes': 6,
                'pos': True,
                'neg': True,
                'use_gates': True,
                'pass_samples': True
            }
        }
    }
    
    model = RnnSeqEncoder(params, True)

    x = get_data()

    out = model(x)
    assert type(out) == torch.Tensor and out.shape == torch.Size([4, 48])
    assert type(out[0]) == torch.Tensor and out.shape == torch.Size([4, 48])

test_shape()

In [5]:
import torch
import numpy as np

from dltranz.trx_encoder import PaddedBatch
from dltranz.seq_encoder.statistics_encoder import StatisticsEncoder


def get_data():
    payload = {'amount': torch.arange(4*10).view(4, 10).float(),
               'event_time': torch.arange(4*10).view(4, 10).float(),
               'mcc_code': torch.arange(4*10).view(4, 10),
               'tr_type': torch.arange(4*10).view(4, 10)
              }
    return PaddedBatch(
                       payload=payload,
                       length=torch.tensor([4, 2, 6, 8])
                      )

def test_shape():
    config = {'category_names': ['mcc_code', 'tr_type'],
              'numeric_values': ['amount'],
              'category_max_size' : {
                  'mcc_code': 200,
                  'tr_type': 100
              },
    'columns_ix': {'pos_sum': 0,
                 'pos_distribution': 1,
                 'neg_sum': 2,
                 'neg_distribution': 3},
    }

    eps = 1e-4

    model = StatisticsEncoder(config)

    x = get_data()

    out = model(x)
    print(out)
    print(len(out))
    print(out[0].shape)
    print(out[1].shape)
    print(out[2].shape)
    print(out[3].shape)
    assert isinstance(out, tuple) and len(out) == 4

    assert (abs(out[0] -  torch.Tensor([[-16.1181],
                                        [-16.1181],
                                        [-16.1181],
                                        [-16.1181]])) < torch.zeros((4, 1)) + eps).all()
    assert out[1].shape == torch.Size([4, 6]) and out[1][0][3] == 0 and out[1][3][1] == 0
    assert out[2].shape == torch.Size([4, 1]) and abs(out[2][0].item() - 3.3029549820009882) < eps
    assert out[3].shape == torch.Size([4, 6]) and abs(out[3][1][3].item() - 0.7310606456724159) < eps

test_shape()

[[list([0, 1, 2, 3]) list([0.0, 1.0, 2.0, 3.0])]
 [list([10, 11]) list([10.0, 11.0])]
 [list([20, 21, 22, 23, 24, 25])
  list([20.0, 21.0, 22.0, 23.0, 24.0, 25.0])]
 [list([30, 31, 32, 33, 34, 35, 36, 37])
  list([30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0])]]
(4, 2)
(tensor([[-16.1181],
        [-16.1181],
        [-16.1181],
        [-16.1181]]), tensor([[0.],
        [0.],
        [0.],
        [0.]]), tensor([[ 3.3030],
        [11.3132],
        [25.4562],
        [37.4583]], dtype=torch.float64), tensor([[1.],
        [1.],
        [1.],
        [1.]], dtype=torch.float64))
4
torch.Size([4, 1])
torch.Size([4, 1])
torch.Size([4, 1])
torch.Size([4, 1])


AssertionError: 

In [239]:
neg = ('top_negative_trx', np.array([2010, 2370, 1010, 1110, 2330, 2371, 2011, 2020, 2331, 1100, 1030, 
                                         1200, 1210, 2210, 2021, 2110, 2340, 2440, 2460, 2320, 4010, 4071,
                                         2341, 2456, 4051, 1310, 1410, 4110, 2100, 2200, 4011, 1000, 4210,
                                         2446, 1510, 4020, 4500, 4041, 4090, 4031, 4021, 4097, 4100, 4061,
                                         2000, 4200, 4096, 4045, 4035]))
pos = ('top_positive_trx', np.array([2010, 2370, 1010, 1110, 2330, 2371, 2011, 2020, 2331, 1100, 1030, 
                                         1200, 1210, 2210, 2021, 2110, 2340, 2440, 2460, 2320, 4010, 4071,
                                         2341, 2456, 4051, 1310, 1410, 4110, 2100, 2200, 4011, 1000, 4210,
                                         2446, 1510, 4020, 4500, 4041, 4090, 4031, 4021, 4097, 4100, 4061,
                                         2000, 4200, 4096, 4045, 4035]))

dict([neg, pos])

{'top_negative_trx': array([2010, 2370, 1010, 1110, 2330, 2371, 2011, 2020, 2331, 1100, 1030,
        1200, 1210, 2210, 2021, 2110, 2340, 2440, 2460, 2320, 4010, 4071,
        2341, 2456, 4051, 1310, 1410, 4110, 2100, 2200, 4011, 1000, 4210,
        2446, 1510, 4020, 4500, 4041, 4090, 4031, 4021, 4097, 4100, 4061,
        2000, 4200, 4096, 4045, 4035]),
 'top_positive_trx': array([2010, 2370, 1010, 1110, 2330, 2371, 2011, 2020, 2331, 1100, 1030,
        1200, 1210, 2210, 2021, 2110, 2340, 2440, 2460, 2320, 4010, 4071,
        2341, 2456, 4051, 1310, 1410, 4110, 2100, 2200, 4011, 1000, 4210,
        2446, 1510, 4020, 4500, 4041, 4090, 4031, 4021, 4097, 4100, 4061,
        2000, 4200, 4096, 4045, 4035])}

In [31]:
test_distribution_target_head()

In [30]:
import torch

from dltranz.custom_layers import DropoutEncoder, Squeeze, CatLayer, MLP, TabularRowEncoder, CombinedTargetHeadFromRnn


class DistributionTargetsHeadTest(torch.nn.Module):
    def forward(self, x):
        return x


def test_distribution_target_head():
    distr_target_head_config = {
        "in_size": 48,
        "num_distr_classes": 6,
        'use_gates': False,
        'pass_samples': False
    }
    
    distr_target_head = CombinedTargetHeadFromRnn(**distr_target_head_config)
    x = torch.rand(64, 48)
    y = distr_target_head(x)
    assert type(y) == dict and len(y) == 4
    assert y['neg_sum'].shape == y['pos_sum'].shape == (64, 1) and y['neg_distribution'].shape == y['pos_distribution'].shape == (64, 6)
    


class TrxEncoderTest(torch.nn.Module):
    def forward(self, x):
        return x


def test_dropout_encoder():
    drop_encoder = DropoutEncoder(p=0.5)
    x = torch.rand(256, 100)

    drop_encoder.eval()
    assert torch.equal(x, drop_encoder(x))

    drop_encoder.train()
    assert x.shape == drop_encoder(x).shape


def test_squeeze():
    x = torch.rand(256, 100, 1)
    squeeze = Squeeze()
    y = squeeze(x)
    assert y.shape == (256, 100)


def test_mlp():
    mlp_config = {
        "hidden_layers_size": [512, 100],
        "drop_p": 0.5,
        "objective": "classification"
    }

    mlp = MLP(512, mlp_config)
    x = torch.rand(256, 512)
    y = mlp(x)
    assert y.shape == (256,)


def test_cat_layer():
    left_tail = torch.nn.Linear(100, 10)
    right_tail = torch.nn.Linear(200, 20)

    cat_layer = CatLayer(left_tail, right_tail)
    l = torch.rand(256, 100)
    r = torch.rand(256, 200)
    x = (l, r)
    y = cat_layer(x)
    assert y.shape == (256, 30)


def test_embedding_generator():
    tabular_config = {
        'num_features_count': 10,
        'cat_features_dims': [10, 10],
        'cat_emb_dim': 4
    }

    tabular_row_encoder = TabularRowEncoder(
        input_dim=tabular_config['num_features_count'] + len(tabular_config['cat_features_dims']),
        cat_dims=tabular_config['cat_features_dims'],
        cat_idxs=[x + tabular_config['num_features_count'] for x in range(len(tabular_config['cat_features_dims']))],
        cat_emb_dim=tabular_config['cat_emb_dim']
    )

    assert tabular_row_encoder.output_size == 18

    num = torch.rand(256, 10)
    cat = torch.randint(0, 10, (256, 2))
    x = torch.cat([num, cat], dim=1)
    y = tabular_row_encoder(x)

    assert y.shape == (256, 18)

In [167]:
import logging

import pandas as pd
import pytorch_lightning as pl
from torch.utils.data import ChainDataset
from torch.utils.data.dataloader import DataLoader

from dltranz.data_load import IterableChain, padded_collate, IterableAugmentations
from dltranz.data_load.augmentations.seq_len_limit import SeqLenLimit
from dltranz.data_load.iterable_processing.category_size_clip import CategorySizeClip
from dltranz.data_load.iterable_processing.feature_filter import FeatureFilter
from dltranz.data_load.iterable_processing.target_extractor import TargetExtractor
from dltranz.data_load.parquet_dataset import ParquetDataset, ParquetFiles
from dltranz.metric_learn.inference_tools import save_scores
from dltranz.train import score_model2
from dltranz.util import get_conf, get_cls

logger = logging.getLogger(__name__)

In [3]:
import logging
import warnings

import torch
from copy import deepcopy
from ignite.contrib.handlers import ProgressBar, LRScheduler, create_lr_scheduler_with_warmup
from ignite.contrib.handlers.param_scheduler import ParamScheduler
from ignite.handlers import ModelCheckpoint
from ignite.metrics import RunningAverage
import numpy as np
import pandas as pd
from math import sqrt

warnings.filterwarnings('ignore', module='tensorboard.compat.tensorflow_stub.dtypes')
from torch.utils.tensorboard import SummaryWriter
from dltranz.trx_encoder import PaddedBatch
from dltranz.swa import SWA

from ignite.engine import Engine, Events, create_supervised_trainer, create_supervised_evaluator
import ignite
from bisect import bisect_right
from tqdm import tqdm

logger = logging.getLogger(__name__)

In [4]:
import pickle
with open('my_config_for_jupyter.pickle', 'rb') as handle:
    conf = pickle.load(handle)

In [5]:
conf['inference_dataloader']['loader']['batch_size'] = 1

In [6]:
import pickle
with open('my_config_for_jupyter_model.pickle', 'rb') as handle:
    conf_model = pickle.load(handle)

In [7]:
!ls

'Distribution check. Inference..ipynb'	 models
 README.md				 my_config_for_jupyter.pickle
 bin					 my_config_for_jupyter_model.pickle
 conf					 notebooks
 data					 results
 distribution_targets.py		 scenario_gender
 lightning_logs


In [8]:
def create_inference_dataloader(conf, pl_module):
    """This is inference dataloader for `experiments`
    """
    post_processing = IterableChain(
        TargetExtractor(target_col=conf['col_id']),
        FeatureFilter(keep_feature_names=pl_module.seq_encoder.category_names),
        CategorySizeClip(pl_module.seq_encoder.category_max_size),
        IterableAugmentations(
            SeqLenLimit(**conf['SeqLenLimit']),
        )
    )
    l_dataset = [
        ParquetDataset(
            ParquetFiles(path).data_files,
            post_processing=post_processing,
            shuffle_files=False,
        ) for path in conf['dataset_files']]
    dataset = ChainDataset(l_dataset)
    return DataLoader(
        dataset=dataset,
        collate_fn=padded_collate,
        shuffle=False,
        num_workers=conf['loader.num_workers'],
        batch_size=conf['loader.batch_size'],
    )

In [9]:
from dltranz.seq_cls import SequenceClassify


model = SequenceClassify(conf_model['params'], None)

In [10]:
# pl_module = get_cls(conf['params.pl_module_class'])
# model = pl_module(conf['params'])

In [11]:
dl = create_inference_dataloader(conf['inference_dataloader'], model)

In [13]:
if 'seed_everything' in conf:
    pl.seed_everything(conf['seed_everything'])

pl_module = get_cls(conf['params.pl_module_class'])

model = pl_module(conf['params'])
# if conf.get('random_model', False):
#     model = pl_module(conf['params'])
# else:
#     model = pl_module.load_from_checkpoint(conf['model_path'])
# model.seq_encoder.is_reduce_sequence = True

dl = create_inference_dataloader(conf['inference_dataloader'], model)

pred, ids = score_model2(model, dl, conf['params'])

df_scores_cols = [f'v{i:003d}' for i in range(pred.shape[1])]
col_id = conf['inference_dataloader.col_id']
df_scores = pd.concat([
    pd.DataFrame({col_id: ids}),
    pd.DataFrame(pred, columns=df_scores_cols),
    ], axis=1)
# logger.info(f'df_scores examples: {df_scores.shape}:')

# save_scores(df_scores, None, conf['output'])

Global seed set to 42
                  

In [14]:
with torch.no_grad():
    for batch in tqdm(dl, leave=False):
        x, *others = batch
        print(x.payload['amount'].shape)
        print(torch.max(x.seq_lens))
        print(others[0].shape)

        x = x.to('cuda:0')
        out = model(x)
        print(out.shape)
        break

        batch_output = [out.cpu().numpy(), *others]
        outputs.append(batch_output)
        

                  

torch.Size([1, 1170])
tensor(1170, dtype=torch.int32)
(1,)
torch.Size([1, 6])




In [15]:
pred.shape

(1, 6)

In [16]:
ids

array(['10915793'], dtype='<U8')

In [17]:
pred

array([[-13855.776 ,  -5865.934 ,   2668.8064,   1957.0739,   5359.918 ,
         -7469.6885]], dtype=float32)

In [20]:
!ls ./data

'Distribution targets.ipynb'		    test_trx.parquet
 agg_feat_embed.pickle			    tr_mcc_codes.csv
 gender_test_kaggle_sample_submission.csv   tr_types.csv
 gender_train.csv			    train_trx.parquet
 gender_train_distribution_gt.csv	    trans-gender-2019.zip
 gender_train_distribution_target.csv	    transactions.csv
 test_ids.csv


In [85]:
df_gt = pd.read_csv('data/gender_train_distribution_gt.csv')
df_gt

Unnamed: 0,customer_id,gender
0,10058778,"(-23111070.490000006, [0.0100318209016029, 0.4..."
1,10230827,"(-2924061.3000000017, [0.029598155141275586, 0..."
2,10280886,"(-4927824.540000005, [0.11142153409544886, 0.7..."
3,11681378,"(-7984107.140000006, [0.27103410087755886, 0.5..."
4,12051455,"(-15816769.950000003, [0.1532716299006423, 0.5..."
...,...,...
13637,80961223,"(-9835408.080000006, [0.09336043228010113, 0.6..."
13638,81846240,"(-6292680.730000001, [0.19184777550282606, 0.3..."
13639,88536358,"(-4745959.820000001, [0.10899479970734352, 0.5..."
13640,89157170,"(-4977460.249999999, [0.007674305384960133, 0...."


In [86]:
df_target = pd.read_csv('data/gender_train_distribution_new_target.csv')
df_target

Unnamed: 0,customer_id,gender
0,10058778,"(-11411767.080000004, [0.014152865096857542, 0..."
1,10230827,"(-3245337.660000001, [0.05784240645085909, 0.8..."
2,10280886,"(-2338431.8499999996, [0.038333907400380327, 0..."
3,11681378,"(-8399862.260000015, [0.36679991345477087, 0.4..."
4,12051455,"(-8028976.4, [0.3203062385387009, 0.3692387002..."
...,...,...
13637,80961223,"(-6731554.2, [0.08209258866251128, 0.649263449..."
13638,81846240,"(-6774924.700000001, [0.13851338303435304, 0.4..."
13639,88536358,"(-2869008.3, [0.06540160235855713, 0.492393486..."
13640,89157170,"(-4320156.710000003, [0.03691662842480543, 0.7..."


In [87]:
import ast
import torch

In [88]:
l_gt = []
l_target = []

for row in df_gt['gender']:
    l_gt += [ast.literal_eval(row)]
for row in df_target['gender']:
    l_target += [ast.literal_eval(row)]
    
gt_neg_distr = np.array(l_gt, dtype=object)[:, 1]
target_neg_distr = np.array(l_target, dtype=object)[:, 1]

out_gt = []
out_target = []
for i in range(len(gt_neg_distr)):
    out_gt += [list(gt_neg_distr[i])]
for i in range(len(target_neg_distr)):
    out_target += [list(target_neg_distr[i])]
out_gt = torch.tensor(np.array(out_gt))
out_target = torch.tensor(np.array(out_target))

In [89]:
out_gt.shape

torch.Size([13642, 6])

In [90]:
out_target.shape

torch.Size([13642, 6])

### Compute cross entropy:

In [106]:
eps = np.zeros(out_gt.shape)
eps + 0.0000001

array([[1.e-07, 1.e-07, 1.e-07, 1.e-07, 1.e-07, 1.e-07],
       [1.e-07, 1.e-07, 1.e-07, 1.e-07, 1.e-07, 1.e-07],
       [1.e-07, 1.e-07, 1.e-07, 1.e-07, 1.e-07, 1.e-07],
       ...,
       [1.e-07, 1.e-07, 1.e-07, 1.e-07, 1.e-07, 1.e-07],
       [1.e-07, 1.e-07, 1.e-07, 1.e-07, 1.e-07, 1.e-07],
       [1.e-07, 1.e-07, 1.e-07, 1.e-07, 1.e-07, 1.e-07]])

In [150]:
rand = np.random.random(out_gt.shape)
rand = torch.tensor(rand / np.sum(rand, 1)[:, None])
rand

tensor([[0.0884, 0.2174, 0.2524, 0.2136, 0.2198, 0.0083],
        [0.1895, 0.1455, 0.2113, 0.0395, 0.1995, 0.2147],
        [0.1640, 0.0530, 0.1676, 0.2352, 0.2114, 0.1688],
        ...,
        [0.0272, 0.0872, 0.1427, 0.2363, 0.2255, 0.2811],
        [0.0324, 0.0805, 0.2417, 0.2206, 0.2287, 0.1961],
        [0.0115, 0.0545, 0.1542, 0.2670, 0.2859, 0.2269]], dtype=torch.float64)

In [149]:
def cross_entropy(pred, soft_targets):
    eps = np.zeros(out_gt.shape) + 1e-9
    return torch.mean(torch.sum(- soft_targets * torch.log(pred + eps), 1))

In [117]:
cross_entropy(out_gt, out_target)

tensor(1.7447, dtype=torch.float64)

In [118]:
cross_entropy(out_target, out_target)

tensor(0.9839, dtype=torch.float64)

In [151]:
cross_entropy(rand, out_gt)

tensor(2.0668, dtype=torch.float64)