In [38]:
# Small hand-written dataframe: a string id, a float column, an integer-coded column,
# two list-valued (sequence) columns and a 0/1 target.
df = pd.DataFrame(
    {
        "user_id": ["bfdbda", "gdsgdsb", "fsafsa", "bsdbfds", "fsafas"],
        "num_feature": [0.412, -0.3213, 1.763, 0.8754, -1.2532],
        "cat_feature": [0, 1, 1, 0, 2],
        "num_sequence": [[0.532, 0.321, 0.764], [0.1321, 0.7655, 0.3211], [0.32532, 0.101, 0.0643], [0.6342, 0.765, 0.345], [0.6431, 0.865, 0.194]],
        "cat_sequence": [[0, 2, 1], [1, 1, 0], [2, 0, 0], [0, 1, 1], [2, 2, 1]],
        "target": [0, 1, 0, 1, 1]
    }
)

# Add a copy of `cat_feature` stored with the pandas `category` dtype; the int64 original is kept.
df["cat_feature_cat"] = df["cat_feature"].astype("category")
# String and list-valued columns are reported as `object` dtype.
df.dtypes

user_id              object
num_feature         float64
cat_feature           int64
num_sequence         object
cat_sequence         object
target                int64
cat_feature_cat    category
dtype: object

In [1]:
"""Module with Pandas dataset implementation."""

from typing import Any

import numpy as np
import pandas as pd
from torch.utils.data import Dataset


class PandasDataset(Dataset):
    """Pandas dataset implementation."""
    def __init__(
        self,
        dataframe: pd.DataFrame,
        return_dicts: bool = False,
        target_col: str | None = None,
    ):
        """Instantiate dataset.

        Args:
            dataframe: input dataframe to wrap as toch dataset
            return_dicts: flag to return dict or tensor of values
            target_col: target column name
        """
        self.dataframe = dataframe
        self.return_dicts = return_dicts
        self.target_col = target_col

    def __len__(self) -> int:
        """Get dataset length.

        Returns:
            Dataset length
        """
        return len(self.dataframe)

    def  __getitem__(self, index: int) -> dict[str, Any] | Any:
        """Get dataset item by index.

        Args:
            index: index to take

        Returns:
            Dataset element under specified index
        """
        row = self.dataframe.iloc[index]
        if self.return_dicts:
            return {k: v if not isinstance(v, (list, tuple)) else np.array(v) for k, v in row.to_dict().items()}

        if not self.target_col:
            return row.values

        return row.drop(self.target_col).values, row[self.target_col]


In [57]:
df

Unnamed: 0,user_id,num_feature,cat_feature,num_sequence,cat_sequence,target,cat_feature_cat
0,bfdbda,0.412,0,"[0.532, 0.321, 0.764]","[0, 2, 1]",0,0
1,gdsgdsb,-0.3213,1,"[0.1321, 0.7655, 0.3211]","[1, 1, 0]",1,1
2,fsafsa,1.763,1,"[0.32532, 0.101, 0.0643]","[2, 0, 0]",0,1
3,bsdbfds,0.8754,0,"[0.6342, 0.765, 0.345]","[0, 1, 1]",1,0
4,fsafas,-1.2532,2,"[0.6431, 0.865, 0.194]","[2, 2, 1]",1,2


In [None]:
pd_types.is_integer_dtype

3

In [130]:
df[feature].values

array([list([0.532, 0.321, 0.764]), list([0.1321, 0.7655, 0.3211]),
       list([0.32532, 0.101, 0.0643]), list([0.6342, 0.765, 0.345]),
       list([0.6431, 0.865, 0.194])], dtype=object)

In [129]:
pd_types.is_float_dtype(np.concat(df[feature].values))

True

In [None]:
np.concat(np.array([]))

In [133]:
# Column of `df` to probe; change the name to inspect another column.
feature = "cat_sequence"

# NOTE(review): `pd_types` is not imported in the visible cells — presumably
# `import pandas.api.types as pd_types`; confirm.
# The first three checks look only at the column dtype.
print("numeric:", pd_types.is_numeric_dtype(df[feature].dtype))
print("object:", pd_types.is_object_dtype(df[feature].dtype))
print("category:", isinstance(df[feature].dtype, pd.CategoricalDtype))
# The sequence checks inspect the cell at index label 0; `and` short-circuits, so the column is
# only flattened with `np.concat` when that cell is list-like. The element dtype of the flattened
# array (float vs integer) then separates numerical from categorical sequences.
print("numerical_sequence:", pd_types.is_list_like(df.at[0, feature]) and pd_types.is_float_dtype(np.concat(df[feature].values)))
print("categorical_sequence:", pd_types.is_list_like(df.at[0, feature]) and pd_types.is_integer_dtype(np.concat(df[feature].values)))

numeric: False
object: True
category: False
numerical_sequence: False
categorical_sequence: True


In [97]:
df.at[0, "num_feature"]

np.float64(0.412)

In [None]:
import sys
from typing import Sequence

import numpy as np
import pandas as pd
from pandas.api.types import is_float_dtype, is_integer_dtype, is_list_like, is_numeric_dtype

sys.path.append("../models/")

from common.features.config import Feature, FeaturesConfig, FeatureType


def _infer_features_config_from_dataframe(
    self,
    data: pd.DataFrame,
    default_embedding_size: int = 10,
    custom_embedding_sizes: dict[str, int] | None = None,
    embedded_features: Sequence[str] | None = None,
) -> FeaturesConfig:
    """Create features config from a pandas dataframe.

    Every column of ``data`` becomes one feature, using the first rule that matches:

    * numeric dtype (integer columns included) -> ``FeatureType.NUMERICAL``,
      embedded only when listed in ``embedded_features``
    * ``category`` dtype -> ``FeatureType.CATEGORICAL``, always embedded
    * list-like cells with float elements -> ``FeatureType.NUMERICAL_SEQUENCE``,
      embedded only when listed in ``embedded_features``
    * list-like cells with integer elements -> ``FeatureType.CATEGORICAL_SEQUENCE``,
      always embedded

    Args:
        data: pandas dataframe to infer the features config from
        default_embedding_size: embedding size used when a feature has no custom size
        custom_embedding_sizes: custom embeddings mapping {feature: embedding_size}
        embedded_features: numerical features to embed; ``None`` means none

    Returns:
        Created features config

    Raises:
        RuntimeError: if a column is neither numerical, categorical nor a sequence of
            float/integer values (such a column has to be encoded or dropped first)
    """
    # NOTE(review): `self` is unused here; kept because the signature looks intended for a method.
    custom_embedding_sizes = custom_embedding_sizes or {}
    # `None` previously reached `col in embedded_features` and raised TypeError.
    embedded_features = set(embedded_features or ())

    def _embedding_size(col) -> int:
        # Per-feature override first, shared default otherwise.
        return custom_embedding_sizes.get(col, default_embedding_size)

    def _optional_embed_params(col) -> dict:
        # Extra `Feature` kwargs for numerical features that were explicitly asked to be embedded.
        if col not in embedded_features:
            return {}
        return {"needs_embed": True, "embedding_size": _embedding_size(col)}

    def _warn_if_not_index_like(col, vocab_size, low, high) -> None:
        # Embedding lookups expect ids 0..vocab_size-1; warn when either bound is off
        # (the original `and` only fired when both bounds were wrong).
        if low != 0 or high != vocab_size - 1:
            # NOTE(review): LOGGER is not defined in the visible cells — confirm it exists.
            LOGGER.warning(
                "Number of unique `%s` values is %s, but observed values are in interval [%s, %s]",
                col,
                vocab_size,
                low,
                high,
            )

    features = []
    for col in data.columns:
        column = data[col]

        if is_numeric_dtype(column.dtype):
            features.append(
                Feature(
                    name=col,
                    feature_type=FeatureType.NUMERICAL,
                    **_optional_embed_params(col),
                )
            )
            continue

        if isinstance(column.dtype, pd.CategoricalDtype):
            uniq_categories = column.nunique()
            if uniq_categories:
                # `Series.min()/max()` raise on unordered categoricals, so compare the
                # observed category values instead.
                observed = column.cat.remove_unused_categories().cat.categories
                _warn_if_not_index_like(col, uniq_categories, observed.min(), observed.max())

            features.append(
                Feature(
                    name=col,
                    feature_type=FeatureType.CATEGORICAL,
                    needs_embed=True,
                    embedding_size=_embedding_size(col),
                    embedding_vocab_size=uniq_categories,
                )
            )
            continue

        # A Series is always list-like itself, so the check has to look at a cell. The first
        # non-null cell is taken by position so a non-default index does not matter.
        non_null = column.dropna()
        first_value = non_null.iloc[0] if len(non_null) else None
        if not is_list_like(first_value):
            raise RuntimeError(f"Feature `{col}` is not categorical, not numerical and not list-like, need to encode feature.")

        # Flatten all rows to learn the element dtype and the set of observed values.
        flat_values = np.concatenate([np.asarray(sequence) for sequence in non_null])
        # assumes every row holds a sequence of the same length as the first one — TODO confirm
        feature_size = len(first_value)

        if is_float_dtype(flat_values):
            features.append(
                Feature(
                    name=col,
                    feature_type=FeatureType.NUMERICAL_SEQUENCE,
                    feature_size=feature_size,
                    **_optional_embed_params(col),
                )
            )
        elif is_integer_dtype(flat_values):
            vocab_size = int(np.unique(flat_values).size)
            _warn_if_not_index_like(col, vocab_size, flat_values.min(), flat_values.max())
            features.append(
                Feature(
                    name=col,
                    feature_type=FeatureType.CATEGORICAL_SEQUENCE,
                    needs_embed=True,
                    feature_size=feature_size,
                    embedding_size=_embedding_size(col),
                    embedding_vocab_size=vocab_size,
                )
            )
        else:
            raise RuntimeError(
                f"Sequence feature `{col}` has elements of dtype {flat_values.dtype}, "
                "expected float or integer, need to encode feature."
            )

    return FeaturesConfig(features=features)

In [2]:
# Rebuild the toy dataframe (string id, float column, integer-coded column, two list-valued
# sequence columns, 0/1 target); this rebinds `df` and has no `category`-typed column.
df = pd.DataFrame(
    {
        "user_id": ["bfdbda", "gdsgdsb", "fsafsa", "bsdbfds", "fsafas"],
        "num_feature": [0.412, -0.3213, 1.763, 0.8754, -1.2532],
        "cat_feature": [0, 1, 1, 0, 2],
        "num_sequence": [[0.532, 0.321, 0.764], [0.1321, 0.7655, 0.3211], [0.32532, 0.101, 0.0643], [0.6342, 0.765, 0.345], [0.6431, 0.865, 0.194]],
        "cat_sequence": [[0, 2, 1], [1, 1, 0], [2, 0, 0], [0, 1, 1], [2, 2, 1]],
        "target": [0, 1, 0, 1, 1]
    }
)
df

Unnamed: 0,user_id,num_feature,cat_feature,num_sequence,cat_sequence,target
0,bfdbda,0.412,0,"[0.532, 0.321, 0.764]","[0, 2, 1]",0
1,gdsgdsb,-0.3213,1,"[0.1321, 0.7655, 0.3211]","[1, 1, 0]",1
2,fsafsa,1.763,1,"[0.32532, 0.101, 0.0643]","[2, 0, 0]",0
3,bsdbfds,0.8754,0,"[0.6342, 0.765, 0.345]","[0, 1, 1]",1
4,fsafas,-1.2532,2,"[0.6431, 0.865, 0.194]","[2, 2, 1]",1


In [3]:
ds = PandasDataset(df, return_dicts=True)

In [4]:
from torch.utils.data import DataLoader

# One batch covers the whole five-row toy dataset.
loader = DataLoader(ds, batch_size=5)

# Take only the first collated batch. Unlike `for batch in loader: break`, this raises
# StopIteration on an empty loader instead of leaving `batch` unset or holding a stale value
# from an earlier run of the notebook.
batch = next(iter(loader))
batch

{'user_id': ['bfdbda', 'gdsgdsb', 'fsafsa', 'bsdbfds', 'fsafas'],
 'num_feature': tensor([ 0.4120, -0.3213,  1.7630,  0.8754, -1.2532], dtype=torch.float64),
 'cat_feature': tensor([0, 1, 1, 0, 2]),
 'num_sequence': tensor([[0.5320, 0.3210, 0.7640],
         [0.1321, 0.7655, 0.3211],
         [0.3253, 0.1010, 0.0643],
         [0.6342, 0.7650, 0.3450],
         [0.6431, 0.8650, 0.1940]], dtype=torch.float64),
 'cat_sequence': tensor([[0, 2, 1],
         [1, 1, 0],
         [2, 0, 0],
         [0, 1, 1],
         [2, 2, 1]]),
 'target': tensor([0, 1, 0, 1, 1])}

In [5]:
import sys

sys.path.append("../models/")


from common.features.config import Feature, FeaturesConfig, FeatureType
from common.modules import EmbeddingLayer

In [17]:
df.dtypes

user_id          object
num_feature     float64
cat_feature       int64
num_sequence     object
cat_sequence     object
target            int64
dtype: object

In [14]:
# Hand-written features config for four columns of the toy dataframe;
# `user_id` and `target` have no entry here.
config = FeaturesConfig(
    features=[
        # Scalar float column passed through without an embedding.
        # NOTE(review): `embedding_size` is set although `needs_embed=False` — presumably ignored; confirm.
        Feature(
            name="num_feature",
            feature_type=FeatureType.NUMERICAL,
            needs_embed=False,
            embedding_size=10,
        ),
        # Integer-coded column with values 0..2 -> vocabulary of 3, embedded into 8 dims.
        Feature(
            name="cat_feature",
            feature_type=FeatureType.CATEGORICAL,
            needs_embed=True,
            embedding_vocab_size=3,
            embedding_size=8,
        ),
        # Float sequences of length 3, projected to 5 dims.
        Feature(
            name="num_sequence",
            feature_type=FeatureType.NUMERICAL_SEQUENCE,
            needs_embed=True,
            feature_size=3,
            embedding_size=5,
        ),
        # Integer sequences of length 3 over a vocabulary of 3, embedded into 7 dims.
        Feature(
            name="cat_sequence",
            feature_type=FeatureType.CATEGORICAL_SEQUENCE,
            needs_embed=True,
            feature_size=3,
            embedding_vocab_size=3,
            embedding_size=7,
        ),
    ]
)

# Layer built from the config; applied to a collated batch dict in the following cell.
module = EmbeddingLayer(features_config=config)

In [15]:
batch_output = module(batch)
batch_output.shape

applying for num_feature
feature shape: torch.Size([5, 1])
module: Identity()
output shape: torch.Size([5, 1])
applying for cat_feature
feature shape: torch.Size([5])
module: Embedding(3, 8)
output shape: torch.Size([5, 8])
applying for num_sequence
feature shape: torch.Size([5, 3])
module: Linear(in_features=3, out_features=5, bias=False)
output shape: torch.Size([5, 5])
applying for cat_sequence
feature shape: torch.Size([5, 3])
module: EmbeddingBag(3, 7, mode='mean')
output shape: torch.Size([5, 7])


torch.Size([5, 21])

In [21]:
batch

{'user_id': ['bfdbda', 'gdsgdsb', 'fsafsa'],
 'num_feature': tensor([ 0.4120, -0.3213,  1.7630], dtype=torch.float64),
 'cat_feature': tensor([0, 1, 1]),
 'num_sequence': tensor([[0.5320, 0.3210, 0.7640],
         [0.1321, 0.7655, 0.3211],
         [0.3253, 0.1010, 0.0643]], dtype=torch.float64),
 'cat_sequence': tensor([[0, 2, 1],
         [1, 1, 0],
         [2, 0, 0]]),
 'target': tensor([0, 1, 0])}

In [None]:
import pandas as pd

df = pd.DataFrame({"user_id": [1, 2, 3], "feature": [-0.5, -0.3, 0.5], "target": [0, 0, 1]})
df

Unnamed: 0,user_id,feature,target
0,1,-0.5,0
1,2,-0.3,0
2,3,0.5,1


In [1]:
import sys

sys.path.append("../")
sys.path.append("../models")

from models.common.modules.crossnet import CrossNetV2, CrossNetMix

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [2]:
import torch


# Sizes for the cross-network smoke test.
BATCH_SIZE = 64
IN_FEATURES = 32
LOW_RANK_DIM = 16  # only passed to CrossNetMix
NUM_LAYERS = 4
NUM_EXPERTS = 4  # only passed to CrossNetMix

# Random (unseeded) input of shape (BATCH_SIZE, IN_FEATURES).
input_ = torch.randn(BATCH_SIZE, IN_FEATURES)
cnv2 = CrossNetV2(in_features=IN_FEATURES, num_layers=NUM_LAYERS)
cnmix = CrossNetMix(in_features=IN_FEATURES, num_layers=NUM_LAYERS, low_rank_dim=LOW_RANK_DIM, num_experts=NUM_EXPERTS)

In [3]:
cnv2(input_).shape, cnmix(input_).shape

(torch.Size([64, 32]), torch.Size([64, 32]))

In [23]:
from torch import nn

from models.common.modules.mlp import MLPBlock


# Output width and hidden layer widths for the MLP smoke test.
OUT_FEATURES = 512
HIDDENS = [64, 128, 256]
# Alternative left for manual testing: no hidden layers.
# HIDDENS = None

# Reuses IN_FEATURES defined in an earlier cell.
mlp = MLPBlock(in_features=IN_FEATURES, out_features=OUT_FEATURES, hidden_dims=HIDDENS, activation_fn=nn.Tanh, use_batch_norm=False, dropout=0.5)

In [24]:
mlp.mlp

Sequential(
  (0): Linear(in_features=32, out_features=64, bias=True)
  (1): Tanh()
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=64, out_features=128, bias=True)
  (4): Tanh()
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=128, out_features=256, bias=True)
  (7): Tanh()
  (8): Dropout(p=0.5, inplace=False)
  (9): Linear(in_features=256, out_features=512, bias=True)
)

In [26]:
a = torch.arange(21).reshape(3, 7)
a

tensor([[ 0,  1,  2,  3,  4,  5,  6],
        [ 7,  8,  9, 10, 11, 12, 13],
        [14, 15, 16, 17, 18, 19, 20]])

In [32]:
# Column window [start_idx, start_idx + size) = [5, 8) runs past the last column of `a`
# (width 7). Slicing clamps silently, so the result has 2 columns instead of `size`.
start_idx = 5
size = 3
a[:, start_idx:start_idx + size]

tensor([[ 5,  6],
        [12, 13],
        [19, 20]])

In [35]:
a.size(-1)

7

In [6]:
from dcnv2.model.model import DCNv2, DCNv2Config, ModelStructure
from common.features.config import Feature, FeaturesConfig
from common.features.types import FeatureType

In [7]:
# Feature definitions for a DCNv2 construction test.
# NOTE(review): constructing the model in the following cell fails with
# `TypeError: 'Field' object cannot be interpreted as an integer`; possibly caused by a size
# left unset on one of these features or on DCNv2Config — confirm against the model code.
features = [
    # Categorical id with an explicit vocabulary and embedding size.
    Feature(
        name="product_id",
        feature_type=FeatureType.CATEGORICAL,
        needs_embed=True,
        embedding_size=50,
        embedding_vocab_size=20000,
    ),
    # Categorical feature with no embedding parameters given.
    Feature(
        name="ui4",
        feature_type=FeatureType.CATEGORICAL,
    ),
    # Numerical feature with no embedding parameters given.
    Feature(
        name="price",
        feature_type=FeatureType.NUMERICAL,
    ),
    # Numerical feature embedded into 20 dims.
    Feature(
        name="sell_idx",
        feature_type=FeatureType.NUMERICAL,
        needs_embed=True,
        embedding_size=20,
    ),
    # Sequence of length 10 embedded into 30 dims.
    # NOTE(review): this cell uses `FeatureType.SEQUENTIAL`, while other cells use
    # NUMERICAL_SEQUENCE / CATEGORICAL_SEQUENCE — verify which members the enum currently has.
    Feature(
        name="last_sells",
        feature_type=FeatureType.SEQUENTIAL,
        feature_size=10,
        needs_embed=True,
        embedding_size=30,
    ),
    # Sequence of length 30 with no embedding parameters given.
    Feature(
        name="last_sells_embed_v2",
        feature_type=FeatureType.SEQUENTIAL,
        feature_size=30,
    ),
]
features_config = FeaturesConfig(features=features)
# Model config requesting the CROSSNET_ONLY structure.
model_config = DCNv2Config(model_structure=ModelStructure.CROSSNET_ONLY)

In [8]:
model = DCNv2(model_config, features_config, is_dict_input=False)

TypeError: 'Field' object cannot be interpreted as an integer