In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
import sys
import torch
import torch.nn as nn

In [2]:
# Add the sibling '../experiments/' directory to the module search path so the
# project-local entity_embeddings module can be imported below.
# NOTE(review): the path is relative to the kernel's working directory.
sys.path.append('../experiments/')
from entity_embeddings import EmbeddingDataset, EmbeddingNN, train_embedding_model

# Пример создания эмбеддингов для данных в задаче регрессии

In [3]:
# Load only the columns used by the regression example, then discard every
# row that has a missing value in any of them.
house_columns = [
    "SalePrice", "MSSubClass", "MSZoning",
    "LotFrontage", "LotArea",
    "Street", "YearBuilt", "LotShape",
    "1stFlrSF", "2ndFlrSF",
]
data = pd.read_csv("../data/train_house.csv", usecols=house_columns).dropna()

In [4]:
# Columns treated as categorical (YearBuilt included) and the regression target.
categorical_features = [
    "MSSubClass",
    "MSZoning",
    "Street",
    "LotShape",
    "YearBuilt",
]
output_feature = "SalePrice"

# Replace each categorical column with integer codes and keep the fitted
# encoder under the column's name; the encoders are handed to
# model.get_embeddings() later in the notebook.
label_encoders = {}
for column in categorical_features:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder

In [5]:
# Wrap the encoded frame in the project's dataset class, telling it which
# columns are categorical and which one is the target.
dataset = EmbeddingDataset(
    data=data,
    cat_cols=categorical_features,
    output_col=output_feature,
)

In [6]:
# Serve the dataset in shuffled mini-batches of 64 rows using one worker process.
batchsize = 64
dataloader = DataLoader(
    dataset,
    batch_size=batchsize,
    shuffle=True,
    num_workers=1,
)

In [7]:
# Number of distinct values in each categorical column, in the order of
# categorical_features.
cat_dims = [
    int(data[feature].nunique())
    for feature in categorical_features
]

# One (cardinality, embedding width) pair per column: the width is half the
# cardinality rounded up, capped at 50.
emb_dims = [
    (cardinality, min((cardinality + 1) // 2, 50))
    for cardinality in cat_dims
]

In [8]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# NOTE(review): no_of_cont=4 presumably corresponds to the four loaded columns
# that are neither categorical nor the target (LotFrontage, LotArea, 1stFlrSF,
# 2ndFlrSF) -- confirm against EmbeddingDataset.
model = EmbeddingNN(
    emb_dims,
    no_of_cont=4,
    lin_layer_sizes=[50, 100],
    output_size=1,
    emb_dropout=0.04,
    lin_layer_dropouts=[0.001, 0.01],
).to(device)

In [9]:
# Optimise all model parameters with Adam against a mean-squared-error loss.
# The positional 20 is passed through unchanged; the captured output below
# shows losses for epochs 0..19.
mse_loss = nn.MSELoss()
adam = torch.optim.Adam(params=model.parameters(), lr=0.1)
train_embedding_model(model, dataloader, 20, criterion=mse_loss, optimizer=adam)

loss on epoch 0 is 31653382144.0
loss on epoch 1 is 40591888384.0
loss on epoch 2 is 33609148416.0
loss on epoch 3 is 40009641984.0
loss on epoch 4 is 33472055296.0
loss on epoch 5 is 25158291456.0
loss on epoch 6 is 25481789440.0
loss on epoch 7 is 16446581760.0
loss on epoch 8 is 13591344128.0
loss on epoch 9 is 15855728640.0
loss on epoch 10 is 9510128640.0
loss on epoch 11 is 13012738048.0
loss on epoch 12 is 6817533440.0
loss on epoch 13 is 7188567040.0
loss on epoch 14 is 2470854912.0
loss on epoch 15 is 1941467904.0
loss on epoch 16 is 6082973184.0
loss on epoch 17 is 1037755648.0
loss on epoch 18 is 1175869696.0
loss on epoch 19 is 1302700160.0


In [10]:
# Ask the model for its learned embeddings and display the entry stored under
# 'MSSubClass' (the output below maps original class labels to float32 vectors).
regression_embeddings = model.get_embeddings(categorical_features, emb_dims, label_encoders)
regression_embeddings['MSSubClass']

{20: array([-0.7870872 , -0.49407044,  0.846892  ,  0.5226263 , -0.7122718 ,
         0.336337  ,  0.69049966, -2.9306023 ], dtype=float32),
 30: array([ 0.06120728,  7.5524015 ,  5.48609   , -4.231609  , -2.482136  ,
        -6.7954063 ,  2.8159876 , -1.7217172 ], dtype=float32),
 40: array([-5.121115  ,  2.0965571 ,  3.4675832 , -2.4394968 , -0.667673  ,
        -3.17188   ,  2.3463995 ,  0.10500234], dtype=float32),
 45: array([-0.12385683,  1.0070275 ,  0.8798747 , -0.2558223 ,  0.05788507,
        -1.7363998 ,  1.9569706 , -1.0792664 ], dtype=float32),
 50: array([-2.700498  ,  4.6035533 ,  4.8640394 , -4.5774317 ,  0.30567202,
        -3.7660825 ,  3.4442585 ,  0.11111352], dtype=float32),
 60: array([-1.3953934 , -1.3787576 ,  0.06052   ,  1.9462501 , -0.8253576 ,
         2.2801714 , -0.82285506, -3.4347765 ], dtype=float32),
 70: array([ 0.19685721,  1.2797385 , -0.7367304 , -2.8977635 ,  0.6347324 ,
        -3.5114489 ,  3.2559671 ,  0.11255725], dtype=float32),
 75: array([ 

# Пример для задачи бинарной классификации

In [11]:
# Read the Titanic training table with all of its columns.
titanic_csv = '../data/train_titanic.csv'
data_cls = pd.read_csv(titanic_csv)

In [12]:
# Preview the first rows of the table as loaded, before any columns are dropped.
data_cls.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
# Remove the identifier-like columns that are not used as model inputs.
# The result is assigned back instead of mutating in place, and
# errors='ignore' skips columns that are already gone, so re-running this cell
# on an already-trimmed frame no longer raises KeyError.
data_cls = data_cls.drop(columns=['PassengerId', 'Name', 'Ticket'], errors='ignore')

In [14]:
# Binary target column and the columns that will be label-encoded and embedded.
target_col = "Survived"
cat_features = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Cabin",
    "Embarked",
]

In [15]:
# Integer-encode every categorical column. Values are cast to str first, so
# missing entries are encoded as their string form rather than as NaN.
# The fitted encoder for each column is kept under the column's name.
label_encoders = {}
for column in cat_features:
    encoder = LabelEncoder()
    as_text = data_cls[column].astype(str)
    data_cls[column] = encoder.fit_transform(as_text)
    label_encoders[column] = encoder

In [16]:
# Preview the first rows again to check the integer-encoded categorical columns.
data_cls.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,2,1,21,1,0,7.25,147,2
1,1,0,0,45,1,0,71.2833,81,0
2,1,2,0,27,0,0,7.925,147,2
3,1,0,0,41,1,0,53.1,55,2
4,0,2,1,41,0,0,8.05,147,2


In [17]:
# Wrap the encoded Titanic frame in the project's dataset class, with
# 'Survived' (target_col) as the output column.
dataset = EmbeddingDataset(
    data=data_cls,
    cat_cols=cat_features,
    output_col=target_col,
)

In [18]:
# Serve the dataset in shuffled mini-batches of 64 rows using one worker process.
batchsize = 64
dataloader = DataLoader(
    dataset,
    batch_size=batchsize,
    shuffle=True,
    num_workers=1,
)

In [19]:
# Number of distinct values in each categorical column, in the order of
# cat_features.
cat_dims = [
    int(data_cls[feature].nunique())
    for feature in cat_features
]

# One (cardinality, embedding width) pair per column: the width is half the
# cardinality rounded up, capped at 50.
emb_dims = [
    (cardinality, min((cardinality + 1) // 2, 50))
    for cardinality in cat_dims
]

In [20]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# NOTE(review): no_of_cont=1 presumably corresponds to 'Fare', the only
# remaining column that is neither categorical nor the target -- confirm
# against EmbeddingDataset.
model = EmbeddingNN(
    emb_dims,
    no_of_cont=1,
    lin_layer_sizes=[50, 100],
    output_size=1,
    emb_dropout=0.04,
    lin_layer_dropouts=[0.001, 0.01],
    model_class='binary',
).to(device)

In [21]:
# Optimise all model parameters with Adam against a binary cross-entropy loss.
# The positional 20 is passed through unchanged; the captured output below
# shows losses for epochs 0..19.
bce_loss = nn.BCELoss()
adam = torch.optim.Adam(params=model.parameters(), lr=0.1)
train_embedding_model(model, dataloader, 20, criterion=bce_loss, optimizer=adam)

loss on epoch 0 is 0.7110211253166199
loss on epoch 1 is 0.4497784376144409
loss on epoch 2 is 0.3560353219509125
loss on epoch 3 is 0.40626198053359985
loss on epoch 4 is 0.1668795943260193
loss on epoch 5 is 0.250690221786499
loss on epoch 6 is 0.2656845152378082
loss on epoch 7 is 0.20558780431747437
loss on epoch 8 is 0.4073295295238495
loss on epoch 9 is 0.32697761058807373
loss on epoch 10 is 0.2856265604496002
loss on epoch 11 is 0.3316341042518616
loss on epoch 12 is 0.4161083400249481
loss on epoch 13 is 0.26774469017982483
loss on epoch 14 is 0.2513344883918762
loss on epoch 15 is 0.2684926390647888
loss on epoch 16 is 0.23651123046875
loss on epoch 17 is 0.2445516437292099
loss on epoch 18 is 0.207131490111351
loss on epoch 19 is 0.20494842529296875


In [27]:
# Ask the model for its learned embeddings and display the entry stored under
# 'Cabin' (the output below maps original cabin labels to float32 vectors).
classification_embeddings = model.get_embeddings(cat_features, emb_dims, label_encoders)
classification_embeddings['Cabin']

-0.61088437,
         0.00783544, -0.90933686,  0.83201665, -0.2799008 ,  1.6863673 ,
         1.3888685 ,  0.7337036 , -0.800677  ,  0.64153224, -1.6488008 ,
        -1.7859871 , -0.9640279 , -0.21603125, -0.3421344 ,  0.6155089 ,
         0.51508415,  1.3407    ,  0.1669496 ,  0.579204  ,  1.1117525 ,
         0.82779384, -1.3212218 ,  0.35587445,  0.32449466, -1.3796703 ,
        -0.77507615, -0.30449045, -0.33723956,  0.94772255, -1.6539141 ],
       dtype=float32),
 'E33': array([ 1.5588336 ,  0.8723398 ,  1.0295342 ,  0.98515606,  3.4804194 ,
        -0.25394237, -2.117647  ,  0.4877733 , -0.9070787 ,  0.32252648,
         0.67030984,  1.2076787 , -1.3533561 , -0.49138075, -0.5946518 ,
         1.15389   , -0.3861341 , -1.8273549 , -1.9876901 ,  0.6763173 ,
         1.3026589 , -1.4922061 ,  0.3039347 , -1.0704025 , -0.53828824,
        -1.5531526 ,  0.2990917 ,  0.04933483, -0.303803  , -1.3733301 ,
        -0.11677662,  1.5789039 ,  2.1074438 ,  2.0461743 , -1.901809  ,
       