In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# (1) Create Datasets

In [2]:
from src.parameters import Parameters
from src.features import Featureset
from src.data_loaders import GroupDataset, GroupDataCollator

#### View Raw Data

In [3]:
import pandas as pd
_users = ['sid', 'datetime', 'cid', 'category', 'duration']
_items = ['cid', 'nsessbuy', 'nbuysess', 'price', 'totbuys', 'totclicks', 'totdurs']
_targets = ['sid', 'datetime', 'cid', 'price', 'quantity']

users = pd.read_csv('datasets/yc_small/users.csv', names=_users)
items = pd.read_csv('datasets/yc_small/items.csv', names=_items)
targets = pd.read_csv('datasets/yc_small/targets.csv', names=_targets)
users.head()
items.head()
targets.head()

Unnamed: 0,sid,datetime,cid,category,duration
0,2551,2014-04-07 05:11:26.029,214827007,0,15.0
1,2551,2014-04-07 05:11:41.029,214827007,0,30.668
2,2551,2014-04-07 05:12:11.697,214827000,0,10.648
3,2551,2014-04-07 05:12:22.345,214827000,0,15.401
4,2551,2014-04-07 05:12:37.746,214827000,0,15.372


Unnamed: 0,cid,nsessbuy,nbuysess,price,totbuys,totclicks,totdurs
0,214507331,233,0.566524,1576.471483,132,58,13321.498
1,214507365,69,1.376812,205.780822,95,53,6121.674
2,214507385,67,4.134328,112.0,277,24,2492.85
3,214507408,48,2.041667,94.117647,98,42,4203.699
4,214507415,29,3.448276,256.363636,100,89,6778.327


Unnamed: 0,sid,datetime,cid,price,quantity
0,2551,2014-04-07 05:22:04.042,0,0,0
1,3309,2014-04-06 11:31:00.291,0,0,0
2,3857,2014-04-02 20:07:35.919,214821285,2092,1
3,3857,2014-04-02 20:07:35.877,214820383,4188,2
4,3857,2014-04-02 20:07:36.017,214819562,1046,1


#### Configure Data

In [4]:
data_config = {
    'sid':Parameters(mode='uid', vector=None),
    'datetime':Parameters(mode='datetime', vector=None),
    'category':Parameters(mode='categorical', vector='embedding'),
    'duration':Parameters(mode='numerical', vector='linear'),
    'nsessbuy':Parameters(mode='numerical', vector='linear'),
    'nbuysess':Parameters(mode='numerical', vector='linear'),
    'totbuys':Parameters(mode='numerical', vector='linear'),
    'totclicks':Parameters(mode='numerical', vector='linear'),
    'totdurs':Parameters(mode='numerical', vector='linear'),
    'price':Parameters(mode='numerical', vector='linear'),
    'cid':Parameters(mode='categorical', vector='embedding'),
}

#### Create and Save Train/Test Sets

In [5]:
from src.yc_utils import YCFeature

In [6]:
_features_yc = Featureset('datasets/yc_small', data_config)
_features_yc.load_yc('users.csv', 'items.csv', 'targets.csv')
_features_yc.create_new_dataset(train_split=0.9, tag=1)

loading dataframe
adding labels and datetimes
updating config
creating features


#### Load and Explore Data

In [7]:
import torch

features = Featureset('datasets/yc_small')
features.load_dataset(tag=1)
                          
dataset = GroupDataset(features.test_features)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=2, collate_fn = GroupDataCollator(features.config))
data = next(iter(data_loader))

{k:v.shape for k,v in data.items()}

{'label': torch.Size([2, 1]),
 'month': torch.Size([2, 49]),
 'day': torch.Size([2, 49]),
 'wday': torch.Size([2, 49]),
 'oddtime': torch.Size([2, 49, 1]),
 'eventime': torch.Size([2, 49, 1]),
 'category': torch.Size([2, 49]),
 'duration': torch.Size([2, 49, 1]),
 'cid': torch.Size([2, 49]),
 'nsessbuy': torch.Size([2, 49, 1]),
 'nbuysess': torch.Size([2, 49, 1]),
 'price': torch.Size([2, 49, 1]),
 'totbuys': torch.Size([2, 49, 1]),
 'totclicks': torch.Size([2, 49, 1]),
 'totdurs': torch.Size([2, 49, 1]),
 'seq_mask': torch.Size([2, 49])}

In [8]:
{k:features.config[k].size for k,v in data.items() if k in features.config}

{'label': 2,
 'month': 7,
 'day': 32,
 'wday': 8,
 'oddtime': 1,
 'eventime': 1,
 'category': 14,
 'duration': 1,
 'cid': 8440,
 'nsessbuy': 1,
 'nbuysess': 1,
 'price': 1,
 'totbuys': 1,
 'totclicks': 1,
 'totdurs': 1}

# (2) Explore Group Model

In [11]:
from src.modules import MergeLayer, FlattenLayer, GroupLayer, GroupModel
from src.modules import NonLinear, EmbeddingNonLinear
from src.modules import GRU, EmbeddingGRU

In [12]:
model = GroupModel({
    'merge_1':MergeLayer({
        'month':('month',),
        'day':('day',),
        'wday':('wday',),
        'time':('oddtime', 'eventime'),
        'duration':('duration',),
        'popularity':('nsessbuy', 'nbuysess', 'totbuys', 'totclicks', 'totdurs'),
        'price':('price',),
        'cid':('cid',),
        'seq_mask':('seq_mask',)
    }),
    
    'embed':GroupLayer({
        'month':EmbeddingGRU(7, 4, 2, 4),
        'day':EmbeddingGRU(32, 8, 2, 8),
        'wday':EmbeddingGRU(8, 4, 2, 4),
        'time':GRU(2, 4, 4),
        'duration':GRU(1, 2, 4),
        'popularity':GRU(5, 8, 8),
        'price':GRU(1, 2, 4),
        'cid':EmbeddingGRU(8440, 128, 4, 12)
    }), 
    
    'merge_2':MergeLayer({
        'user':('month', 'day', 'wday', 'time', 'duration'),
        'item':('popularity', 'price', 'cid')
    }),
    
    'groups_1':GroupLayer({
        'user':NonLinear(24, 16, 8),
        'item':NonLinear(24, 16, 8),
    }),
    
    'flat':FlattenLayer('item'),
    
    'proj':MergeLayer({
        'pred':('user', 'item'),
    }), 
    
    'pred':GroupLayer({
        'pred':NonLinear(16, 32, 2)
    })
})

model.eval()
pass;

#### Example Prediction

In [13]:
model(data)

tensor([[-0.0994, -0.0606],
        [-0.0971, -0.0703]], grad_fn=<AddmmBackward>)

#### Run data through first layer 'merge_1'

In [26]:
{k:v.shape for k,v in model.layers['merge_1'](data).items()}

{'month': torch.Size([2, 49]),
 'day': torch.Size([2, 49]),
 'wday': torch.Size([2, 49]),
 'time': torch.Size([2, 49, 2]),
 'duration': torch.Size([2, 49, 1]),
 'popularity': torch.Size([2, 49, 5]),
 'price': torch.Size([2, 49, 1]),
 'cid': torch.Size([2, 49]),
 'seq_mask': torch.Size([2, 49])}

#### Run data through first two layers 'merge_1', 'embed'

In [15]:
{k:v.shape for k,v in model.layers['embed'](model.layers['merge_1'](data)).items()}

{'month': torch.Size([2, 1, 4]),
 'day': torch.Size([2, 1, 8]),
 'wday': torch.Size([2, 1, 4]),
 'time': torch.Size([2, 1, 4]),
 'duration': torch.Size([2, 1, 4]),
 'popularity': torch.Size([2, 1, 8]),
 'price': torch.Size([2, 1, 4]),
 'cid': torch.Size([2, 1, 12])}

#### Run 'cid' through 'cid' embedding of second layer

In [19]:
data_cid = model.layers['merge_1'](data)['cid']
model.layers['embed'].blocks['cid'](data_cid, data['seq_mask'])

tensor([[[ 0.0389,  0.1014, -0.0509, -0.1742,  0.1346, -0.2221,  0.0153,
          -0.3181,  0.1523,  0.0308, -0.3138, -0.2741]],

        [[-0.0101, -0.0073,  0.0860, -0.2701,  0.2793, -0.1145, -0.0440,
          -0.1686,  0.1211, -0.0094, -0.2826, -0.2399]]],
       grad_fn=<MeanBackward1>)

# (3) Train Group Model

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import datetime

In [23]:
features = Featureset('datasets/yc_small')
features.load_dataset(tag=1)

trainloader = torch.utils.data.DataLoader(
    GroupDataset(features.train_features), batch_size=100, shuffle=True, 
    collate_fn=GroupDataCollator(features.config)
)

testloader = torch.utils.data.DataLoader(
    GroupDataset(features.test_features), batch_size=100, shuffle=False, 
    collate_fn=GroupDataCollator(features.config)
)

In [24]:
print_every = 2
epochs = 10

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
criterion = nn.CrossEntropyLoss()

In [25]:
time_start = datetime.datetime.now()
print('Start Time: %s'%time_start.strftime('%H:%M:%S'))

for epoch in range(0, epochs):
    epoch_start = datetime.datetime.now()
    model.train();
    train_loss = 0.0
    test_loss = 0.0
    train_nbatches = 0
    test_nbatches = 0
    
    for inputs in trainloader:
        preds = model(inputs)

        loss = criterion(preds, inputs['label'].squeeze())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        train_loss += float(loss)
        train_nbatches += 1
    
    model.eval();
    with torch.no_grad():
        for inputs in testloader:
            preds = model(inputs)
            loss = criterion(preds, inputs['label'].squeeze())
            test_loss += float(loss)
            test_nbatches += 1

    train_loss/=train_nbatches
    test_loss/=test_nbatches
    
    if epoch%print_every == 0:
        print('Epoch {} || Train Loss: {:.3f} || Test Loss: {:.3f}'.format(
            str(epoch).zfill(3), train_loss, test_loss)
             )
time_finish = datetime.datetime.now()
print('End Time: %s'%time_finish.strftime('%H:%M:%S'))
print('Completed in %s seconds'%(time_finish-time_start).total_seconds())
pass;

Start Time: 11:22:39
Epoch 000 || Train Loss: 0.606 || Test Loss: 0.505
Epoch 002 || Train Loss: 0.531 || Test Loss: 0.494
Epoch 004 || Train Loss: 0.425 || Test Loss: 0.527
Epoch 006 || Train Loss: 0.360 || Test Loss: 0.536
Epoch 008 || Train Loss: 0.311 || Test Loss: 0.608
End Time: 11:26:14
Completed in 215.405964 seconds


In [None]:
model.eval()
pass;