In [1]:
from typing import Dict, List, Union, Tuple, NamedTuple
from collections import OrderedDict
import tqdm
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch import nn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
movie_data_path =  "/Volumes/ExFAT/dataset/ml-1m/movies.dat"
user_data_path = "/Volumes/ExFAT/dataset/ml-1m/users.dat"
ratings_data_path = "/Volumes/ExFAT/dataset/ml-1m/ratings.dat"

# Dataset

In [3]:
user_columns = ["gender", "age", "occupation", "zipCode"] 
movie_columns = ['title', 'genres']
user_df = pd.read_csv(user_data_path, sep="::", header=None, engine="python", names=user_columns)
item_df = pd.read_csv(movie_data_path, sep="::", header=None, engine="python", names=movie_columns).drop('title', axis=1)

rate_df = pd.read_csv(ratings_data_path, sep="::", engine="python", header=None, names=['userId', 'movieId', 'rating', 'timestamp']).drop('timestamp', axis=1) 

rate_df = rate_df.merge(user_df, left_on=['userId'], right_index=True, how='inner')
rate_df = rate_df.merge(item_df, left_on=['movieId'], right_index=True, how='inner').reset_index(drop=True)

In [4]:
rate_df.iloc[:10]

Unnamed: 0,userId,movieId,rating,gender,age,occupation,zipCode,genres
0,1,1193,5,F,1,10,48067,Drama
1,2,1193,5,M,56,16,70072,Drama
2,12,1193,4,M,25,12,32793,Drama
3,15,1193,4,M,25,7,22903,Drama
4,17,1193,5,M,50,1,95350,Drama
5,18,1193,4,F,18,3,95825,Drama
6,19,1193,5,M,1,10,48073,Drama
7,24,1193,5,F,25,7,10023,Drama
8,28,1193,3,F,25,1,14607,Drama
9,33,1193,5,M,45,3,55421,Drama


In [16]:
target_col = ['rating']
feature_cols = ['userId', 'movieId', 'gender', 'age', 'occupation', 'zipCode']
X = rate_df[feature_cols].copy()
Y = rate_df[target_col].copy()

# Model input

## wide featurs

In [6]:
from typing import Dict, List, Union, Tuple

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

In [7]:
class CrossFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, cross_col_pairs: List[Tuple[str, str]]):
        self.cross_col_pairs = cross_col_pairs
        
    def fit(self, df: pd.DataFrame, y=None):
        self.unique_columns_ = set()
        for pair in self.cross_col_pairs:
            self.unique_columns_.update(list(pair))
        
        self.crossed_colnamed_ = []
        
        for cols in self.cross_col_pairs:
            cols = list(cols)
            new_colname = "_".join(cols)
            self.crossed_colnamed_.append(new_colname)
        return self
        
    def transform(self, df: pd.DataFrame):
            df_cross = df[self.unique_columns_].copy().astype(str)
            
            for cols in self.cross_col_pairs:
                cols = list(cols)
                new_colname = "_".join(cols)
                df_cross[new_colname] = df_cross[cols[0]] + \
                    '-' + df_cross[cols[1]]
            return df_cross[self.crossed_colnamed_]
           
class WideFeaturesGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, wide_cols: List[str], cross_col_pairs=None):
        self.wide_cols = wide_cols
        self.cross_col_pairs = cross_col_pairs

        self.cross_feature_transformer = CrossFeatures(self.cross_col_pairs)
    
    def fit(self, df: pd.DataFrame, y=None):
        self.cross_feature_transformer_ = self.cross_feature_transformer.fit(df)
        self.crossed_colnamed_ = self.cross_feature_transformer_.crossed_colnamed_

        df_wide = self._generate_wide_features(df)
        self.wide_columns_ = df_wide.columns.tolist()
        self.feature_dict_ = self._generate_global_feature_dict(df_wide)
        return self

    def transform(self, df: pd.DataFrame):
        
        df_wide = self._generate_wide_features(df)
        
        encoded = np.zeros([len(df_wide), len(self.wide_columns_)], dtype=np.long)
        
        for i , col in enumerate(self.wide_columns_):
            encoded[:, i] = df_wide[col].apply(lambda x: self.feature_dict_[col +  '_' + str(x)])
        return encoded.astype('int64')
        

    def _generate_global_feature_dict(self, df: pd.DataFrame):
        columns = df.columns.tolist()
        all_col_value = []
        for col in columns:
            unique_value = [col + '_' + str(x) for x in df[col].unique()]
            all_col_value.extend(unique_value)
        return {v: i + 1 for i, v in enumerate(all_col_value)}


    
    def _generate_wide_features(self, df):
        df_cross = self.cross_feature_transformer_.transform(df)
        df_wide = pd.concat([df[self.wide_cols], df_cross], axis=1)
        return df_wide    


In [8]:
wide_cols = ['gender', 'age', 'occupation', 'zipCode']
crossed_cols = [('gender', 'age'), ('gender', 'occupation'), ('age', 'occupation')]
wideGenerator = WideFeaturesGenerator(wide_cols, crossed_cols)
x_wide = wideGenerator.fit_transform(X)
x_wide.shape

(1000209, 7)

In [9]:
print(wideGenerator.wide_columns_)
print(len(wideGenerator.feature_dict_))

['gender', 'age', 'occupation', 'zipCode', 'gender_age', 'gender_occupation', 'age_occupation']
3659


## deep features

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
class LabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_encode: List[str]):
        self.columns_to_encode = columns_to_encode

    def fit(self, df: pd.DataFrame, y=None):
        df_ = df[self.columns_to_encode].copy()

        for col in self.columns_to_encode:
            df_[self.columns_to_encode] = df[self.columns_to_encode].astype(
                'str')

        unique_column_vals = {col: df_[col].unique()
                              for col in self.columns_to_encode}

        self.encoding_dict_ = dict()

        for k, v in unique_column_vals.items():
            self.encoding_dict_[k] = {val: idx for idx, val in enumerate(v)}
            self.encoding_dict_[k]['unseen'] = len(self.encoding_dict_[k])

        return self

    def transform(self, df: pd.DataFrame):
        try:
            self.encoding_dict_
        except AttributeError:
            raise NotFittedError(
                "This LabelEncoder instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this LabelEncoder."
            )
        df_ = df.copy()
        df_[self.columns_to_encode] = df_[self.columns_to_encode].astype('str')

        for col, encoding_map in self.encoding_dict_.items():
            original_value = [f for f in encoding_map.keys() if f != 'unseen']
            df_[col] = np.where(df_[col].isin(
                original_value), df_[col], 'unseen')
            df_[col] = df_[col].apply(lambda x: encoding_map[x])
        return df_


class DeepFeaturesGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, embed_cols: List[str], continuous_cols: List[str]):
        # category type
        self.embed_cols = embed_cols
        self.continuous_cols = continuous_cols 

        self.deep_cols = embed_cols + continuous_cols

    def fit(self, df: pd.DataFrame, y=None):
        label_encoder = LabelEncoder(self.embed_cols)
        self.label_encoder_ = label_encoder.fit(df)
        self.embed_cols_unique_labels_ = {col: len(v) for col, v in self.label_encoder_.encoding_dict_.items()}

        if self.continuous_cols:
            df_continuous = df[self.continuous_cols].copy()
            self.scalar_ = StandardScaler().fit(df_continuous.values.astype('float'))
        return self

    def transform(self, df: pd.DataFrame):
        df_embed = df[self.embed_cols].copy()
        df_continuous = df[self.continuous_cols].copy()

        df_embed = self.label_encoder_.transform(df_embed)
        if self.continuous_cols:
            df_continuous[self.continuous_cols] = self.scalar_.transform(df_continuous.values.astype('float'))
            df_deep = pd.concat([df_embed, df_continuous], axis=1)[self.deep_cols]
        else:
            df_deep = df_embed[self.deep_cols]
        return df_deep.values

In [12]:
for col in X:
    print(col, len(X[col].unique()))

userId 6040
movieId 3706
gender 2
age 7
occupation 21
zipCode 3439


In [13]:
category_embed_dim_mapping = {
    'userId': 50,
    'movieId': 50,
    'gender': 2,
    'age':2,
    'occupation': 5,
    'zipCode': 20
}
category_cols = list(category_embed_dim_mapping.keys())
continuous_cols = []
deep_generator = DeepFeaturesGenerator(category_cols, continuous_cols)
df_deep = X[category_cols + continuous_cols].copy()
x_deep = deep_generator.fit_transform(X)
x_deep.shape

(1000209, 6)

In [14]:
deep_generator.embed_cols_unique_labels_

{'userId': 6041,
 'movieId': 3707,
 'gender': 3,
 'age': 8,
 'occupation': 22,
 'zipCode': 3440}

##  model dataset

In [17]:
import torch

In [18]:
class WideDeepDataset(torch.utils.data.Dataset):
    def __init__(self, x_wide, x_deep, target):
        assert(x_wide.shape[0] == x_deep.shape[0] == target.shape[0])
        self.x_wide = x_wide
        self.x_deep = x_deep
        self.target = target
        
    def __getitem__(self, index):
        return self.x_wide[index], self.x_deep[index], self.target[index]
    
    def __len__(self):
        return self.target.shape[0]

In [19]:
%%time
target_y = Y.values.copy()
target_y = (target_y > 3).astype(int)

CPU times: user 5.5 ms, sys: 3.19 ms, total: 8.69 ms
Wall time: 7.37 ms


In [20]:
x_deep = x_deep
x_wide = x_wide
x_wide.shape, x_deep.shape, target_y.shape

((1000209, 7), (1000209, 6), (1000209, 1))

# Wide & deep Model

In [21]:
from typing import Dict, List, Union, Tuple
import math
import torch
from torch import nn
import torch.nn.functional as F


In [22]:
class Wide(nn.Module):
    def __init__(self, wide_dim: int, predict_dim: int=1):
        super().__init__()
        self.linear = nn.Embedding(wide_dim + 1, predict_dim, padding_idx=0) # reserve 1 dim for unseen cross feature
        self.bias = nn.Parameter(torch.zeros(predict_dim))
        self._reset_parameters()

    def _reset_parameters(self):
        nn.init.kaiming_normal_(self.linear.weight, a=math.sqrt(5))
        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.linear.weight)
        bound = 1 / math.sqrt(fan_in)
        nn.init.uniform_(self.bias, -bound, bound)
    
    def forward(self, X: torch.Tensor):

        # X [b_size, num_of_wide_features]
        return self.linear(X.long()).sum(dim=1) + self.bias # [b_size, predict_dim]


class Deep(nn.Module):
    EMBEDDING_LAYER_PREFIX = "emb_layer"
    DENSE_LAYER_PREFIX = "dense_layer"

    def __init__(self,
                 columns_index: Dict[str, int],
                 embed_cols_info: List[Tuple[str, int, int]],  # (col_name, label_size, embeding_dim)
                 continuous_cols: List[str],
                 hidden_layer_neural: List[int],
                 hidden_layer_dropout: List[float],
                 embed_col_dropout: float=0.0
                 ):
        super().__init__()
        self.columns_index = columns_index
        self.embed_cols_info = embed_cols_info
        self.continuous_cols = continuous_cols

        self.embed_layers = self._create_embed_layers(embed_cols_info)
        self.embed_dropout_layer = nn.Dropout(embed_col_dropout)

        self.hidden_layer_neural = self._update_hidden_layer_neural(
            hidden_layer_neural)

        self.dense_layer = self._create_dense_layer(hidden_layer_dropout)
        self.output_dim = hidden_layer_neural[-1]

    def _create_embed_layers(self, embed_cols_info: List[Tuple[str, int, int]]):
        return nn.ModuleDict({self.EMBEDDING_LAYER_PREFIX + '_' + col_name.replace(".", '_'): nn.Embedding(num_label, dim) for col_name, num_label, dim in embed_cols_info})

    def _create_dense_layer(self, hidden_layer_dropout):
        dense_dequential = nn.Sequential()
        for i in range(1, len(self.hidden_layer_neural)):
            dense_dequential.add_module(
                "{}_{}".format(self.DENSE_LAYER_PREFIX, i - 1),
                self._create_dense_component(
                    self.hidden_layer_neural[i-1], self.hidden_layer_neural[i], hidden_layer_dropout[i-1], True)
            )
        return dense_dequential

    def _update_hidden_layer_neural(self, hidden_layer_neurals: List[int]):
        embed_dim = sum([embed[2] for embed in self.embed_cols_info])
        continuous_dim = len(self.continuous_cols)
        return [embed_dim + continuous_dim] + hidden_layer_neurals

    def _create_dense_component(self, input_dim: int, output_dim: int, dropout_ratio: float=0.0, batch_norm=False):
        layers = [
            nn.Linear(input_dim, output_dim),
            nn.LeakyReLU(inplace=True)
        ]
        if batch_norm:
            layers.append(nn.BatchNorm1d(output_dim))
        layers.append(nn.Dropout(dropout_ratio))
        return nn.Sequential(*layers)

    def __get_embeding_layer(self, embed_col):
        embed_col = self.EMBEDDING_LAYER_PREFIX + '_' + embed_col.replace('.', '_')
        return self.embed_layers[embed_col]
        
    def forward(self, deep_input_x: torch.Tensor):
        embed_x = [
           self.__get_embeding_layer(col)(deep_input_x[:, self.columns_index[col]].long())
            for col, _, _ in self.embed_cols_info
        ]

        embed_x = torch.cat(embed_x, 1)

        continuous_cols_idx = [self.columns_index[col]
                              for col in self.continuous_cols]
        continuous_x = deep_input_x[:, continuous_cols_idx].float()

        x = torch.cat([embed_x, continuous_x], dim=1)
        return self.dense_layer(x)  # [b_size, hidden_layer_last_dim]


In [23]:
class WideDeep(nn.Module):
    def __init__(self, wide: nn.Module, deep: nn.Module):
        super().__init__()
        
        deep = nn.Sequential(
            deep,
            nn.Linear(deep.output_dim, 1)
        )
        self.wide_deep = nn.ModuleDict({
            "wide": wide,
            "deep": deep
        })

    def forward(self, x_wide: torch.Tensor, x_deep: torch.Tensor):
        wide_out = self.wide_deep['wide'](x_wide)  # [b_size, 1]
        deep_out = self.wide_deep['deep'](x_deep)  # [b_size, 1]
        out = wide_out + deep_out
        return out.view(-1)
    
    @torch.no_grad()
    def predict(self, x_wide: torch.Tensor, x_deep: torch.Tensor, threshold: int=0.5):
        logistic = self.predict_probs(x_wide, x_deep)
        return (logistic > threshold).int()
    
    @torch.no_grad()
    def predict_probs(self, x_wide: torch.Tensor, x_deep: torch.Tensor):
        out = self.forward(x_wide, x_deep)
        return torch.sigmoid(out.view(-1).float())

        
        

# Training stage

In [24]:
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [25]:
x_wide.shape, x_deep.shape, target_y.shape

((1000209, 7), (1000209, 6), (1000209, 1))

In [26]:
train_x_wide, test_x_wide, train_x_deep, test_x_deep, train_y, test_y = train_test_split(x_wide, x_deep, target_y, test_size=0.2)
train_x_wide.shape, train_x_deep.shape, train_y.shape

((800167, 7), (800167, 6), (800167, 1))

In [27]:
test_x_wide = torch.from_numpy(test_x_wide)
test_x_deep =  torch.from_numpy(test_x_deep)
test_y = torch.from_numpy(test_y)

## evaluation func

In [28]:
@torch.no_grad()
def cal_metrics(model, X: torch.Tensor, Y: torch.Tensor)-> Dict[str, float]:
    model.eval()
    x_wide, x_deep = X

    y_target = Y.detach().numpy()
    y_prob = model.predict_probs(x_wide, x_deep).detach().numpy()
    y_pre_label = (y_prob > 0.5).astype(int)
    tn, fp, fn, tp  = confusion_matrix(y_target, y_pre_label).ravel()
        
    precision = (tp) / (tp + fp)
    recall = (tp) / (tp + fn)
    acc = (tp + tn) / (tn + fp + fn + tp)
    
    auc = metrics.roc_auc_score(y_target, y_prob)

    model.train()
    return {'prec': precision, 'recall': recall, 'acc': acc, 'auc': auc}
    
def validation_step(model: nn.Module, X: torch.Tensor, Y: torch.Tensor, loss_fn)-> Tuple[float, Dict[str, float]]:
    model.eval()
    with torch.no_grad():
        x_wide, x_deep = X
        y_pre = model(x_wide, x_deep)
        loss = loss_fn(y_pre.view(-1).float(), Y.float().view(-1))  # mean loss
    
    model.train()
    return loss.item(), cal_metrics(model, X, Y)

## training with AdamW

In [29]:
train_data_set = WideDeepDataset(train_x_wide, train_x_deep, train_y)
train_data_loader = torch.utils.data.DataLoader(train_data_set, batch_size=300)

In [30]:
embed_cols_info = [(col, deep_generator.embed_cols_unique_labels_[col], embed_dim)for col, embed_dim in category_embed_dim_mapping.items()]
deep_column_idx = {col: i for i, col in enumerate(deep_generator.deep_cols)}
hidden_layers = [100, 64, 32]
drop_out = [0.2, 0.2, 0.2]

In [31]:
embed_cols_info

[('userId', 6041, 50),
 ('movieId', 3707, 50),
 ('gender', 3, 2),
 ('age', 8, 2),
 ('occupation', 22, 5),
 ('zipCode', 3440, 20)]

In [32]:
wide = Wide(wide_dim=np.unique(x_wide).shape[0], predict_dim=1)
deep = Deep(deep_column_idx, embed_cols_info, continuous_cols, hidden_layers, drop_out, 0.2)
wide_deep = WideDeep(wide, deep)

criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
general_optimizer = torch.optim.AdamW(wide_deep.parameters(),  lr=0.001, weight_decay=1e-6)

In [33]:

log_interval = 500
epoch = 6
count = 0
for epoch_i in range(epoch):
    total_loss = 0
   
    tk0 = tqdm.tqdm(train_data_loader, smoothing=0, mininterval=1.0)
    for i, (x_w, x_d, target_y) in enumerate(tk0):
        wide_deep.train()
        y = wide_deep(x_w, x_d)
        loss = criterion(y.view(-1), target_y.float().view(-1))
        
        wide_deep.zero_grad()
        loss.backward()
        general_optimizer.step()
        
        total_loss += loss.item()
        count += 1
        
        if count % log_interval == 0:
            valid_loss, score = validation_step(wide_deep, (test_x_wide, test_x_deep), test_y, criterion)
            tk0.set_postfix(train_loss=total_loss/log_interval, valid_loss=valid_loss, metrics={k: np.round(v, 4) for k, v in score.items()})
            total_loss = 0

100%|██████████| 2668/2668 [02:04<00:00, 21.36it/s, metrics={'prec': 0.7153, 'recall': 0.7992, 'acc': 0.7019, 'auc': 0.7586}, train_loss=0.593, valid_loss=0.574]
100%|██████████| 2668/2668 [01:41<00:00, 26.27it/s, metrics={'prec': 0.737, 'recall': 0.8025, 'acc': 0.722, 'auc': 0.7848}, train_loss=0.553, valid_loss=0.548] 
100%|██████████| 2668/2668 [01:48<00:00, 24.56it/s, metrics={'prec': 0.7297, 'recall': 0.8277, 'acc': 0.7248, 'auc': 0.79}, train_loss=0.542, valid_loss=0.542]  
100%|██████████| 2668/2668 [02:12<00:00, 20.16it/s, metrics={'prec': 0.7334, 'recall': 0.8221, 'acc': 0.7261, 'auc': 0.7922}, train_loss=0.536, valid_loss=0.539]
100%|██████████| 2668/2668 [02:27<00:00, 18.14it/s, metrics={'prec': 0.7357, 'recall': 0.8191, 'acc': 0.727, 'auc': 0.793}, train_loss=0.53, valid_loss=0.538]   
100%|██████████| 2668/2668 [01:51<00:00, 23.89it/s, metrics={'prec': 0.732, 'recall': 0.8274, 'acc': 0.7268, 'auc': 0.7935}, train_loss=0.528, valid_loss=0.538] 


In [34]:
cal_metrics(wide_deep, (test_x_wide, test_x_deep), test_y)

{'prec': 0.7346040827749187,
 'recall': 0.8222365306974802,
 'acc': 0.7272072864698413,
 'auc': 0.793531092886336}

# Online Serving

## Feature Provider

In [35]:
class FeatureProvider:
    def __init__(self, df: pd.DataFrame, index_col_name):
        self.df = df.copy()
        self.index_col_name = index_col_name
    
    def query_features(self, index: List[int], cols: List[str])-> pd.DataFrame:
        df = self.df
        return df[df[self.index_col_name].isin(index)][cols].copy()

In [36]:
user_df = pd.read_csv(user_data_path, sep="::", header=None, engine="python", names=user_columns)
user_df = user_df.reset_index().rename(columns={'index': 'userId'})

item_df = pd.read_csv(movie_data_path, sep="::", header=None, engine="python", names=movie_columns)
item_df = item_df.reset_index().rename(columns={'index': 'movieId'})

user_feature_provider = FeatureProvider(user_df, 'userId')
item_feature_provider = FeatureProvider(item_df, 'movieId')

In [37]:
user_feature_provider.query_features(index=[5, 10], cols=['age', 'gender'])

Unnamed: 0,age,gender
4,25,M
9,35,F


## Embedding Provider

In [38]:
class EmbeddingProvider:
    def __init__(self, embedding_dict: nn.ModuleDict, prefix='emb_layer'):
        self.prefix= prefix
        self.embedding_dict = embedding_dict
        
    def query_embedding(self, batch_labels:np.array, label_order:List[str])-> torch.Tensor:
        # batch_label: [b_size, num of labels]
        label_order = [self.prefix + '_' + str(label) for label in label_order]
        
        batch_labels = torch.from_numpy(batch_labels).long()
        embed_X = [
           self.embedding_dict[label](batch_labels[:, idx]) for idx, label in enumerate(label_order)
        ]
        return torch.cat(embed_X, 1) # [b_size, sum of all embedding dims]
        

In [39]:
embedding_provider = EmbeddingProvider(wide_deep.wide_deep['deep'][0].embed_layers, 'emb_layer')

In [40]:
embedding_provider.embedding_dict

ModuleDict(
  (emb_layer_age): Embedding(8, 2)
  (emb_layer_gender): Embedding(3, 2)
  (emb_layer_movieId): Embedding(3707, 50)
  (emb_layer_occupation): Embedding(22, 5)
  (emb_layer_userId): Embedding(6041, 50)
  (emb_layer_zipCode): Embedding(3440, 20)
)

## Ranker

In [41]:
class WideDeepOnlineRanker(nn.Module):
    def __init__(self, offlineWideDeep: nn.Module):
        super().__init__()
        self.deep_dense = self._fetch_deep_dense(offlineWideDeep)
        self.wide_part = offlineWideDeep.wide_deep['wide']
    
    def _fetch_deep_dense(self, offlineWideDeep):
        deep_dense = []
        for dense_layer in offlineWideDeep.wide_deep['deep'][0].dense_layer:
            new_layer = nn.Sequential(*[d for d in dense_layer if not isinstance(d, nn.Dropout)])
            deep_dense.append(new_layer)
        deep_dense.append(offlineWideDeep.wide_deep['deep'][1])

        return nn.Sequential(*deep_dense)
    

    def forward(self, wide_x: torch.Tensor, deep_x_embedding: torch.Tensor):
        wide_output = self.wide_part(wide_x.long())
        deep_output = self.deep_dense(deep_x_embedding)
        
        
        return (wide_output + deep_output).view(-1)
    
    @torch.no_grad()
    def scoring(self, wide_x: torch.Tensor, deep_x_embedding: torch.Tensor):
        self.eval()
        return self.forward(wide_x, deep_x_embedding).view(-1).float()
    
    @torch.no_grad()
    def predict_probs(self, wide_x: torch.Tensor, deep_x_embedding: torch.Tensor):
        self.eval()
        out = self.forward(wide_x, deep_x_embedding)
        return torch.sigmoid(out.view(-1)).float()
        
        
    
    

In [42]:
ranker = WideDeepOnlineRanker(wide_deep)

## Ranking Stage

In [56]:
user_id = 1000
match_itemId = np.random.choice(item_df['movieId'], 30)
match_itemId

array([1496, 1643, 2040, 3515, 1711, 2822, 1959,   35, 3023, 2651, 2776,
       2095,  245, 3285, 2926, 2952, 3890, 3208, 2841, 3221,  256, 1381,
       1937, 1885, 3132,  729,  190, 1407, 1894, 3947])

In [57]:
query_user_features_cols = ['userId', 'gender', 'age', 'occupation', 'zipCode']
query_item_features_cols = ['movieId']

user_primitive_feature = user_feature_provider.query_features([user_id], query_user_features_cols)
item_primitive_feature = item_feature_provider.query_features(list(match_itemId), query_item_features_cols)

user_primitive_feature['join'] = 1
item_primitive_feature['join'] = 1
primitive_features = user_primitive_feature.merge(item_primitive_feature, on = ['join']).drop('join', axis=1)
processed_wide_features = wideGenerator.transform(primitive_features)
processed_deep_features = deep_generator.transform(primitive_features)


embedding_label_order = ['userId', 'movieId', 'gender', 'age', 'occupation', 'zipCode']
embedding_features = embedding_provider.query_embedding(processed_deep_features, embedding_label_order)

processed_wide_features = torch.from_numpy(processed_wide_features).long()

In [58]:
scores = ranker.predict_probs(processed_wide_features, embedding_features)

sorted_index = np.argsort(scores.detach().numpy(), axis=0)[::-1]
for idx in sorted_index:
    movieId = match_itemId[idx]
    title = item_feature_provider.query_features([movieId], ['title'])['title'].item()
    
    print("movieId:{} \t '{}' \t score:{}".format(movieId, title,  round(scores[idx].item(), 3)))

movieId:3023 	 'My Best Girl (1927)' 	 score:0.826
movieId:1937 	 'Going My Way (1944)' 	 score:0.815
movieId:1711 	 'Midnight in the Garden of Good and Evil (1997)' 	 score:0.788
movieId:2776 	 'Marcello Mastroianni: I Remember Yes, I Remember (1997)' 	 score:0.774
movieId:1381 	 'Grease 2 (1982)' 	 score:0.773
movieId:245 	 'Glass Shield, The (1994)' 	 score:0.769
movieId:1643 	 'Mrs. Brown (Her Majesty, Mrs. Brown) (1997)' 	 score:0.763
movieId:3208 	 'Loaded Weapon 1 (1993)' 	 score:0.735
movieId:1885 	 'Opposite of Sex, The (1998)' 	 score:0.717
movieId:3221 	 'Draughtsman's Contract, The (1982)' 	 score:0.703
movieId:3285 	 'Beach, The (2000)' 	 score:0.697
movieId:1959 	 'Out of Africa (1985)' 	 score:0.644
movieId:729 	 'Institute Benjamenta, or This Dream People Call Human Life (1995)' 	 score:0.642
movieId:3890 	 'Back Stage (2000)' 	 score:0.637
movieId:1496 	 'Anna Karenina (1997)' 	 score:0.621
movieId:256 	 'Junior (1994)' 	 score:0.59
movieId:2651 	 'Frankenstein Meets t