In [2]:
import sys
import pandas as pd
import numpy as np

In [3]:
sys.path.append("/Volumes/MacUSB/Macbook/08 USB/OpenSource/pytorch-widedeep")

# Data preparation

In [4]:
data_path = "/Volumes/ExFAT/dataset/adult.csv"
df = pd.read_csv(data_path)
len(df)

48842

In [5]:
df.iloc[:5]

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [6]:
%%time
df.columns = df.columns.map(lambda c: c.replace('-', '_'))

CPU times: user 305 µs, sys: 0 ns, total: 305 µs
Wall time: 311 µs


In [7]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational_num',
       'marital_status', 'occupation', 'relationship', 'race', 'gender',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')

## wide features

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
from typing import Dict, List, Union, Tuple

In [9]:
class CrossFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, cross_col_pairs: List[Tuple[str, str]]):
        self.cross_col_pairs = cross_col_pairs
        
    def fit(self, df: pd.DataFrame, y=None):
        self.unique_columns_ = set()
        for pair in self.cross_col_pairs:
            self.unique_columns_.update(list(pair))
        
        self.crossed_colnamed_ = []
        
        for cols in self.cross_col_pairs:
            cols = list(cols)
            new_colname = "_".join(cols)
            self.crossed_colnamed_.append(new_colname)
        return self
        
    def transform(self, df: pd.DataFrame):
            df_cross = df[self.unique_columns_].copy()
            
            for cols in self.cross_col_pairs:
                cols = list(cols)
                new_colname = "_".join(cols)
                df_cross[new_colname] = df_cross[cols[0]] + \
                    '-' + df_cross[cols[1]]
            return df_cross[self.crossed_colnamed_]
           
class WideFeaturesGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, wide_cols: List[str], cross_col_pairs=None):
        self.wide_cols = wide_cols
        self.cross_col_pairs = cross_col_pairs

        self.cross_feature_transformer = CrossFeatures(self.cross_col_pairs)
    
    def fit(self, df: pd.DataFrame, y=None):
        self.cross_feature_transformer_ = self.cross_feature_transformer.fit(df)
        self.crossed_colnamed_ = self.cross_feature_transformer_.crossed_colnamed_

        df_wide = self._generate_wide_features(df)
        self.wide_columns_ = df_wide.columns.tolist()
        self.feature_dict_ = self._generate_global_feature_dict(df_wide)
        return self

    def transform(self, df: pd.DataFrame):
        
        df_wide = self._generate_wide_features(df)
        
        encoded = np.zeros([len(df_wide), len(self.wide_columns_)], dtype=np.long)
        
        for i , col in enumerate(self.wide_columns_):
            encoded[:, i] = df_wide[col].apply(lambda x: self.feature_dict_[col +  '_' + str(x)])
        return encoded.astype('int64')
        

    def _generate_global_feature_dict(self, df: pd.DataFrame):
        columns = df.columns.tolist()
        all_col_value = []
        for col in columns:
            unique_value = [col + '_' + str(x) for x in df[col].unique()]
            all_col_value.extend(unique_value)
        return {v: i + 1 for i, v in enumerate(all_col_value)}


    
    def _generate_wide_features(self, df):
        df_cross = self.cross_feature_transformer_.transform(df)
        df_wide = pd.concat([df[self.wide_cols], df_cross], axis=1)
        return df_wide    


In [10]:
wide_cols = ['education', 'relationship','workclass','occupation','native_country','gender']
crossed_cols = [('education', 'occupation'), ('native_country', 'occupation')]


In [11]:
wideGenerator = WideFeaturesGenerator(wide_cols, crossed_cols)
x_wide = wideGenerator.fit_transform(df)
x_wide.shape

(48842, 8)

In [12]:
x_wide

array([[  1,  17,  23, ...,  89,  91, 316],
       [  2,  18,  23, ...,  89,  92, 317],
       [  3,  18,  24, ...,  89,  93, 318],
       ...,
       [  2,  20,  23, ...,  90, 103, 323],
       [  2,  17,  23, ...,  89, 103, 323],
       [  2,  21,  29, ...,  90, 115, 324]])

## deep features

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
class LabelEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_encode: List[str]):
        self.columns_to_encode = columns_to_encode

    def fit(self, df: pd.DataFrame, y=None):
        df_ = df[self.columns_to_encode].copy()

        for col in self.columns_to_encode:
            df_[self.columns_to_encode] = df[self.columns_to_encode].astype(
                'str')

        unique_column_vals = {col: df_[col].unique()
                              for col in self.columns_to_encode}

        self.encoding_dict_ = dict()

        for k, v in unique_column_vals.items():
            self.encoding_dict_[k] = {val: idx for idx, val in enumerate(v)}
            self.encoding_dict_[k]['unseen'] = len(self.encoding_dict_[k])

        return self

    def transform(self, df: pd.DataFrame):
        try:
            self.encoding_dict_
        except AttributeError:
            raise NotFittedError(
                "This LabelEncoder instance is not fitted yet. "
                "Call 'fit' with appropriate arguments before using this LabelEncoder."
            )
        df_ = df.copy()
        df_[self.columns_to_encode] = df_[self.columns_to_encode].astype('str')

        for col, encoding_map in self.encoding_dict_.items():
            original_value = [f for f in encoding_map.keys() if f != 'unseen']
            df_[col] = np.where(df_[col].isin(
                original_value), df_[col], 'unseen')
            df_[col] = df_[col].apply(lambda x: encoding_map[x])
        return df_


class DeepFeaturesGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, embed_cols: List[str], continuous_cols: List[str]):
        # category type
        self.embed_cols = embed_cols
        self.continuous_cols = continuous_cols 

        self.deep_cols = embed_cols + continuous_cols

    def fit(self, df: pd.DataFrame, y=None):
        label_encoder = LabelEncoder(self.embed_cols)
        self.label_encoder_ = label_encoder.fit(df)
        self.embed_cols_unique_labels_ = {col: len(v) for col, v in self.label_encoder_.encoding_dict_.items()}

        df_continuous = df[self.continuous_cols].copy()
        self.scalar_ = StandardScaler().fit(df_continuous.values.astype('float'))
        return self

    def transform(self, df: pd.DataFrame):
        df_embed = df[self.embed_cols].copy()
        df_continuous = df[self.continuous_cols].copy()

        df_embed = self.label_encoder_.transform(df_embed)
        df_continuous[self.continuous_cols] = self.scalar_.transform(df_continuous.values.astype('float'))
        df_deep = pd.concat([df_embed, df_continuous], axis=1)[self.deep_cols]
        return df_deep.values

In [15]:
category_embed_dim_mapping = {
    'education': 10,
    'relationship': 8,
    'workclass': 10,
    'occupation': 10,
    'native_country': 10
}
category_cols = list(category_embed_dim_mapping.keys())
continuous_cols = ['age', 'hours_per_week']


In [16]:
deep_generator = DeepFeaturesGenerator(category_cols, continuous_cols)
df_deep = df[category_cols + continuous_cols].copy()
x_deep = deep_generator.fit_transform(df)
x_deep.shape

(48842, 7)

## Model input 

In [17]:
import torch

In [18]:
class WideDeepDataset(torch.utils.data.Dataset):
    def __init__(self, x_wide, x_deep, target):
        assert(x_wide.shape[0] == x_deep.shape[0] == target.shape[0])
        self.x_wide = x_wide
        self.x_deep = x_deep
        self.target = target
        
    def __getitem__(self, index):
        return self.x_wide[index], self.x_deep[index], self.target[index]
    
    def __len__(self):
        return self.target.shape[0]

In [19]:
df['income'].iloc[:5]

0    <=50K
1    <=50K
2     >50K
3     >50K
4    <=50K
Name: income, dtype: object

In [20]:
%%time
target_y = df['income'].values.copy()
target_y = (target_y == '>50K').astype(int)

CPU times: user 2.04 ms, sys: 518 µs, total: 2.55 ms
Wall time: 2.16 ms


In [42]:
x_deep = x_deep
x_wide = x_wide
x_wide.shape, x_deep.shape, target_y.shape

((48842, 8), (48842, 7), (48842,))

#### debug

In [43]:
dataset = WideDeepDataset(x_deep, x_wide, target_y)

In [44]:
len(dataset)

48842

# Define model graph

In [63]:
from typing import Dict, List, Union, Tuple
import math
import torch
from torch import nn
import torch.nn.functional as F


In [64]:
class Wide(nn.Module):
    def __init__(self, wide_dim: int, predict_dim: int=1):
        super().__init__()
        self.linear = nn.Embedding(wide_dim + 1, predict_dim, padding_idx=0)
        self.bias = nn.Parameter(torch.zeros(predict_dim))
        self._reset_parameters()

    def _reset_parameters(self):
        nn.init.kaiming_normal_(self.linear.weight, a=math.sqrt(5))
        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.linear.weight)
        bound = 1 / math.sqrt(fan_in)
        nn.init.uniform_(self.bias, -bound, bound)
    
    def forward(self, X: torch.Tensor):

        # X [b_size, num_of_wide_features]
        return self.linear(X.long()).sum(dim=1) + self.bias # [b_size, predict_dim]


class Deep(nn.Module):
    EMBEDDING_LAYER_PREFIX = "emb_layer"
    DENSE_LAYER_PREFIX = "dense_layer"

    def __init__(self,
                 columns_index: Dict[str, int],
                 embed_cols_info: List[Tuple[str, int, int]],  # (col_name, label_size, embeding_dim)
                 continuous_cols: List[str],
                 hidden_layer_neural: List[int],
                 hidden_layer_dropout: List[float],
                 embed_col_dropout: float=0.0
                 ):
        super().__init__()
        self.columns_index = columns_index
        self.embed_cols_info = embed_cols_info
        self.continuous_cols = continuous_cols

        self.embed_layers = self._create_embed_layers(embed_cols_info)
        self.embed_dropout_layer = nn.Dropout(embed_col_dropout)

        self.hidden_layer_neural = self._update_hidden_layer_neural(
            hidden_layer_neural)

        self.dense_layer = self._create_dense_layer(hidden_layer_dropout)
        self.output_dim = hidden_layer_neural[-1]

    def _create_embed_layers(self, embed_cols_info: List[Tuple[str, int, int]]):
        return nn.ModuleDict({self.EMBEDDING_LAYER_PREFIX + '_' + col_name.replace(".", '_'): nn.Embedding(num_label, dim) for col_name, num_label, dim in embed_cols_info})

    def _create_dense_layer(self, hidden_layer_dropout):
        dense_dequential = nn.Sequential()
        for i in range(1, len(self.hidden_layer_neural)):
            dense_dequential.add_module(
                "{}_{}".format(self.DENSE_LAYER_PREFIX, i - 1),
                self._create_dense_component(
                    self.hidden_layer_neural[i-1], self.hidden_layer_neural[i], hidden_layer_dropout[i-1], True)
            )
        return dense_dequential

    def _update_hidden_layer_neural(self, hidden_layer_neurals: List[int]):
        embed_dim = sum([embed[2] for embed in self.embed_cols_info])
        continuous_dim = len(self.continuous_cols)
        return [embed_dim + continuous_dim] + hidden_layer_neurals

    def _create_dense_component(self, input_dim: int, output_dim: int, dropout_ratio: float=0.0, batch_norm=False):
        layers = [
            nn.Linear(input_dim, output_dim),
            nn.LeakyReLU(inplace=True)
        ]
        if batch_norm:
            layers.append(nn.BatchNorm1d(output_dim))
        layers.append(nn.Dropout(dropout_ratio))
        return nn.Sequential(*layers)

    def __get_embeding_layer(self, embed_col):
        embed_col = self.EMBEDDING_LAYER_PREFIX + '_' + embed_col.replace('.', '_')
        return self.embed_layers[embed_col]
        
    def forward(self, deep_input_x: torch.Tensor):
        embed_x = [
           self.__get_embeding_layer(col)(deep_input_x[:, self.columns_index[col]].long())
            for col, _, _ in self.embed_cols_info
        ]

        embed_x = torch.cat(embed_x, 1)

        continuous_cols_idx = [self.columns_index[col]
                              for col in self.continuous_cols]
        continuous_x = deep_input_x[:, continuous_cols_idx].float()

        x = torch.cat([embed_x, continuous_x], dim=1)
        return self.dense_layer(x)  # [b_size, hidden_layer_last_dim]


In [65]:
class WideDeep(nn.Module):
    def __init__(self, wide: nn.Module, deep: nn.Module):
        super().__init__()
        
        deep = nn.Sequential(
            deep,
            nn.Linear(deep.output_dim, 1)
        )
        self.wide_deep = nn.ModuleDict({
            "wide": wide,
            "deep": deep
        })

    def forward(self, x_wide: torch.Tensor, x_deep: torch.Tensor):
        wide_out = self.wide_deep['wide'](x_wide)  # [b_size, num_of_wide_feature, wide_predict_dim]
        deep_out = self.wide_deep['deep'](x_deep)  # [b_size, num]
        out = wide_out + deep_out
        return out.view(-1)
    
    @torch.no_grad()
    def predict(self, x_wide: torch.Tensor, x_deep: torch.Tensor, threshold: int=0.5):
        logistic = self.predict_probs(x_wide, x_deep)
        return (logistic > threshold).int()
    
    @torch.no_grad()
    def predict_probs(self, x_wide: torch.Tensor, x_deep: torch.Tensor):
        out = self.forward(x_wide, x_deep)
        return torch.sigmoid(out.view(-1).float())

        
        

### FTRL optimizer

In [66]:
import torch
from torch import nn
from torch.optim.optimizer import Optimizer

In [90]:
class FTRL(Optimizer):
    def __init__(self, params, alpha=1.0, beta=1.0, l1=1.0, l2=1.0):
        if not 0.0 < alpha:
            raise ValueError("Invalid alpha parameter: {}".format(alpha))
        if not 0.0 < beta:
            raise ValueError("Invalid beta parameter: {}".format(beta))
        if not 0.0 <= l1:
            raise ValueError("Invalid l1 parameter: {}".format(l1))
        if not 0.0 <= l2:
            raise ValueError("Invalid l2 parameter: {}".format(l2))


        defaults = dict(alpha=alpha, beta=beta, l1=l1, l2=l2)
        super(FTRL, self).__init__(params, defaults)
    
    @torch.no_grad()
    def step(self, closure=None):
        loss = None
        if closure is not None:
            loss = closure()
        
        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]
                if len(state) == 0:
                    state['z'] = torch.zeros_like(p.data)
                    state['n'] = torch.zeros_like(p.data)
               
                # previous z and n
                z, n = state['z'], state['n']
                
                
                w = (group['l1'] * z.sign()) / (group['l2'] + (group['beta'] + n.sqrt()) / group['alpha'])
                # update p weight by condition
                p.data = torch.where(z.abs() < torch.tensor(group['l1'], dtype=torch.float), torch.zeros_like(w), w)

                sigma = ((torch.pow(grad, 2) + n) - n.sqrt()) / group['alpha']
                z.add_(grad - sigma * p.data)
                n.add_(torch.pow(grad, 2))
               



# Training stage

In [68]:
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics

### preprocessing

In [69]:
data_path = "/Volumes/ExFAT/dataset/adult.csv"
df = pd.read_csv(data_path)
df.columns = df.columns.map(lambda c: c.replace('-', '_'))
len(df)

48842

In [70]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'educational_num',
       'marital_status', 'occupation', 'relationship', 'race', 'gender',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')

In [71]:
wide_cols = ['education', 'relationship','workclass','occupation','native_country','gender']
crossed_cols = [('education', 'occupation'), ('native_country', 'occupation')]

category_embed_dim_mapping = {
    'education': 10,
    'relationship': 8,
    'workclass': 10,
    'occupation': 10,
    'native_country': 10
}
category_cols = list(category_embed_dim_mapping.keys())
continuous_cols = ['age', 'hours_per_week']

In [72]:
wideGenerator = WideFeaturesGenerator(wide_cols, crossed_cols)
x_wide = wideGenerator.fit_transform(df)
x_wide.shape

(48842, 8)

In [73]:
deep_generator = DeepFeaturesGenerator(category_cols, continuous_cols)
df_deep = df[category_cols + continuous_cols].copy()
x_deep = deep_generator.fit_transform(df)
x_deep.shape

(48842, 7)

In [74]:
target_y = df['income'].values.copy()
target_y = (target_y == '>50K').astype(int)

In [75]:
x_wide.shape, x_deep.shape, target_y.shape

((48842, 8), (48842, 7), (48842,))

In [76]:
train_x_wide, test_x_wide, train_x_deep, test_x_deep, train_y, test_y = train_test_split(x_wide, x_deep, target_y, test_size=0.2)
train_x_wide.shape, train_x_deep.shape, train_y.shape

((39073, 8), (39073, 7), (39073,))

In [77]:
test_x_wide = torch.from_numpy(test_x_wide)
test_x_deep =  torch.from_numpy(test_x_deep)
test_y = torch.from_numpy(test_y)

### Training

In [60]:
@torch.no_grad()
def cal_metrics(model, X, Y):
    model.eval()
    x_wide, x_deep = X

    y_target = Y.detach().numpy()
    y_prob = model.predict_probs(x_wide, x_deep).detach().numpy()
    y_pre_label = (y_prob > 0.5).astype(int)
    tn, fp, fn, tp  = confusion_matrix(y_target, y_pre_label).ravel()
        
    precision = (tp) / (tp + fp)
    recall = (tp) / (tp + fn)
    acc = (tp + tn) / (tn + fp + fn + tp)
    
    auc = metrics.roc_auc_score(y_target, y_prob)

    model.train()
    return {'prec': precision, 'recall': recall, 'acc': acc, 'auc': auc}
    
def validation_step(model: nn.Module, X, Y, loss_fn):
    model.eval()
    with torch.no_grad():
        x_wide, x_deep = X
        y_pre = model(x_wide, x_deep)
        loss = loss_fn(y_pre.view(-1).float(), Y.float())  # mean loss
    
    model.train()
    return loss.item(), cal_metrics(model, X, Y)

In [61]:
train_data_set = WideDeepDataset(train_x_wide, train_x_deep, train_y)
train_data_loader = torch.utils.data.DataLoader(train_data_set, batch_size=64)



In [62]:
embed_cols_info = [(col, deep_generator.embed_cols_unique_labels_[col], embed_dim)for col, embed_dim in category_embed_dim_mapping.items()]
deep_column_idx = {col: i for i, col in enumerate(deep_generator.deep_cols)}
hidden_layers = [64, 32]
drop_out = [0.2, 0.2]
# out_dim = hidden_layers[-1]


#### Train the whole network with AdamW

In [129]:
wide = Wide(wide_dim=np.unique(x_wide).shape[0], predict_dim=1)
deep = Deep(deep_column_idx, embed_cols_info, continuous_cols, hidden_layers, drop_out, 0.2)
wide_deep = WideDeep(wide, deep)

criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
general_optimizer = torch.optim.AdamW(wide_deep.parameters())

In [127]:
log_interval = 50
epoch = 10
for epoch_i in range(epoch):
    total_loss = 0
   
    tk0 = tqdm.tqdm(train_data_loader, smoothing=0, mininterval=1.0)
    for i, (x_w, x_d, target_y) in enumerate(tk0):
        wide_deep.train()
        y = wide_deep(x_w, x_d)
        loss = criterion(y.view(-1), target_y.float())
        
        wide_deep.zero_grad()
        loss.backward()
        general_optimizer.step()
#         wide_optimizer.step()
#         deep_optimizer.step()
        
        total_loss += loss.item()
        
        if (i + 1) % log_interval == 0:
            valid_loss, score = validation_step(wide_deep, (test_x_wide, test_x_deep), test_y, criterion)
            tk0.set_postfix(train_loss=total_loss/log_interval, valid_loss=valid_loss, metrics={k: np.round(v, 4) for k, v in score.items()})
            total_loss = 0

100%|██████████| 611/611 [00:06<00:00, 90.09it/s, metrics={'prec': 0.6518, 'recall': 0.55, 'acc': 0.8225, 'auc': 0.8645}, train_loss=0.407, valid_loss=tensor(0.3802)]
100%|██████████| 611/611 [00:06<00:00, 89.99it/s, metrics={'prec': 0.6757, 'recall': 0.5363, 'acc': 0.8279, 'auc': 0.8752}, train_loss=0.392, valid_loss=tensor(0.3654)]
100%|██████████| 611/611 [00:07<00:00, 81.90it/s, metrics={'prec': 0.6861, 'recall': 0.5354, 'acc': 0.8307, 'auc': 0.8789}, train_loss=0.378, valid_loss=tensor(0.3601)]
100%|██████████| 611/611 [00:07<00:00, 85.39it/s, metrics={'prec': 0.6931, 'recall': 0.5358, 'acc': 0.8326, 'auc': 0.882}, train_loss=0.376, valid_loss=tensor(0.3560)]
100%|██████████| 611/611 [00:07<00:00, 79.44it/s, metrics={'prec': 0.6941, 'recall': 0.5431, 'acc': 0.8339, 'auc': 0.8836}, train_loss=0.374, valid_loss=tensor(0.3536)]
100%|██████████| 611/611 [00:09<00:00, 62.39it/s, metrics={'prec': 0.6933, 'recall': 0.5363, 'acc': 0.8327, 'auc': 0.8854}, train_loss=0.369, valid_loss=tenso

In [128]:
cal_metrics(wide_deep, (test_x_wide, test_x_deep), test_y)

{'acc': 0.8340669464633023,
 'auc': 0.887210434803874,
 'prec': 0.6927252985884907,
 'recall': 0.5474045474045474}

#### Train wide deep with different optmizer

In [127]:
wide = Wide(wide_dim=np.unique(x_wide).shape[0], predict_dim=1)
deep = Deep(deep_column_idx, embed_cols_info, continuous_cols, hidden_layers, drop_out, 0.2)
wide_deep = WideDeep(wide, deep)

criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
wide_optimizer = FTRL(wide_deep.wide_deep['wide'].parameters(), alpha=1, beta=1, l1=0.5, l2=0)
deep_optimizer = torch.optim.Adagrad(wide_deep.wide_deep['deep'].parameters(), lr=0.01, lr_decay=0.9, weight_decay=0.01)

In [None]:

log_interval = 100
for epoch_i in range(20):
    total_loss = 0
   
    tk0 = tqdm.tqdm(train_data_loader, smoothing=0, mininterval=1.0)
    for i, (x_w, x_d, target_y) in enumerate(tk0):
        wide_deep.train()
        y = wide_deep(x_w, x_d)
        loss = criterion(y.view(-1), target_y.float())
        
        wide_deep.zero_grad()
        loss.backward()

        wide_optimizer.step()
        deep_optimizer.step()
        
        total_loss += loss.item()
        
        if (i + 1) % log_interval == 0:
            valid_loss, score = validation_step(wide_deep, (test_x_wide, test_x_deep), test_y, criterion)
            tk0.set_postfix(train_loss=total_loss/log_interval, valid_loss=valid_loss, metrics={k: np.round(v, 4) for k, v in score.items()})
            total_loss = 0

100%|██████████| 611/611 [00:08<00:00, 75.23it/s, metrics={'prec': 0.2759, 'recall': 0.9863, 'acc': 0.3791, 'auc': 0.7567}, train_loss=1.03, valid_loss=0.988]
100%|██████████| 611/611 [00:07<00:00, 76.66it/s, metrics={'prec': 0.3035, 'recall': 0.9266, 'acc': 0.4751, 'auc': 0.7406}, train_loss=0.911, valid_loss=0.863]
100%|██████████| 611/611 [00:07<00:00, 84.81it/s, metrics={'prec': 0.3061, 'recall': 0.9116, 'acc': 0.4857, 'auc': 0.7319}, train_loss=0.862, valid_loss=0.848]
100%|██████████| 611/611 [00:07<00:00, 82.06it/s, metrics={'prec': 0.3263, 'recall': 0.9039, 'acc': 0.5317, 'auc': 0.7469}, train_loss=0.841, valid_loss=0.794]
100%|██████████| 611/611 [00:05<00:00, 102.82it/s, metrics={'prec': 0.3303, 'recall': 0.8567, 'acc': 0.5513, 'auc': 0.7399}, train_loss=0.817, valid_loss=0.757]
100%|██████████| 611/611 [00:06<00:00, 95.50it/s, metrics={'prec': 0.3209, 'recall': 0.9048, 'acc': 0.5203, 'auc': 0.745}, train_loss=0.798, valid_loss=0.799]
100%|██████████| 611/611 [00:06<00:00, 10

In [None]:
cal_metrics(wide_deep, (test_x_wide, test_x_deep), test_y)

In [None]:
wide_deep.wide_deep['wide'].linear.weight.shape

In [None]:
weight = wide_deep.wide_deep['wide'].linear.weight.view(-1)
weight.nonzero().shape

In [None]:
weight