diff --git a/deepctr_torch/layers/interaction.py b/deepctr_torch/layers/interaction.py index d02c2b41..82885526 100644 --- a/deepctr_torch/layers/interaction.py +++ b/deepctr_torch/layers/interaction.py @@ -530,7 +530,7 @@ def forward(self, inputs): moe_out = torch.matmul(output_of_experts, gating_score_of_experts.softmax(1)) x_l = moe_out + x_l # (bs, in_features, 1) - x_l = x_l.squeeze() # (bs, in_features) + x_l = x_l.squeeze(-1) # (bs, in_features) return x_l diff --git a/deepctr_torch/models/basemodel.py b/deepctr_torch/models/basemodel.py index abc5b846..f4755e9c 100644 --- a/deepctr_torch/models/basemodel.py +++ b/deepctr_torch/models/basemodel.py @@ -242,7 +242,7 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, initial_epoc x = x_train.to(self.device).float() y = y_train.to(self.device).float() - y_pred = model(x).squeeze() + y_pred = model(x) optim.zero_grad() if isinstance(loss_func, list): @@ -251,7 +251,7 @@ def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, initial_epoc loss = sum( [loss_func[i](y_pred[:, i], y[:, i], reduction='sum') for i in range(self.num_tasks)]) else: - loss = loss_func(y_pred, y.squeeze(), reduction='sum') + loss = loss_func(y_pred, y, reduction='sum') reg_loss = self.get_regularization_loss() total_loss = loss + reg_loss + self.aux_loss diff --git a/deepctr_torch/models/multitask/mmoe.py b/deepctr_torch/models/multitask/mmoe.py index c0401eb7..df9f7ca6 100644 --- a/deepctr_torch/models/multitask/mmoe.py +++ b/deepctr_torch/models/multitask/mmoe.py @@ -127,7 +127,7 @@ def forward(self, X): else: gate_dnn_out = self.gate_dnn_final_layer[i](dnn_input) gate_mul_expert = torch.matmul(gate_dnn_out.softmax(1).unsqueeze(1), expert_outs) # (bs, 1, dim) - mmoe_outs.append(gate_mul_expert.squeeze()) + mmoe_outs.append(gate_mul_expert.squeeze(1)) # tower dnn (task-specific) task_outs = [] diff --git a/deepctr_torch/models/multitask/ple.py b/deepctr_torch/models/multitask/ple.py index bc8a06fb..c056aefa 100644 --- a/deepctr_torch/models/multitask/ple.py +++ b/deepctr_torch/models/multitask/ple.py @@ -177,7 +177,7 @@ def cgc_net(self, inputs, level_num): else: gate_dnn_out = self.specific_gate_dnn_final_layer[level_num][i](inputs[i]) gate_mul_expert = torch.matmul(gate_dnn_out.softmax(1).unsqueeze(1), cur_experts_outputs) # (bs, 1, dim) - cgc_outs.append(gate_mul_expert.squeeze()) + cgc_outs.append(gate_mul_expert.squeeze(1)) # gates for shared experts cur_experts_outputs = specific_expert_outputs + shared_expert_outputs @@ -189,7 +189,7 @@ def cgc_net(self, inputs, level_num): else: gate_dnn_out = self.shared_gate_dnn_final_layer[level_num](inputs[-1]) gate_mul_expert = torch.matmul(gate_dnn_out.softmax(1).unsqueeze(1), cur_experts_outputs) # (bs, 1, dim) - cgc_outs.append(gate_mul_expert.squeeze()) + cgc_outs.append(gate_mul_expert.squeeze(1)) return cgc_outs