In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))
        
import torch
import torch.nn as nn

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Gaussian Dropout for Pytorch

Gaussian dropout multiplies **all** hidden embedding dimensions by normally distributed noise instead of setting some dimensions to 0, i.e. `hidden_embedding * normal_noise`.
Because the noise has mean 1, the expected value of your outputs stays the same, although their standard deviation and variance change. This might make train/inference performance more stable for regression tasks than standard dropout.
The noise used here is similar to the one in the [paper proposing it](https://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf): it is drawn from a normal distribution with mean 1 and spread $p/(1-p)$ (used as the standard deviation in the code below). We use $p/(1-p)$ instead of the reverse, $(1-p)/p$, so that the noise
grows with $p$ instead of the other way around.
See also [discussion here](https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/277155).
Thanks to [Allohvk](https://www.kaggle.com/allohvk) for suggesting it in the first place.

In [None]:
class GaussianDropout(nn.Module):
    """Multiplicative Gaussian-noise dropout layer.

    In training mode every element of the input is multiplied by noise drawn
    from a normal distribution with mean 1 and standard deviation
    ``p / (1 - p)``. In eval mode, or when ``p == 0``, the input is returned
    unchanged.
    """

    def __init__(self, p: float):
        """
        Multiplicative Gaussian Noise dropout with mean 1 and standard
        deviation p/(1-p).
        It is NOT (1-p)/p like in the paper, because here the
        noise actually increases with p. (It can create the same
        noise as the paper, but with reversed p values)

        Source:
        Dropout: A Simple Way to Prevent Neural Networks from Overfitting
        https://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf

        :param p: float - determines the standard deviation of the
        gaussian noise, where sigma = p/(1-p). Must satisfy 0 <= p < 1.
        :raises AssertionError: if p is outside [0, 1).
        """
        super().__init__()
        assert 0 <= p < 1, f"p must be in [0, 1), got {p}"
        self.p = p
        # Standard deviation of the noise, derived once from p.
        self.t_std = self.compute_std()

    def compute_std(self) -> float:
        """Return the noise standard deviation p / (1 - p) for the current p."""
        return self.p / (1 - self.p)

    def forward(self, t_hidden):
        """
        Apply the multiplicative noise to ``t_hidden``.

        :param t_hidden: floating point tensor of any shape.
        :return: ``t_hidden * noise`` while training with p > 0, otherwise
        ``t_hidden`` itself (same object, no copy).
        """
        if self.training and self.p > 0.:
            # Sample the noise directly with the shape, dtype and device of the
            # input. No tensor is cached on the module, so nothing can go stale
            # when the batch shape, dtype or device changes between calls.
            t_gaussian_noise = torch.randn_like(t_hidden).mul_(self.t_std).add_(1.0)
            t_hidden = t_hidden.mul(t_gaussian_noise)
        return t_hidden

    def extra_repr(self) -> str:
        """Show p when the module is printed."""
        return f"p={self.p}"

# Test-Network

In [None]:
class TestNetwork(nn.Module):
    """Small demo network: Linear(8 -> 16), GaussianDropout(p=0.2), Linear(16 -> 8)."""

    def __init__(self):
        """Create the two linear layers and the Gaussian dropout between them."""
        super().__init__()
        self.linear = nn.Linear(in_features=8, out_features=16)
        self.dropout = GaussianDropout(p=0.2)
        self.linear2 = nn.Linear(in_features=16, out_features=8)

    def forward(self, t_input):
        """
        Run the input through linear -> dropout -> linear2.

        :param t_input: tensor whose last dimension has size 8.
        :return: tensor with the same leading dimensions and last dimension 8.
        """
        return self.linear2(self.dropout(self.linear(t_input)))

In [None]:
# Instantiate the demo network (Linear -> GaussianDropout -> Linear) defined above.
network = TestNetwork()

In [None]:
# Random input of shape (64, 80, 8); the last dimension matches the 8 input
# features expected by the network's first linear layer.
t_input = torch.randn(64, 80, 8)
t_output = network(t_input)

In [None]:
# Last expression of the cell: shows the output's shape together with the full output tensor.
t_output.shape, t_output

**Maybe it helps you to improve your score :)**