# Contrastive-Predictive-Coding

<img src="./img/overview.png"></img>


#### Encoder: Strided convolutional layers with resnet blocks
#### Autoregressive model: GRUs

<img src="./img/log-bilinear-model.png"></img>

In [None]:
def forward(self, x, hidden):
    """Compute the CPC NCE loss and prediction accuracy for one batch.

    The body of this method is spread over several notebook cells. This
    cell encodes the input and gathers the future latents that serve as
    prediction targets.

    Args:
        x: Input batch; the first dimension is the batch size. Per the
            shape comments below it is laid out N*C*L, e.g. 64*1*20480.
        hidden: Returned as-is by the final cell. Presumably the initial
            GRU hidden state -- the GRU cell refers to `hidden1` instead,
            TODO confirm.

    Returns:
        The tuple (accuracy, nce, hidden) assembled in the final cell.
    """
    batch = x.size()[0]  # batch = 64
    nce = 0  # running NCE sum; negated and divided by batch * timestep at the end

    # Strided convolutional layers with resnet blocks
    # input sequence(x) is N*C*L, e.g. 64*1*20480
    z = self.encoder(x)
    # encoded sequence(z) is N*C*L, e.g. 64*512*128

    # reshape to N*L*C for GRU, e.g. 64*128*512
    z = z.transpose(1, 2)

    # Sampling
    # self.seq_len = 20480 -> "window length to sample from each utterance"
    # 20480 / 160 = 128 (160 is presumably the encoder's downsampling factor -- TODO confirm)
    # self.timestep = 12
    # Pick one random time index t, shared by the whole batch, leaving room for
    # `timestep` future frames after it.
    t_samples = torch.randint(int(self.seq_len / 160) - self.timestep, size=(1,)).long()  # high=116 (exclusive), low=0(default)
    encode_samples = torch.empty((self.timestep, batch, 512)).float()  # e.g. size 12*64*512
    for i in np.arange(1, self.timestep + 1):  # repeats 12 times
        # Store the latent k=i steps after t as the target for prediction step i.
        encode_samples[i - 1] = z[:, t_samples + i, :].view(batch, 512)  # z_tk e.g. size 64*512,  12*64*512



<img src="./img/context-latent-representation.png"></img>

In [None]:
    # GRU: summarise the latents up to and including time t into a context vector.
    forward_seq = z[:, : t_samples + 1, :]  # e.g. size 64*100*512
    # NOTE(review): `hidden1` is not defined in the visible cells; the method
    # parameter is named `hidden`. Confirm which name is intended.
    output1, hidden1 = self.gru1(forward_seq, hidden1)  # output size e.g. 64*100*256
    # NOTE(review): the view uses 128 features while the comment above says 256
    # and the prediction cell's commented Linear(128, 512) implies 128 -- confirm
    # the GRU hidden size.
    c_t = output1[:, t_samples, :].view(batch, 128)  # c_t e.g. size 64*128

<img src="./img/overview.png"></img>

<img src="./img/log-bilinear-model.png"></img>   (3)
<img src="./img/prediction.png"></img>

In [None]:
    # Predictions: one linear map per future step, all applied to the same context c_t.
    pred = torch.empty((self.timestep, batch, 512)).float()  # e.g. size 12*64*512
    
    # self.Wk1 = nn.ModuleList([nn.Linear(128, 512) for i in range(timestep)])   # 12 linear layers
    for i in np.arange(0, self.timestep):  # repeats 12 times
        linear = self.Wk1[i]  # the single c_t vector is reused as the input to each distinct linear layer
        pred[i] = linear(c_t)  # Wk*c_t e.g. size 64*512; pred ends up 12*64*512, i.e. predictions for 12 steps

# Negative sampling

### torch.eq(): Computes element-wise equality
<img src="./img/torch-eq.png"></img>

### torch.argmax(input): Returns the indices of the maximum value of all elements in the input tensor.
### torch.argmax(input, dim, keepdim=False): Returns the indices of the maximum values of a tensor across a dimension.
<img src="./img/torch-argmax-dim.png"></img>

In [None]:
    for i in np.arange(0, self.timestep):  # repeats 12 times
        # Pairwise scores: total[a, b] = <true latent of sample a, prediction for sample b>.
        total = torch.mm(encode_samples[i], torch.transpose(pred[i], 0, 1))  # e.g. size 64*64

        # Count the columns whose highest-scoring row is the diagonal entry,
        # i.e. predictions that match their own sample's latent best.
        # NOTE(review): `correct` is overwritten on every iteration, so after the
        # loop it only reflects the last timestep.
        correct = torch.sum(
            torch.eq(
                torch.argmax(self.softmax(total), dim=0),
                torch.arange(0, batch)
            )
        )  # correct is a tensor

        # Add the log-softmax scores on the diagonal (the positive pairs).
        nce += torch.sum(
            torch.diag(
                self.log_softmax(total)
            )
        )  # nce is a tensor

## e.g. correct1 (1)
<img src="./img/correct1.png"></img>
## e.g. correct1 (2)
<img src="./img/correct2.png"></img>

<img src="./img/nce-loss-1.png"></img>
<img src="./img/nce-loss-2.png"></img>

<img src="./img/diag.png"></img>

## take the diagonal, which corresponds to positive samples

## e.g. log softmax, diag, nce loss (1)
<img src="./img/diag1.png"></img>

---

## e.g. log softmax, diag, nce loss (2)
<img src="./img/diag2.png"></img>

In [None]:
        # NOTE(review): this statement repeats the accumulation already shown in
        # the previous code cell; presumably it is shown again for explanation.
        # If the cells are concatenated verbatim, each timestep is counted twice.
        nce += torch.sum(
            torch.diag(
                self.log_softmax(total)
            )
        )  # nce is a tensor

    # Negate and average the summed log-probabilities over batch and timesteps.
    nce /= -1.0 * batch * self.timestep
    # Fraction of the batch counted in `correct` (set by the last loop iteration).
    accuracy = 1.0 * correct.item() / batch

    # NOTE(review): `hidden` is returned without being reassigned in the visible cells.
    return accuracy, nce, hidden