In [4]:
import torch
import fairseq
from fairseq import checkpoint_utils


# Config overrides that Encoder hands to checkpoint_utils.load_checkpoint_to_cpu
# when it loads its checkpoint.
arg_overrides = dict(
    apply_mask=True,
    # mask_* group
    mask_selection="static",
    mask_length=10,
    mask_other=0,
    mask_prob=0.75,
    # mask_channel_* group
    mask_channel_selection="static",
    mask_channel_length=64,
    mask_channel_other=0,
    mask_channel_prob=0.5,
    # layerdrop / dropout rates
    encoder_layerdrop=0.0,
    dropout=0.0,
    activation_dropout=0.1,
    attention_dropout=0.0,
    # gradient multiplier for the feature extractor
    # NOTE(review): 0.0 presumably freezes it - confirm against fairseq
    feature_grad_mult=0.0,
)


class Encoder(torch.nn.Module):
    """Encoder wrapping a HuBERT model loaded from a fairseq checkpoint.

    The checkpoint is loaded on CPU with the module-level ``arg_overrides``
    applied, the model's pre-training modules are removed, and the resulting
    model is stored as ``self.model``.

    Attributes:
        model: the first model of the ensemble loaded from ``ckpt_path``.
        cfg: the configuration object fairseq returns alongside the model.
    """

    def __init__(self, ckpt_path="../weights/hubert_base_ls960.pt"):
        """Load the checkpoint and strip its pre-training modules.

        Args:
            ckpt_path: path to the fairseq checkpoint file.
        """
        super().__init__()

        # Load the raw state first so that arg_overrides is applied to it;
        # the model is then built from this already-loaded state instead of
        # reading the checkpoint a second time.
        state = checkpoint_utils.load_checkpoint_to_cpu(ckpt_path, arg_overrides)
        models, cfg, _task = checkpoint_utils.load_model_ensemble_and_task(
            [ckpt_path], state=state
        )

        model = models[0]
        model.remove_pretraining_modules()

        self.model = model
        self.cfg = cfg

    def forward(self, source, padding_mask):
        """Run the wrapped model and return CNN and encoder features.

        Masking is applied only while the module is in training mode.

        Args:
            source: input batch, (B, T).
            padding_mask: padding mask for ``source``, (B, T).

        Returns:
            dict with keys:
                "cnn_out": the model's "features" output (B x T x C).
                "encoder_out": the model's "x" output (B x T x C).
                "padding_mask": the padding mask returned by the model (B x T).
        """
        # Upstream fairseq's HubertModel.extract_features returns only a
        # 2-tuple (features, padding_mask), so unpacking three values from it
        # raises ValueError. The features-only forward pass returns all three
        # tensors in one dict.
        # NOTE(review): relies on the upstream HubertModel.forward contract -
        # confirm if a patched fairseq is installed.
        result = self.model(
            source=source,
            padding_mask=padding_mask,
            mask=self.training,
            features_only=True,
        )

        return {
            "cnn_out": result["features"],
            "encoder_out": result["x"],
            "padding_mask": result["padding_mask"],
        }
        


In [None]:
# Build the encoder from its default checkpoint path; as the cell's last
# expression, the resulting module's repr is what gets displayed.
Encoder()

DEBUG:hydra.core.utils:Setting JobRuntime:name=UNKNOWN_NAME
DEBUG:hydra.core.utils:Setting JobRuntime:name=utils
DEBUG:hydra.core.utils:Setting JobRuntime:name=utils
INFO:fairseq.tasks.hubert_pretraining:current directory is /raid/home/rajivratn/hemant_rajivratn/last/src
INFO:fairseq.tasks.hubert_pretraining:HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/checkpoint/wnhsu/data/librispeech/960h/iter/250K_50hz_km100_mp0_65_v2', 'fine_tuning': False, 'labels': ['layer6.km500'], 'label_dir': None, 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
INFO:fairseq.models.hubert.hubert:HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': ge

Encoder(
  (model): HubertModel(
    (feature_extractor): ConvFeatureExtractionModel(
      (conv_layers): ModuleList(
        (0): Sequential(
          (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (1): Dropout(p=0.0, inplace=False)
          (2): Fp32GroupNorm(512, 512, eps=1e-05, affine=True)
          (3): GELU(approximate='none')
        )
        (1-4): 4 x Sequential(
          (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (1): Dropout(p=0.0, inplace=False)
          (2): GELU(approximate='none')
        )
        (5-6): 2 x Sequential(
          (0): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (1): Dropout(p=0.0, inplace=False)
          (2): GELU(approximate='none')
        )
      )
    )
    (post_extract_proj): Linear(in_features=512, out_features=768, bias=True)
    (dropout_input): Dropout(p=0.1, inplace=False)
    (dropout_features): Dropout(p=0.1, inplace=False)
    (encoder): Transforme

: 

In [5]:
# Create a reproducible integer tensor of shape (b, t) with values in [0, 10).
BATCH_SIZE = 2  # b
SEQ_LEN = 10  # t
SEED = 0

# A dedicated generator makes the sample identical on every run without
# touching torch's global RNG state.
x = torch.randint(
    0, 10, (BATCH_SIZE, SEQ_LEN), generator=torch.Generator().manual_seed(SEED)
)
x

tensor([[2, 0, 4, 9, 5, 3, 7, 8, 6, 2],
        [7, 2, 7, 6, 0, 7, 8, 5, 0, 7]])

In [6]:
# Add the scalar 1 to every element of x; the result is only displayed,
# x itself is not reassigned.
x + 1

tensor([[ 3,  1,  5, 10,  6,  4,  8,  9,  7,  3],
        [ 8,  3,  8,  7,  1,  8,  9,  6,  1,  8]])