In [None]:
import torch

x = torch.tensor([
    [1, 1, 2, 2, 3, 3, 2,3,2,3],
])

# Step 1: Create mask to keep first element and non-duplicates
mask = torch.ones_like(x, dtype=torch.bool)
mask[:, 1:] = x[:, 1:] != x[:, :-1]

# Step 2: Remove a specific index (e.g., 3)
remove_val = 3
mask &= x != remove_val  # Elementwise AND: keep only those not equal to 3

# Apply mask (as an example)
filtered = [x[i][mask[i]] for i in range(x.size(0))]
print(filtered)

[tensor([1, 2, 2, 2])]


[tensor([1, 2, 3, 2, 3, 2])]


In [11]:
import torch
import re

# Define vocab (index 0 is blank)
vocab = ['_', 'a', 'b', 'c']
blank_id = 0

# Example model output (index sequence)
output = torch.tensor([1, 1, 0, 2, 2, 0, 2, 2, 3, 3, 0])

# Step 1: Collapse consecutive repeats
# Create a mask to keep only non-duplicate consecutive elements
mask = torch.ones_like(output, dtype=torch.bool)
mask[1:] = output[1:] != output[:-1]
collapsed = output[mask]
print(collapsed)
# Step 2: Remove blanks (i.e., tokens with index 0)
final = collapsed[collapsed != blank_id]
print(final)
# Step 3: Convert indices to characters using vocab
decoded_string = ''.join(vocab[i] for i in final.tolist())

print("Decoded string:", decoded_string)  # → 'bbc'


tensor([1, 0, 2, 0, 2, 3, 0])
tensor([1, 2, 2, 3])
Decoded string: abbc


In [15]:
mask

tensor([[ True, False, False,  True, False, False,  True, False,  True, False,
         False]])

In [36]:
import numpy as np
c = np.array(['a','b','c','d'])

4

In [4]:
c[filtered[0]]

array(['b', 'c', 'd'], dtype='<U1')

In [21]:
from transformers import ClapProcessor, ClapModel
import torch
import torchaudio
import soundfile as sf
import numpy as np

# Load model and processor
processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")
model = ClapModel.from_pretrained("laion/clap-htsat-unfused")

# Load and preprocess audio
def load_audio(path):
    waveform, sr = torchaudio.load(path)
    if sr != 48000:
        resampler = torchaudio.transforms.Resample(sr, 48000)
        waveform = resampler(waveform)
    return waveform

audio_path = "/raid/home/rajivratn/hemant_rajivratn/librispeech/data/train/audio/train-clean-100_126791_298_53.wav"
audio = load_audio(audio_path)

# CLAP expects mono
if audio.shape[0] > 1:
    audio = torch.mean(audio, dim=0, keepdim=True)

# Pad or truncate to 10s (CLAP expects 480000 samples)
audio = torch.nn.functional.pad(audio, (0, max(0, 480000 - audio.shape[1])))[:, :480000]



In [24]:
"SPEAK I BEG WITHOUT DREAD OF MY DISPLEASURE SAID FRANCES RETURNING THE GOOD HUMORED SMILE OF THE TROOPER WITH THE ARCHNESS NATURAL TO HER OWN SWEET FACE THE ODORS OF YOUR KITCHEN THEN CRIED LAWTON BLUNTLY FORBID MY QUITTING THE DOMAINS UNTIL I QUALIFY MYSELF TO SPEAK WITH MORE CERTAINTY CONCERNING THE FATNESS OF THE LAND".lower()

'speak i beg without dread of my displeasure said frances returning the good humored smile of the trooper with the archness natural to her own sweet face the odors of your kitchen then cried lawton bluntly forbid my quitting the domains until i qualify myself to speak with more certainty concerning the fatness of the land'

In [30]:

# Process audio and text
texts = ["hello how are you", "this is a dog barking", "there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with", "speak i beg without dread of my displeasure said frances returning the good humored smile of the trooper with the archness natural to her own sweet face the odors of your kitchen then cried lawton bluntly forbid my quitting the domains until i qualify myself to speak with more certainty concerning the fatness of the land"]
inputs = processor(audios=audio.squeeze().numpy(), text=texts, return_tensors="pt", padding=True)

# Forward pass
with torch.no_grad():
    outputs = model(**inputs)
    audio_embeds = outputs.audio_embeds
    text_embeds = outputs.text_embeds

It is strongly recommended to pass the `sampling_rate` argument to `ClapFeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.


In [31]:

# Normalize embeddings
audio_embeds = torch.nn.functional.normalize(audio_embeds, dim=-1)
text_embeds = torch.nn.functional.normalize(text_embeds, dim=-1)

# Cosine similarity
similarities = torch.matmul(audio_embeds, text_embeds.T).squeeze(0)

# Print ranked results
for text, score in sorted(zip(texts, similarities), key=lambda x: -x[1]):
    print(f"{text:<30} -> Similarity: {score.item():.4f}")


speak i beg without dread of my displeasure said frances returning the good humored smile of the trooper with the archness natural to her own sweet face the odors of your kitchen then cried lawton bluntly forbid my quitting the domains until i qualify myself to speak with more certainty concerning the fatness of the land -> Similarity: 0.5616
there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with there they say is no debt and there are a girl with -> Similarity: 0.5157
hello how are you              -> Similarity: 0.0358
this is a dog barking          -> Similarity: -0.1362


In [None]:
import kenlm
model = kenlm.Model("/raid/home/rajivratn/hemant_rajivratn/grpo/3-gram.pruned.1e-7.arpa.gz")

sentence = "this is a test".upper()
print(sentence)

# Each tuple means “this word had log10-prob logp given the previous context; ng_len is the n-gram order (e.g. 3 means a trigram was used), and oov marks out-of-vocabulary
for (logp, ng_len, oov) in model.full_scores(sentence, bos=False, eos=False):
    print(logp, ng_len, oov)

In [None]:
def unigram(sentences):
    for s in sentences:
        log10p = [model.score(w, bos=False, eos=False) for w in sentence.split() if model.vocab_index(w) != 0]

unigram([sentence])

In [None]:
from collections import Counter

def get_unigram_char_probs(sentences):
    # Flatten to a list of characters
    chars = [char for sentence in sentences for char in sentence.strip()]
    
    # Count frequency of each character
    char_counts = Counter(chars)
    total_chars = sum(char_counts.values())

    # Convert to probabilities
    char_probs = {char: count / total_chars for char, count in char_counts.items()}
    
    return char_probs, chars

# Example usage
sentences = [
    'THANK YOU AGAIN MISTER DEVANT HE SAID...',
    'FOR WEEKS IT NEVER CAME TO MY TURN...',
    'ONE MAN WON PAST ME INDEED DARTING...'
]

char_probs, chars = get_unigram_char_probs(sentences)

# Print sorted for readability
for char, prob in sorted(char_probs.items()):
    print(f"'{char}': {prob:.5f}")

In [1]:
with open("/raid/home/rajivratn/hemant_rajivratn/last/data/txt/train_norm.txt", "r") as f:
    out = [i for i in f.readlines() if len(i.strip()) > 0]

In [None]:
from tqdm import tqdm
def preprocess_char_lm(sentences):
    """Prep a list of sentences for char-level n-gram LM training."""
    preprocessed = []
    for s in tqdm(sentences):
        s = s.strip().replace(" ", "|")
        chars = list(s)
        line = "<s> " + " ".join(chars) + " </s>"
        preprocessed.append(line)
    return preprocessed

char_lm_lines = preprocess_char_lm(out)

100%|██████████| 40407482/40407482 [01:48<00:00, 373406.29it/s]


In [3]:
output_path = "char_lm_input.txt"

with open(output_path, "w", encoding="utf-8") as f:
    for line in char_lm_lines:
        f.write(line + "\n")

print(f"Saved to {output_path}")


Saved to char_lm_input.txt


In [None]:
import torch
@torch.jit.script
def beam_search(log_probs: torch.Tensor, beam_size: int):
    """
    Performs beam search on a tensor of log probabilities.

    Args:
        log_probs (torch.Tensor): Tensor of shape (b, t, v) containing log probabilities.
        beam_size (int): Number of beams to keep at each time step.

    Returns:
        sequences (torch.Tensor): Tensor of shape (b, beam_size, t) containing the top sequences.
        scores (torch.Tensor): Tensor of shape (b, beam_size) containing the scores of the top sequences.
    """
    
    b, t, v = log_probs.size()
    
    initial_beam_size = min(beam_size, v) # At the very first step (time step 0), we can't have more beams than the vocabulary size. This line ensures that the initial number of beams considered doesn't exceed the number of possible first tokens.

    topk_scores, topk_indices = torch.topk(log_probs[:, 0, :], initial_beam_size, dim=-1) # Returns the k largest elements of the given input tensor along a given dimension
    sequences = topk_indices.unsqueeze(-1)  # (b, initial_beam_size, 1)
    scores = topk_scores  # (b, initial_beam_size)

    for step in range(1, t):
        # Expand the current sequences with all possible next tokens
        current_log_probs = log_probs[:, step, :].unsqueeze(1)  # (b, 1, v)
        expanded_scores = scores.unsqueeze(-1) + current_log_probs  # (b, beam_size, v)
        flat_scores = expanded_scores.view(b, -1)  # (b, beam_size * v)

        # Select the top-k scores and their corresponding indices
        topk_flat_scores, topk_indices = flat_scores.topk(beam_size, dim=-1)  # (b, beam_size)
        beam_indices = topk_indices // v  # Indices of sequences to expand
        token_indices = topk_indices % v  # New tokens to append

        # Gather the sequences to expand and append the new tokens
        sequences = torch.gather(sequences, 1, beam_indices.unsqueeze(-1).expand(-1, -1, sequences.size(-1)))
        sequences = torch.cat([sequences, token_indices.unsqueeze(-1)], dim=-1)  # (b, beam_size, step+1)

        # Update the scores
        scores = topk_flat_scores

    return sequences, scores.unsqueeze(-1)



In [None]:
batch_size = 2
sequence_length = 5
vocab_size = 3
beam_size = 2

# Simulate log probabilities
log_probs = torch.randn(batch_size, sequence_length, vocab_size).log_softmax(dim=-1)
device = torch.device('cpu')
log_probs = log_probs.to(device)


In [None]:
log_probs[1,3:,:] = -float("1000000")

In [None]:
log_probs

In [None]:
# Perform beam search
sequences, scores = beam_search(log_probs, beam_size)

print("Top sequences:", sequences) # bsz, beamsize,seq_len
print("Scores:", scores) # bsz, beamsize,1

In [None]:
sequences.shape, scores.shape

In [None]:
path_probs = torch.gather(log_probs, 2, sequences.transpose(1,2)).transpose(1,2) # bsz, beamsize, T
path_probs

In [None]:
mean = scores.mean(dim=1, keepdim=True)
std = scores.std(dim=1, keepdim=True)

scores = (scores - mean) / std

In [None]:
path_probs*scores

In [None]:
sequences # bsz,beam,T

In [None]:
# If using PyTorch
import torch

import re
import numpy as np


vocab = [' ', "'", 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '?']
vocab = np.array(vocab)


# Precompile regex to remove blanks and collapse repeats
blank_char = re.escape('?')  # adjust if blank differs
remove_blanks = re.compile(blank_char)
collapse_repeats = re.compile(r'(.)\1+')

# Convert token sequences to strings using regex merge
def decode(arr, vocab):
    raw = [collapse_repeats.sub(r'\1', remove_blanks.sub('', ''.join(vocab[row]))) for row in arr]
    return raw


In [None]:
# Convert token sequences to strings using regex merge
def decode(arr, vocab):
    raw = [collapse_repeats.sub(r'\1', remove_blanks.sub('', ''.join(vocab[row]))) for row in arr]
    return raw

sentences = decode(sequences[0].cpu(), vocab)
sentences

In [None]:
sequences[0], vocab

In [None]:
arr = sequences[1,:,:]
arr

In [None]:
import numpy as np
vocab_arr = np.array(vocab)

decoded = [''.join(vocab_arr[row]) for row in arr]
print(decoded)  # ['abc', 'cba']


In [None]:
[collapse_repeats.sub(r'\1', remove_blanks.sub('', ''.join(vocab[row]))) for row in arr]

In [None]:
[''.join(vocab[row]) for row in arr]

In [None]:
decode(arr, vocab)

In [None]:
'-'.join(vocab[np.array(arr)])

In [None]:
import numpy as np

# Step 1: vocab and array of indices
vocab = ['a', 'b', 'c', 'd', 'e']
vocab_arr = np.array(vocab)  # shape (V,)
arr = np.array([
    [0, 1, 2],
    [2, 3, 4],
])  # shape (2, 3)

# Step 2: index vocab
char_matrix = vocab_arr[arr]  # shape (2, 3), dtype='<U1'

# Step 3: vectorized join — this is the key step!
joined = np.char.join('-', char_matrix)  # shape (2,), dtype='<U5' etc.

print(joined)  # Output: ['a-b-c' 'c-d-e']


In [None]:
decoded = ['-'.join(vocab[row]) for row in arr]
decoded

In [None]:

arr = torch.tensor([[0, 1, 2], [2, 1, 0]])
decoded = [''.join(vocab_arr[row]) for row in arr.numpy()]
print(decoded)  # ['abc', 'cba']


In [None]:
sentences = [decode_one(seq, vocab) for seq in sequences[0,:,:].cpu().numpy()]

In [None]:
# 1) move to CPU & to plain Python list of lists
sentences = []
for b in range(sequences.shape[0]):
    print(b)
    rows = sequences[b].cpu().tolist() # beam,T
    decoded_beams = [ctc_merge_string( ''.join(idx2char[i] for i in row) ) for row in rows]
    sentences.append(decoded_beams)
print(sentences)

In [None]:
sentences

In [None]:
def _decode_one( seq):
    chars = [vocab[i] for i in seq]
    raw = ''.join(chars)
    return raw


In [None]:
sequences.shape

In [None]:
_decode_one(sequences[0,0])

In [None]:
"""
1.  beamctc decoder
2. 


"""