In [3]:
import pandas as pd
from torchaudio.transforms import LFCC
import numpy as np

In [4]:
from transformers import AutoTokenizer, DistilBertModel
import torch

# Load the pretrained tokenizer for the "distilbert-base-uncased" checkpoint,
# the same checkpoint name the DistilBERT model is loaded from later on.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [5]:
# Load the blog corpus; the rendered frame shows the columns id, text and split.
# NOTE(review): the `Unnamed: 0` header in the rendered output suggests the CSV may
# have been written together with its index -- confirm whether index_col=0 is wanted.
blogs = pd.read_csv('datasets/blog8965.csv')
blogs

Unnamed: 0,id,text,split
0,2032593,Everyone hates you. You should actually...,train
1,3350981,I'm not very consistent with this journ...,train
2,1930446,"Hi everyone, It was good to see all...",train
3,322624,destinie's being so produtive. ......,train
4,3464451,I'm finding my way back to sanity again...,train
...,...,...,...
542293,788927,I would like to do two things right now...,test
542294,2495927,I guess it just occured to me that ...,test
542295,756402,Life has def. been of the crazy lately....,test
542296,3707606,First full day off.... boring lol. ...,test


In [None]:
# Keep only the posts that have more than 38 whitespace-separated words.
# Each value goes through str() before splitting so that a non-string entry
# (for example a NaN read from an empty CSV field) is counted instead of making
# the unbound `str.split` raise TypeError. The trailing .copy() makes `longer`
# an independent frame, so assigning to its columns later does not trigger
# pandas' SettingWithCopyWarning.
longer = blogs[blogs["text"].apply(lambda text: len(str(text).split())) > 38].copy()

In [83]:
def random_truncate(text, max_words=511):
    """Cut `text` down to a random contiguous window of `max_words` words.

    Words are the whitespace-separated pieces of ``str(text)``. The window
    start is drawn from NumPy's global random state, so seed it
    (``np.random.seed``) beforehand for reproducible output.

    Args:
        text: Value to truncate. Non-string values are measured and, if
            needed, truncated through their ``str()`` representation.
        max_words: Largest number of words to keep (positive int). Defaults
            to 511.

    Returns:
        `text` itself, untouched, when it has at most `max_words` words.
        Otherwise a new string holding exactly `max_words` consecutive words
        joined by single spaces.
    """
    # Split once and reuse the result for both the length check and the slice.
    words = str(text).split()
    excess = len(words) - max_words
    if excess <= 0:
        return text
    # randint's upper bound is exclusive; the valid start positions are
    # 0..excess inclusive, so that the final word can also be selected.
    start = np.random.randint(excess + 1)
    return ' '.join(words[start:start + max_words])
        

In [82]:
# Truncate over-long posts with random_truncate, then persist the result.
# .assign() builds a new frame instead of writing a column into `longer`, which
# is a filtered selection of `blogs`; this avoids pandas' SettingWithCopyWarning.
# NOTE(review): random_truncate draws from NumPy's global RNG, so the saved file
# differs between runs unless a seed is set beforehand.
# NOTE(review): despite the `_tokenized` file name, this cell as written stores
# truncated raw text, not token ids.
longer = longer.assign(text=longer["text"].apply(random_truncate))
longer.to_csv('datasets/blog8965_tokenized.csv')

ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [84]:
# Read back the CSV stored under the `_tokenized` name.
# NOTE(review): if that file was written with DataFrame.to_csv's default index,
# this frame presumably carries an extra `Unnamed: 0` column -- confirm, or pass
# index_col=0.
tokenized = pd.read_csv('datasets/blog8965_tokenized.csv')

In [87]:
# Sanity check: dimensions of the gzipped corpus, followed by the number of
# rows in `tokenized` whose split is "test".
try_data = pd.read_csv('datasets/blog8965.csv.gz')
print(try_data.shape)
print((tokenized["split"] == "test").sum())

(542298, 3)
45641


In [89]:
# Every row that is not in the training split, without the `split` column and
# with a fresh 0..n-1 index, then converted to a plain NumPy array.
# NOTE(review): the saved output of this cell shows token-id strings in the text
# column, so it was presumably produced from a different frame than `blogs` --
# re-run before trusting it.
test = (
    blogs[blogs["split"] != "train"]
    .drop(columns="split")
    .reset_index(drop=True)
)
dataset = test.to_numpy()
dataset

array([[488068, 2521983,
        "{'input_ids': [101, 2073, 2024, 2017, 5557, 1029, 2070, 1997, 1045, 3246, 2031, 2657, 5557, 23699, 1012, 2065, 2025, 2059, 2017, 2031, 4771, 2041, 1012, 2023, 2158, 2018, 1037, 2200, 4310, 3168, 1997, 8562, 1012, 2009, 2001, 17704, 2066, 1037, 26418, 2021, 2196, 1037, 8595, 2240, 1012, 2002, 2001, 2006, 1055, 20554, 1006, 5095, 2305, 2444, 1007, 1010, 10095, 1010, 1998, 3365, 3922, 2006, 2397, 2305, 2831, 3065, 1012, 2002, 2036, 2018, 2019, 11477, 1011, 13059, 4116, 16271, 1006, 1999, 27263, 2000, 1996, 2157, 1007, 1012, 2002, 2036, 3046, 2000, 2191, 2111, 4025, 2008, 2002, 2347, 1005, 1056, 16271, 2011, 2383, 2010, 2190, 2767, 3960, 1062, 12274, 2850, 4685, 2004, 16271, 1998, 23699, 2052, 7180, 2006, 2754, 10514, 18098, 9355, 3071, 1012, 2092, 2002, 2864, 4490, 2035, 2105, 1996, 2163, 1012, 2002, 2079, 2195, 12741, 2004, 1037, 1005, 2005, 28872, 2078, 2158, 1005, 1010, 12280, 1010, 2002, 2052, 5423, 1011, 26351, 2205, 1996, 10478, 8000, 4323, 2299, 10

In [37]:
# Distinct values of the first column of `dataset`, cast to int.
# NOTE(review): in the saved `dataset` output the first column (488068) looks
# like a stored row index and the second (2521983) like the `id` value --
# confirm that column 0 is really the one intended here.
authors = np.unique(dataset[:, 0]).astype(int)
authors

In [38]:
# Tokenize every post on its own (values are coerced to str first) and collect
# the per-post encodings in an object array.
# NOTE(review): no truncation is requested, so long posts may produce more ids
# than the model's 512 position embeddings -- confirm this is acceptable.
data = (
    blogs["text"]
    .apply(str)
    .apply(tokenizer)
    .to_numpy()
)
data

array([{'input_ids': [101, 3071, 16424, 2017, 1012, 2017, 2323, 2941, 5136, 17573, 2007, 2115, 15174, 2121, 1012, 2123, 1005, 1056, 5293, 2000, 13354, 2009, 1999, 999, 18401, 2015, 2007, 26735, 2015, 1010, 21368, 2401, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
       {'input_ids': [101, 1045, 1005, 1049, 2025, 2200, 8335, 2007, 2023, 3485, 2518, 1010, 2572, 1045, 1029, 1045, 2074, 2359, 2000, 2360, 7632, 1998, 2292, 2017, 4364, 2113, 1045, 1005, 1049, 2145, 5505, 1012, 1012, 1012, 4066, 1997, 1012, 1045, 1005, 2310, 2042, 2012, 2026, 2905, 1005, 1055, 2160, 2652, 2568, 3238, 2678, 2399, 2005, 2146, 6993, 1997, 2051, 1012, 1998, 1045, 1005, 1049, 2188, 2005, 1996, 2279, 2261, 2420, 2030, 2061, 1012, 1012, 1012, 2138, 1045, 2123, 1005, 1056, 2215, 2000, 2175, 2067, 2157, 2085, 1012, 1045, 1005, 2310, 2042, 2770, 2013, 2870, 2005, 2205, 2146, 1010, 1045, 2064, 1005, 1056, 2562, 2770, 1012, 2009, 1005, 1055

In [39]:
# First ten blogs: coerce each to str and tokenize it separately. Every
# resulting encoding exposes `input_ids` and `attention_mask`.
blogs["text"].head(10).apply(str).apply(tokenizer)

0    [input_ids, attention_mask]
1    [input_ids, attention_mask]
2    [input_ids, attention_mask]
3    [input_ids, attention_mask]
4    [input_ids, attention_mask]
5    [input_ids, attention_mask]
6    [input_ids, attention_mask]
7    [input_ids, attention_mask]
8    [input_ids, attention_mask]
9    [input_ids, attention_mask]
Name: text, dtype: object

In [40]:
# Sorted distinct values found in column 0 of `t_num`.
# NOTE(review): `t_num` is not defined in any cell visible here; unless it is
# defined elsewhere, this cell raises NameError on a fresh kernel -- restore or
# add its definition above.
ids = np.unique(t_num[:,0])
ids

array([5114, 7596, 8173, ..., 4325889, 4326560, 4334761], dtype=object)

In [41]:
# Row positions in `t_num` whose first column equals 5114 (the first entry of `ids`).
np.where((t_num[:,0]==5114))[0]

array([17059, 20235, 31640, 37654])

In [42]:
# Map every distinct value of column 0 of `t_num` to the array of row positions
# that carry it. A comprehension keeps the iteration variable local, so the
# builtin `id()` is not rebound in the notebook namespace.
grouped_examples = {
    example_id: np.where(t_num[:, 0] == example_id)[0]
    for example_id in ids
}
grouped_examples

{5114: array([17059, 20235, 31640, 37654]),
 7596: array([ 2564, 15150]),
 8173: array([ 1489,  2124,  2495,  2617,  3546,  3794,  4102,  4121,  4258,
         4768,  5163,  5338,  6963,  7263,  7765,  9170,  9484, 10186,
        10415, 10813, 11554, 11688, 13098, 14033, 17069, 17515, 17599,
        17643, 17902, 19545, 20710, 20789, 20832, 21636, 21946, 22598,
        23148, 23972, 24527, 25261, 25716, 26575, 27582, 28382, 28746,
        29549, 29567, 30606, 31654, 32440, 32517, 32677, 32695, 32952,
        33152, 33450, 34756, 35856, 38262, 39209, 40285, 40415, 40422,
        40760, 40935, 43095, 44267, 44520, 45364, 45542]),
 8349: array([2318, 3804, 5324]),
 9289: array([  317,  1150,  2498,  9350, 14546, 17476, 27723, 37476]),
 9470: array([ 8751, 17497, 18785, 19524, 25490, 26248, 31996, 33162, 37116,
        41352, 42688]),
 11762: array([23907, 44614]),
 15365: array([  852,  1213,  2517,  3036,  4496,  4931,  5005,  6075,  8668,
        11983, 12046, 12383, 12392, 14496, 14670

In [43]:
# Number of rows in `t_num`.
t_num.shape[0]

45641

In [44]:
# Split the training rows into two parallel tuples: post texts and `id` values.
# A single .loc[rows, columns] selection replaces chained indexing, and reading
# the two columns directly avoids transposing every row through
# zip(*itertuples()). It also yields two empty tuples, rather than a ValueError
# on unpacking, when no row belongs to the training split.
train_rows = blogs.loc[blogs["split"] == "train", ["text", "id"]]
train_text, train_label = tuple(train_rows["text"]), tuple(train_rows["id"])

In [45]:

# Load the pretrained DistilBERT encoder and push one sentence through it.
bert = DistilBertModel.from_pretrained("distilbert-base-uncased")

# return_tensors="pt" makes the tokenizer return PyTorch tensors with a batch axis.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = bert(**inputs)

# Per-token hidden vectors; for this sentence the shape is (1, 8, 768).
last_hidden_states = outputs.last_hidden_state

In [46]:
# Display the module tree of the loaded model.
bert


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): Li

In [47]:
# Check that the loaded model is a regular torch.nn.Module.
isinstance(bert, torch.nn.Module)

True

In [48]:
# Run the forward pass again on the current `inputs`.
# NOTE(review): `out` is not referenced by any later cell visible here.
out = bert(**inputs)

In [49]:
# Shape of the hidden states taken from the single-sentence forward pass.
last_hidden_states.shape

torch.Size([1, 8, 768])

In [50]:
# Display the tokenizer output: `input_ids` and `attention_mask` tensors.
inputs

{'input_ids': tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [51]:
# Build every word-level prefix of one post ("w1", "w1 w2", ...) and embed all
# prefixes as a single padded batch.
str1 = ("       destinie's being so produtive.      ... i made waffles - - - is that being productive, or just hungry?  o well, i'm off to make more waffles.         ")
words = str1.split()
print(len(words))
# One prefix per word, from the first word alone up to the whole post.
subs = [' '.join(words[:end]) for end in range(1, len(words) + 1)]

# padding=True pads every prefix to the longest one in the batch.
inputs = tokenizer(subs, return_tensors="pt", padding=True, truncation=True)
print(inputs['input_ids'].shape)
# (number of prefixes, padded token length) -- (26, 47) for this post
outputs = bert(**inputs).last_hidden_state
print(outputs.shape)
# (number of prefixes, padded token length, hidden size) -- (26, 47, 768) for this post;
# the token length exceeds the word count because punctuation and word pieces get their own ids.

26
torch.Size([26, 47])
torch.Size([26, 47, 768])


In [52]:
# Padded token ids of the prefix batch; shorter prefixes end in 0 padding.
inputs['input_ids']

tensor([[ 101, 4078, 7629,  ...,    0,    0,    0],
        [ 101, 4078, 7629,  ...,    0,    0,    0],
        [ 101, 4078, 7629,  ...,    0,    0,    0],
        ...,
        [ 101, 4078, 7629,  ...,    0,    0,    0],
        [ 101, 4078, 7629,  ...,    0,    0,    0],
        [ 101, 4078, 7629,  ..., 2015, 1012,  102]])

In [53]:
# Display the hidden-state tensor computed for the prefix batch.
outputs

tensor([[[-2.1546e-01, -1.0098e-01, -4.9310e-02,  ..., -5.9793e-02,
           2.1690e-01,  3.5028e-01],
         [ 3.3522e-01, -3.9764e-01, -3.3966e-02,  ..., -2.0957e-01,
           6.0333e-01,  3.3950e-01],
         [-6.9940e-02, -2.9391e-01,  3.0538e-02,  ..., -1.2230e-01,
           1.9144e-01,  9.7167e-02],
         ...,
         [-1.8653e-02, -7.3388e-02, -9.4943e-02,  ...,  9.4236e-02,
           1.7227e-02,  2.5121e-01],
         [-4.2073e-02,  5.9375e-02, -2.5555e-01,  ...,  1.9251e-01,
          -7.8020e-02,  4.1499e-01],
         [-5.2312e-02,  8.0215e-02, -2.7838e-01,  ...,  1.9307e-01,
          -8.8466e-02,  3.9446e-01]],

        [[-1.9067e-01, -1.3462e-01, -6.4990e-02,  ..., -5.1245e-02,
           2.6483e-01,  3.2349e-01],
         [ 3.8631e-01, -4.0439e-01,  7.6627e-02,  ..., -2.8961e-01,
           6.2650e-01,  3.3368e-01],
         [ 7.8836e-03, -2.1649e-01,  1.1352e-01,  ..., -1.9346e-01,
           2.2955e-01,  7.5553e-02],
         ...,
         [ 7.2044e-02, -1

In [54]:
# Swap axes 1 and 2 using the functional form:
# (26, 47, 768) -> (26, 768, 47) for the prefix batch.
t1 = torch.transpose(outputs, 1, 2)
t1

tensor([[[-2.1546e-01,  3.3522e-01, -6.9940e-02,  ..., -1.8653e-02,
          -4.2073e-02, -5.2312e-02],
         [-1.0098e-01, -3.9764e-01, -2.9391e-01,  ..., -7.3388e-02,
           5.9375e-02,  8.0215e-02],
         [-4.9310e-02, -3.3966e-02,  3.0538e-02,  ..., -9.4943e-02,
          -2.5555e-01, -2.7838e-01],
         ...,
         [-5.9793e-02, -2.0957e-01, -1.2230e-01,  ...,  9.4236e-02,
           1.9251e-01,  1.9307e-01],
         [ 2.1690e-01,  6.0333e-01,  1.9144e-01,  ...,  1.7227e-02,
          -7.8020e-02, -8.8466e-02],
         [ 3.5028e-01,  3.3950e-01,  9.7167e-02,  ...,  2.5121e-01,
           4.1499e-01,  3.9446e-01]],

        [[-1.9067e-01,  3.8631e-01,  7.8836e-03,  ...,  7.2044e-02,
          -1.2959e-04,  1.1087e-01],
         [-1.3462e-01, -4.0439e-01, -2.1649e-01,  ..., -1.0681e-01,
          -1.3243e-01, -4.4435e-02],
         [-6.4990e-02,  7.6627e-02,  1.1352e-01,  ...,  5.5805e-03,
          -4.3928e-02, -8.1176e-02],
         ...,
         [-5.1245e-02, -2

In [55]:
# Same axis swap through the tensor method, to compare with the functional form.
t2 = outputs.transpose(1, 2)
t2

tensor([[[-2.1546e-01,  3.3522e-01, -6.9940e-02,  ..., -1.8653e-02,
          -4.2073e-02, -5.2312e-02],
         [-1.0098e-01, -3.9764e-01, -2.9391e-01,  ..., -7.3388e-02,
           5.9375e-02,  8.0215e-02],
         [-4.9310e-02, -3.3966e-02,  3.0538e-02,  ..., -9.4943e-02,
          -2.5555e-01, -2.7838e-01],
         ...,
         [-5.9793e-02, -2.0957e-01, -1.2230e-01,  ...,  9.4236e-02,
           1.9251e-01,  1.9307e-01],
         [ 2.1690e-01,  6.0333e-01,  1.9144e-01,  ...,  1.7227e-02,
          -7.8020e-02, -8.8466e-02],
         [ 3.5028e-01,  3.3950e-01,  9.7167e-02,  ...,  2.5121e-01,
           4.1499e-01,  3.9446e-01]],

        [[-1.9067e-01,  3.8631e-01,  7.8836e-03,  ...,  7.2044e-02,
          -1.2959e-04,  1.1087e-01],
         [-1.3462e-01, -4.0439e-01, -2.1649e-01,  ..., -1.0681e-01,
          -1.3243e-01, -4.4435e-02],
         [-6.4990e-02,  7.6627e-02,  1.1352e-01,  ...,  5.5805e-03,
          -4.3928e-02, -8.1176e-02],
         ...,
         [-5.1245e-02, -2

In [56]:
# Confirm that both transpose spellings give identical tensors.
torch.equal(t1, t2)

True

In [57]:
# Shape before the axis swap.
outputs.shape

torch.Size([26, 47, 768])

In [58]:
# Shape after the axis swap.
t1.shape

torch.Size([26, 768, 47])

In [59]:
# torchaudio's LFCC transform, set up for 40 coefficients with an 80-point FFT
# (passed through to the underlying spectrogram via `speckwargs`).
# In the following cells it is applied to DistilBERT hidden states, not audio.
cepstral = LFCC(n_lfcc=40, speckwargs={'n_fft': 80})

In [60]:
# Apply the transform to the un-transposed hidden states; the result has shape
# (26, 47, 40, 20), i.e. the last input axis is replaced by two new axes.
c1 = cepstral(outputs)

In [61]:
# Shape of the LFCC result for the un-transposed input.
c1.shape

torch.Size([26, 47, 40, 20])

In [62]:
# Apply the transform to the transposed tensor, so it runs along the token
# axis (length 47) of every hidden dimension; result shape is (26, 768, 40, 2).
c2 = cepstral(t1)
c2.shape

torch.Size([26, 768, 40, 2])

In [63]:
# Average over the last axis (size 2), leaving (26, 768, 40).
c3 = c2.mean(3)
c3.shape

torch.Size([26, 768, 40])

In [64]:
# Flatten everything after the batch axis into one feature vector per prefix.
c4 = c3.flatten(1)

In [65]:
# 768 * 40 = 30720 features per prefix.
c4.shape

torch.Size([26, 30720])

In [34]:
from torchaudio.transforms import Spectrogram

In [36]:
# Load the "distilbert-base-uncased" tokenizer.
# NOTE(review): an identical statement appears earlier in this notebook; one of
# the two loads is redundant when the notebook is run top to bottom.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [53]:
# Tokenize one short text; padding='max_length' pads the ids to a fixed length,
# which is 512 here according to the displayed shape.
inputs = tokenizer("Hello, my dog is cuter than yours. Do you like my dog? Please pet him. He enjoys attention", return_tensors="pt", padding='max_length', truncation=True)
inputs['input_ids'].shape

torch.Size([1, 512])

In [54]:
# Load the pretrained DistilBERT encoder.
# NOTE(review): an identical statement appears earlier in this notebook; one of
# the two loads is redundant when the notebook is run top to bottom.
bert = DistilBertModel.from_pretrained("distilbert-base-uncased")

In [58]:
# Run the padded input through the model; the attention mask is passed along
# with the ids. The resulting hidden states have shape (1, 512, 768).
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
x = bert(input_ids, attention_mask=attention_mask).last_hidden_state
x.shape

torch.Size([1, 512, 768])

In [59]:
# Now x is a tensor of shape (batch_size, sequence_length, hidden_size)
# I want x to be (batch_size, hidden_size=768, sequence_length=512)
# NOTE(review): this rebinds `x` to its own transpose, so running the cell a
# second time swaps the axes back -- re-run the cell that creates `x` first.
x = x.transpose(1, 2)
x.shape

torch.Size([1, 768, 512])

In [64]:
# torchaudio spectrogram transform with a 512-point FFT.
spec = Spectrogram(n_fft=512)

In [65]:
# Spectrogram of the transposed hidden states, taken along the last (sequence)
# axis of `x`; the result has shape (1, 768, 257, 3).
s = spec(x)
s.shape

torch.Size([1, 768, 257, 3])