In [1]:
import pandas as pd
import transformers
import tqdm
import torch
from reformer_pytorch import ReformerEncDec
from collections import defaultdict, Counter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data = "../data/cleaned/train_clean.csv"
val_data = "../data/cleaned/dev_clean.csv"
test_data = "../data/cleaned/test_clean.csv"

In [3]:
df_train = pd.read_csv(train_data)
df_val = pd.read_csv(val_data)
df_test = pd.read_csv(test_data)

In [4]:
df_train.head()

Unnamed: 0,transcript,episode_description
0,welcome to medicus a student run podcast about...,in this episode we sat down with a third year ...
1,hey what is up everybody and welcome back to t...,we are so excited to be back with you guys wit...
2,good morning my people i ll be going well in t...,can animals reduce stress or increase it i hav...
3,this is christy mathewson part of the texas a ...,surgery for biliary tract disease is among the...
4,welcome to episode number 2 of the av a moveme...,in this second episode of the podcast our co h...


In [5]:
df_val.head()

Unnamed: 0,transcript,episode_description
0,what s up you guys it s telling and ashley wit...,ashley and dallin discuss different approaches...
1,three two one and stop popping step podcast th...,today puff and steph talk about looking out fo...
2,you want to go all the content you can yeah ex...,what do aegee skopje aegee bratislava aegee ky...
3,i m talking about today five nights at freddy ...,finally 2 years since it has been revealed her...
4,hey good morning good afternoon and good eveni...,explicit language included for your benefit we...


In [6]:
df_test.head()

Unnamed: 0,transcript,episode_description
0,welcome back to another episode of tuxedo time...,today on the podcast we go on a journey we tal...
1,what s up guys this episode of the podcast is ...,ever wanted a podcast from your three favorite...
2,you are listening to irish illustrate insider ...,the irish illustrated insider crew discusses n...
3,you have tuned into irish illustrated insider ...,irish illustrated insider tackles nfl combine ...
4,what s up everybody welcome to the in the dome...,breaking down a classic calgary flames comebac...


In [7]:
print("Length of training data: ", len(df_train))
print("Length of validation data: ", len(df_val))
print("Length of test data: ", len(df_test))

Length of training data:  52396
Length of validation data:  2183
Length of test data:  1025


In [8]:
# tokenize the data
word_freq = defaultdict(int)
word_freq['PAD'] = 0
i = 1
for idx, row in df_train.iterrows():
    transcript = row['transcript'].split(" ")
    for word in transcript:
        if word not in word_freq:
            word_freq[word]= i
            i+=1

In [9]:
len(word_freq)

146002

In [10]:
word_freq["unk"] = 146002

In [11]:
def tokenize_data(text):
    max_size = 4096
    tokens = []
    for word in text[:4096]:
        if word in word_freq:
            tokens.append(word_freq[word])
        else:
            tokens.append(word_freq['unk'])
    if len(tokens)<4096:
        for i in range(len(tokens), 4096):
            tokens.append(word_freq['PAD'])
    return tokens

In [12]:
df_train['transcript'] = df_train['transcript'].apply(tokenize_data)
df_val['transcript'] = df_val['transcript'].apply(tokenize_data)
df_test['transcript'] = df_test['transcript'].apply(tokenize_data)

In [13]:
df_train

Unnamed: 0,transcript,episode_description
0,"[2776, 1856, 6535, 3096, 1957, 76, 1856, 14600...",in this episode we sat down with a third year ...
1,"[3055, 1856, 2024, 146002, 2776, 3055, 4, 81, ...",we are so excited to be back with you guys wit...
2,"[2157, 1957, 1957, 259, 146002, 76, 1957, 954,...",can animals reduce stress or increase it i hav...
3,"[81, 3055, 63, 203, 146002, 63, 203, 146002, 3...",surgery for biliary tract disease is among the...
4,"[2776, 1856, 6535, 3096, 1957, 76, 1856, 14600...",in this second episode of the podcast our co h...
...,...,...
52391,"[2024, 1957, 391, 146002, 4, 954, 1856, 146002...",omg may sex video ka most probably the first r...
52392,"[3055, 1856, 6535, 6535, 1957, 146002, 1856, 1...",where have i been and why i haven t done podca...
52393,"[2776, 1957, 2776, 146002, 2776, 1856, 146002,...",blake webber and steve welcome guest and world...
52394,"[2776, 1856, 146002, 2776, 1957, 391, 6535, 25...",once again we welcome becky to our podcast tod...


In [14]:
df_train = df_train.dropna()
df_test = df_test.dropna()

In [15]:
def tokenize_data(text):
    max_size = 4096
    tokens = []
    for word in text[:4096]:
        if word in word_freq:
            tokens.append(word_freq[word])
        else:
            tokens.append(word_freq['unk'])
    if len(tokens)<1024:
        for i in range(len(tokens), 4096):
            tokens.append(word_freq['PAD'])
    return tokens

In [16]:
df_train['episode_description'] = df_train['episode_description'].apply(tokenize_data)
df_val['episode_description'] = df_val['episode_description'].apply(tokenize_data)
df_test['episode_description'] = df_test['episode_description'].apply(tokenize_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['episode_description'] = df_train['episode_description'].apply(tokenize_data)


In [17]:
df_train.head()

Unnamed: 0,transcript,episode_description
0,"[2776, 1856, 6535, 3096, 1957, 76, 1856, 14600...","[63, 4637, 146002, 81, 3055, 63, 203, 146002, ..."
1,"[3055, 1856, 2024, 146002, 2776, 3055, 4, 81, ...","[2776, 1856, 146002, 4, 954, 1856, 146002, 203..."
2,"[2157, 1957, 1957, 259, 146002, 76, 1957, 954,...","[3096, 4, 4637, 146002, 4, 4637, 63, 76, 4, 65..."
3,"[81, 3055, 63, 203, 146002, 63, 203, 146002, 3...","[203, 391, 954, 2157, 1856, 954, 2024, 146002,..."
4,"[2776, 1856, 6535, 3096, 1957, 76, 1856, 14600...","[63, 4637, 146002, 81, 3055, 63, 203, 146002, ..."


In [18]:
train_doc = torch.empty((len(df_train), 4096))
for i,row in enumerate(df_train['transcript']):
    train_doc[i] = torch.tensor(row)

In [19]:
train_doc.requires_grad = True

In [20]:
train_doc.shape

torch.Size([52381, 4096])

In [21]:
train_sum = torch.empty((len(train_doc), 4096))
for i,row in enumerate(df_train['episode_description']):
    train_sum[i] = torch.tensor(row)

In [22]:
train_sum.requires_grad=True

In [23]:
train_sum.shape

torch.Size([52381, 4096])

In [31]:
import torch
from reformer_pytorch import ReformerEncDec

DE_SEQ_LEN = 4096
EN_SEQ_LEN = 4096

enc_dec = ReformerEncDec(
    dim = 512,
    enc_num_tokens = 146003,
    enc_depth = 6,
    enc_max_seq_len = DE_SEQ_LEN,
    dec_num_tokens = 146003,
    dec_depth = 6,
    dec_max_seq_len = EN_SEQ_LEN
).cuda()

# train_seq_in = torch.randint(0, 20000, (2, DE_SEQ_LEN)).long()
# train_seq_out = torch.randint(0, 20000, (2, EN_SEQ_LEN)).long()
batch_size = 32
for i in range(0,len(train_doc), batch_size):
    doc = train_doc[i:i+batch_size]
    summ = train_sum[i:i+batch_size]
    train_seq_in = doc.long().cuda()
    train_seq_out = summ.long().cuda()
    print(train_seq_in.shape)
    print(train_seq_out.shape)
    input_mask = torch.ones(len(doc), DE_SEQ_LEN).bool().cuda()

    loss = enc_dec(train_seq_in, train_seq_out, return_loss = True, enc_input_mask = input_mask)
    print("Loss: ", loss)
    loss.backward()
# # learn

# # evaluate with the following
# eval_seq_in = torch.randint(0, 20000, (1, DE_SEQ_LEN)).long().cuda()
# eval_seq_out_start = torch.tensor([[0.]]).long().cuda() # assume 0 is id of start token
# samples = enc_dec.generate(eval_seq_in, eval_seq_out_start, seq_len = EN_SEQ_LEN, eos_token = 1) # assume 1 is id of stop token
# print(samples.shape) # (1, <= 1024) decode the tokens

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.