In [1]:
import pickle 
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch.nn as nn

def load_pickle(pickle_file):
    """Read and return the object stored in the pickle file ``pickle_file``.

    The file is first unpickled with the default settings. If that raises
    ``UnicodeDecodeError`` the file is reopened and unpickled again with
    ``encoding='latin1'``. Any other failure of the first attempt is printed
    and then re-raised unchanged.
    """
    try:
        with open(pickle_file, 'rb') as handle:
            return pickle.load(handle)
    except UnicodeDecodeError:
        # Second attempt: decode legacy byte strings as latin1.
        with open(pickle_file, 'rb') as handle:
            return pickle.load(handle, encoding='latin1')
    except Exception as err:
        print('Unable to load data ', pickle_file, ':', err)
        raise


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
'''
You can set the maximum number of sentences in the context and the maximum number of words in any sentence.

It will do left padding . It will concatenate the word embedding + covarep features + openface features

example:

if max_sen_len = 20 then the punchline sentence dimension = 20 * 752. 
    where 752 = word embedding (300) + covarep (81) + openface(371)  

if max_sen_len = 20 and max_context_len = 5 that means context can have maximum 5 sentences 
and each sentence will have maximum 20 words. The context dimension will be 5 * 20 * 752 

We left-pad with zeros to maintain the same dimensions.

In our experiments we set max_sen_len = 20 & max_context_len = 5 
'''


class HumorDataset(Dataset):
    """Word-aligned multimodal dataset built from the pickled feature files.

    Indexing returns a ``(x_p, x_c, y)`` triple of float tensors:

    * ``x_p`` -- punchline features, shape ``(max_sen_len, total_dim)``
    * ``x_c`` -- context features, shape
      ``(max_context_len, max_sen_len, total_dim)``
    * ``y``   -- humor label, shape ``(1,)``

    Every word position is the concatenation, in this order, of the word
    embedding, the covarep features and the openface features. Sentences are
    truncated to their first ``max_sen_len`` words and left-padded; contexts
    keep their last ``max_context_len`` sentences and are left-padded with
    all-zero sentences.
    """

    def __init__(self, id_list, max_context_len=5, max_sen_len=20, data_dir=""):
        """Load all feature dictionaries and store the padding limits.

        Args:
            id_list: sample ids; each one is used as a key into every loaded
                dictionary.
            max_context_len: maximum number of context sentences per sample.
            max_sen_len: maximum number of words per sentence.
            data_dir: directory that contains the five ``*.pkl`` files. The
                default (empty string) keeps the previous behaviour of
                resolving the file names against the current working
                directory.
        """
        # Local import: ``os`` is not part of the notebook's import cell.
        import os

        self.id_list = id_list
        openface_file = "openface_features_sdk.pkl"
        covarep_file = "covarep_features_sdk.pkl"
        language_file = "language_sdk.pkl"
        word_embedding_list_file = "word_embedding_list.pkl"
        humor_label_file = "humor_label_sdk.pkl"

        self.word_aligned_openface_sdk = load_pickle(os.path.join(data_dir, openface_file))
        self.word_aligned_covarep_sdk = load_pickle(os.path.join(data_dir, covarep_file))
        self.language_sdk = load_pickle(os.path.join(data_dir, language_file))
        self.word_embedding_list_sdk = load_pickle(os.path.join(data_dir, word_embedding_list_file))
        self.humor_label_sdk = load_pickle(os.path.join(data_dir, humor_label_file))
        self.of_d = 371
        self.cvp_d = 81
        self.glove_d = 300
        self.total_dim = self.glove_d + self.of_d + self.cvp_d
        self.max_context_len = max_context_len
        self.max_sen_len = max_sen_len

    @staticmethod
    def _left_pad_features(seq, max_sen_len, dim):
        """Truncate ``seq`` to ``max_sen_len`` rows and left-pad it with zero rows of width ``dim``."""
        feats = np.asarray(seq[0:max_sen_len], dtype=float)
        if feats.size == 0:
            # An empty sequence carries no width information; give it one so
            # that the concatenation below still works.
            feats = np.zeros((0, dim))
        padding = np.zeros((max_sen_len - len(feats), dim))
        return np.concatenate((padding, feats), axis=0)

    # Left padding up to the maximum number of words in a sentence; one embedding row per word.
    def paded_word_idx(self, seq, max_sen_len=20, left_pad=1):
        """Map word indexes to embeddings, left-padded to ``max_sen_len`` words.

        Padding positions use index 0 of the embedding list (presumably the
        all-zero padding vector -- TODO confirm against the embedding file).
        ``left_pad`` is accepted for interface compatibility and is unused.
        """
        seq = seq[0:max_sen_len]
        indexes = [0] * (max_sen_len - len(seq)) + [int(w_id) for w_id in seq]
        return np.array([self.word_embedding_list_sdk[w_id] for w_id in indexes])

    # Left padding with zero vectors up to maximum number of words in a sentence * covarep dimension.
    def padded_covarep_features(self, seq, max_sen_len=20, left_pad=1):
        """Return covarep features as a ``(max_sen_len, cvp_d)`` array (``left_pad`` is unused)."""
        return self._left_pad_features(seq, max_sen_len, self.cvp_d)

    # Left padding with zero vectors up to maximum number of words in a sentence * openface dimension.
    def padded_openface_features(self, seq, max_sen_len=20, left_pad=1):
        """Return openface features as a ``(max_sen_len, of_d)`` array (``left_pad`` is unused)."""
        return self._left_pad_features(seq, max_sen_len, self.of_d)

    # Left padding with zero sentences up to
    # maximum number of context sentences * maximum number of words in a sentence * total_dim.
    def padded_context_features(self, context_w, context_of, context_cvp, max_context_len=5, max_sen_len=20):
        """Build the ``(max_context_len, max_sen_len, total_dim)`` context array.

        Only the last ``max_context_len`` sentences are kept. The three inputs
        are parallel per-sentence sequences (word indexes, openface features,
        covarep features) and may be ragged, i.e. sentences may differ in
        length.
        """
        context_w = context_w[-max_context_len:]
        context_of = context_of[-max_context_len:]
        context_cvp = context_cvp[-max_context_len:]

        padded_context = [
            self.padded_punchline_features(context_w[i], context_of[i], context_cvp[i], max_sen_len)
            for i in range(len(context_w))
        ]

        # No context at all: the result is pure padding.
        if not padded_context:
            return np.zeros((max_context_len, max_sen_len, self.total_dim))

        pad_c_len = max_context_len - len(padded_context)
        sentence_padding = np.zeros((pad_c_len, max_sen_len, self.total_dim))
        return np.concatenate((sentence_padding, np.array(padded_context)), axis=0)

    def padded_punchline_features(self, punchline_w, punchline_of, punchline_cvp, max_sen_len=20, left_pad=1):
        """Concatenate word, covarep and openface features of one sentence.

        Returns an array with ``max_sen_len`` rows whose columns are laid out
        as ``[embedding | covarep | openface]``. ``left_pad`` is unused.
        """
        p_seq_w = self.paded_word_idx(punchline_w, max_sen_len)
        p_seq_cvp = self.padded_covarep_features(punchline_cvp, max_sen_len)
        p_seq_of = self.padded_openface_features(punchline_of, max_sen_len)
        return np.concatenate((p_seq_w, p_seq_cvp, p_seq_of), axis=1)

    def __len__(self):
        """Number of sample ids in this dataset."""
        return len(self.id_list)

    def __getitem__(self, index):
        """Return ``(x_p, x_c, y)`` for the sample at position ``index`` of ``id_list``."""
        hid = self.id_list[index]
        language = self.language_sdk[hid]
        openface = self.word_aligned_openface_sdk[hid]
        covarep = self.word_aligned_covarep_sdk[hid]

        # The per-sentence lists are passed through as-is: wrapping a ragged
        # context in np.array() raises on recent NumPy versions.
        punchline = self.padded_punchline_features(
            language['punchline_embedding_indexes'],
            openface['punchline_features'],
            covarep['punchline_features'],
            self.max_sen_len,
        )
        context = self.padded_context_features(
            language['context_embedding_indexes'],
            openface['context_features'],
            covarep['context_features'],
            self.max_context_len,
            self.max_sen_len,
        )

        # The features are real-valued, so they must be float tensors; an
        # integer tensor would truncate every value towards zero.
        x_p = torch.as_tensor(punchline, dtype=torch.float32)
        x_c = torch.as_tensor(context, dtype=torch.float32)

        y = torch.FloatTensor([self.humor_label_sdk[hid]])

        return x_p, x_c, y
        

In [7]:
# Load the predefined split and pull out its three folds.
data_folds_file = "/home/aobolens/urfunny/data_folds.pkl"
data_folds = load_pickle(data_folds_file)
train, dev, test = (data_folds[fold] for fold in ('train', 'dev', 'test'))

994

In [8]:
language_sdk = load_pickle("/home/aobolens/urfunny/language_sdk.pkl")
# Dump every entry to a text file: the key on one line, str() of the value on the next.
with open('language.txt', 'w') as out:
    for sample_id, entry in language_sdk.items():
        print(sample_id, file=out)
        print(entry, file=out)

In [8]:
# Second element of the last punchline interval, per sample id.
ends = {
    sample_id: entry['punchline_intervals'][-1][1]
    for sample_id, entry in language_sdk.items()
}

# Largest such value over all samples.
max(ends.values())

463.986

In [19]:
# List every key of the language dictionary, one per line.
for sample_id in language_sdk.keys():
    print(sample_id)

1
3
4
5
7
9
10
11
12
13
15
16
17
18
19
20
22
24
25
26
27
29
30
31
32
33
35
36
38
39
40
41
43
45
46
48
49
50
51
52
53
54
55
56
57
59
61
62
63
66
68
69
70
71
72
75
76
77
78
82
83
84
85
86
87
88
89
90
91
92
94
95
96
98
99
100
101
103
104
108
109
110
112
113
114
117
118
120
121
122
123
125
127
128
129
131
133
134
135
136
137
139
140
142
144
145
146
148
151
152
153
154
156
158
159
160
162
163
165
166
167
168
170
171
173
174
176
177
178
181
182
183
184
186
187
188
189
190
191
196
197
198
199
200
201
202
204
205
206
207
208
209
210
212
213
214
215
216
217
218
220
221
223
224
225
227
228
229
232
235
236
237
239
240
242
245
248
250
252
253
254
255
256
259
260
262
263
265
269
279
280
281
282
284
286
287
288
290
291
327
332
333
334
335
336
337
338
340
341
342
343
344
345
346
347
349
351
354
355
356
358
359
361
362
363
364
368
369
370
371
372
373
374
376
377
378
379
380
381
382
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
400
401
402
403
404
405
406
407
408
411
412
413
414
415
416
4

In [27]:
import pandas as pd

labels_sdk = load_pickle("/home/aobolens/urfunny/humor_label_sdk.pkl")
assert len(labels_sdk) == len(language_sdk)

# One column per list; every list follows the order of the ids in `dev`.
idx = list(dev)
labels = [labels_sdk[sample_id] for sample_id in idx]
# Full text = all context sentences followed by the punchline, space-separated.
texts = [
    ' '.join(language_sdk[sample_id]['context_sentences'])
    + ' '
    + language_sdk[sample_id]['punchline_sentence']
    for sample_id in idx
]
# Last element of the last punchline interval.
ends = [language_sdk[sample_id]['punchline_intervals'][-1][-1] for sample_id in idx]

data = pd.DataFrame.from_dict({'idx': idx, 'label': labels, 'text': texts, 'end': ends})
data.to_csv('dev.csv', index=False)

In [19]:
# Size of the training fold.
len(train)

7614

In [1]:
# NOTE(review): this cell relies on `os`, `load_pickle` and `MEGA_WINDOW_SIZE`
# already being defined in the kernel; none of them is defined in this cell.
language_sdk = load_pickle(os.environ['LANGUAGE_PATH'])
print('len before', len(language_sdk))
# Keep only the samples whose last punchline interval ends after MEGA_WINDOW_SIZE + 1.
language_sdk = dict(
    (sample_id, entry)
    for sample_id, entry in language_sdk.items()
    if entry['punchline_intervals'][-1][-1] > MEGA_WINDOW_SIZE + 1
)
print('len after', len(language_sdk))

NameError: name 'load_pickle' is not defined

In [16]:
# Load the humor label dictionary and display it as the cell output.
labels = load_pickle('/home/aobolens/urfunny/humor_label_sdk.pkl')
labels


{1: 1,
 3: 1,
 4: 1,
 5: 1,
 7: 1,
 9: 1,
 10: 1,
 11: 1,
 12: 0,
 13: 0,
 15: 0,
 16: 0,
 17: 0,
 18: 0,
 19: 0,
 20: 0,
 22: 0,
 24: 1,
 25: 1,
 26: 1,
 27: 1,
 29: 1,
 30: 1,
 31: 1,
 32: 0,
 33: 0,
 35: 0,
 36: 0,
 38: 0,
 39: 0,
 40: 1,
 41: 0,
 43: 0,
 45: 1,
 46: 1,
 48: 0,
 49: 0,
 50: 1,
 51: 1,
 52: 1,
 53: 1,
 54: 0,
 55: 0,
 56: 0,
 57: 0,
 59: 1,
 61: 1,
 62: 1,
 63: 1,
 66: 0,
 68: 0,
 69: 0,
 70: 1,
 71: 1,
 72: 1,
 75: 1,
 76: 1,
 77: 1,
 78: 1,
 82: 1,
 83: 1,
 84: 0,
 85: 0,
 86: 0,
 87: 0,
 88: 0,
 89: 0,
 90: 0,
 91: 0,
 92: 0,
 94: 0,
 95: 0,
 96: 0,
 98: 1,
 99: 1,
 100: 1,
 101: 1,
 103: 1,
 104: 1,
 108: 0,
 109: 0,
 110: 0,
 112: 0,
 113: 0,
 114: 0,
 117: 0,
 118: 1,
 120: 1,
 121: 0,
 122: 0,
 123: 0,
 125: 1,
 127: 1,
 128: 1,
 129: 1,
 131: 0,
 133: 0,
 134: 0,
 135: 0,
 136: 1,
 137: 1,
 139: 1,
 140: 1,
 142: 1,
 144: 0,
 145: 0,
 146: 0,
 148: 0,
 151: 1,
 152: 1,
 153: 0,
 154: 0,
 156: 1,
 158: 1,
 159: 1,
 160: 1,
 162: 1,
 163: 1,
 165: 1,
 166: 1,
 

In [5]:
from mreserve.lowercase_encoder import get_encoder
# Token-id spans to inspect. Every row holds 15 integer ids; the trailing
# zeros look like padding -- TODO confirm against the encoder.
spans=[[229,318,1060,21,305,402,547,0,0,0,0,0
,0,0,0]
,[211,9861,23,2232,0,0,0,0,0,0,0,0
,0,0,0]
,[190,519,4772,337,207,0,0,0,0,0,0,0
,0,0,0]
,[3305,226,3482,23,210,666,0,0,0,0,0,0
,0,0,0]
,[5288,377,305,389,592,0,0,0,0,0,0,0
,0,0,0]
,[2898,21,215,230,587,210,1505,353,23,0,0,0
,0,0,0]
,[7416,402,569,207,210,328,210,0,0,0,0,0
,0,0,0]
,[1449,222,27234,21,546,5139,0,0,0,0,0,0
,0,0,0]
,[414,324,222,225,0,0,0,0,0,0,0,0
,0,0,0]
,[2257,23,694,0,0,0,0,0,0,0,0,0
,0,0,0]
,[481,1775,385,1297,460,469,0,0,0,0,0,0
,0,0,0]
,[425,4893,0,0,0,0,0,0,0,0,0,0
,0,0,0]
,[3482,4323,0,0,0,0,0,0,0,0,0,0
,0,0,0]
,[913,6036,18,304,3305,0,0,0,0,0,0,0
,0,0,0]
,[27612,3482,226,455,0,0,0,0,0,0,0,0
,0,0,0]
,[304,337,207,1286,23,246,1640,66,190,0,0,0
,0,0,0]
,[3509,190,1286,23,225,241,317,1449,0,0,0,0
,0,0,0]
,[2321,15248,21,0,0,0,0,0,0,0,0,0
,0,0,0]
,[1449,222,5028,241,233,0,0,0,0,0,0,0
,0,0,0]
,[6728,23,211,246,241,186,0,0,0,0,0,0
,0,0,0]
,[4042,8213,1416,1353,210,423,0,0,0,0,0,0
,0,0,0]
,[419,210,1105,481,0,0,0,0,0,0,0,0
,0,0,0]
,[5268,381,2286,0,0,0,0,0,0,0,0,0
,0,0,0]
,[3745,2505,226,874,695,0,0,0,0,0,0,0
,0,0,0]
,[270,266,190,1492,0,0,0,0,0,0,0,0
,0,0,0]
,[186,408,1263,363,0,0,0,0,0,0,0,0
,0,0,0]
,[405,0,0,0,0,0,0,0,0,0,0,0
,0,0,0]
,[215,230,273,0,0,0,0,0,0,0,0,0
,0,0,0]
,[190,491,321,7804,0,0,0,0,0,0,0,0
,0,0,0]
,[2059,16006,0,0,0,0,0,0,0,0,0,0
,0,0,0]
,[419,210,291,1726,215,230,1463,0,0,0,0,0
,0,0,0]
,[187,712,302,328,210,291,1443,13896,0,0,0,0
,0,0,0]
,[187,1027,210,186,1487,0,0,0,0,0,0,0
,0,0,0]
,[363,2988,67,210,0,0,0,0,0,0,0,0
,0,0,0]
,[329,278,955,378,190,7259,0,0,0,0,0,0
,0,0,0]
,[0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0]
,[210,2356,278,320,266,4579,0,0,0,0,0,0
,0,0,0]
,[67,0,0,0,0,0,0,0,0,0,0,0
,0,0,0]
,[320,0,0,0,0,0,0,0,0,0,0,0
,0,0,0]
,[363,2988,67,0,0,0,0,0,0,0,0,0
,0,0,0]
,[2988,67,1408,0,0,0,0,0,0,0,0,0
,0,0,0]
,[225,1088,0,0,0,0,0,0,0,0,0,0
,0,0,0]
,[725,440,1967,0,0,0,0,0,0,0,0,0
,0,0,0]
,[9540,210,666,302,238,320,0,0,0,0,0,0
,0,0,0]
,[716,2635,408,860,0,0,0,0,0,0,0,0
,0,0,0]
,[187,329,278,312,0,0,0,0,0,0,0,0
,0,0,0]
,[548,210,733,296,444,210,291,0,0,0,0,0
,0,0,0]
,[302,4162,211,0,0,0,0,0,0,0,0,0
,0,0,0]
,[6434,405,7658,222,0,0,0,0,0,0,0,0
,0,0,0]
,[190,28332,66,419,210,679,0,0,0,0,0,0
,0,0,0]
,[291,14980,281,210,0,0,0,0,0,0,0,0
,0,0,0]
,[215,297,1834,210,14224,378,215,0,0,0,0,0
,0,0,0]
,[6097,190,766,0,0,0,0,0,0,0,0,0
,0,0,0]
,[273,285,207,747,215,0,0,0,0,0,0,0
,0,0,0]
,[210,451,210,939,1562,0,0,0,0,0,0,0
,0,0,0]
,[363,2988,67,2595,230,367,0,0,0,0,0,0
,0,0,0]
,[12470,480,190,4054,5067,0,0,0,0,0,0,0
,0,0,0]
,[215,424,514,0,0,0,0,0,0,0,0,0
,0,0,0]
,[363,2988,67,186,0,0,0,0,0,0,0,0
,0,0,0]
,[187,795,451,528,267,3734,0,0,0,0,0,0
,0,0,0]
,[215,230,5059,0,0,0,0,0,0,0,0,0
,0,0,0]
,[6097,0,0,0,0,0,0,0,0,0,0,0
,0,0,0]]
# Decode each span back to text with the encoder and print one span per line.
encoder = get_encoder()
for span in spans:
    print(encoder.decode(span))

 so at six, they will start
 and computers. within
 the first generation going to
 speaking in english. you should
 faces because they think their
 scene, it's something you must see.
 tears will come to you if you
 form of schooling, where mainly
 them out of that
 situation. another
 down across from someone who has
 more families
 english language
 (laughs) are speaking
 fluent english in our
 are going to school. this’s the
 enter the school. that is one form
 towards employment,
 form of education is we
 teachers. and this is a
 biophysics you've
 when you sit down
 dying little boys
 fishing trip in life call
 do for the city
 a good idea bec
 no
 it's just
 the way your brow
 furrows
 when you're thinking it's cute
 i mean not if you're playing poker
 i ask you a question
 beckett you
 don't care about the victims

 you aren't here for justice
t
 here
 beckett
kett sometimes
 that makes
 everything make sense
 circumstances you should not be here
 most smart good looking
 i don'

In [2]:

from mreserve.lowercase_encoder import get_encoder
# One flat list of 45 token ids: three runs of 15, each zero-filled at the end.
spans = (
    [270, 266, 190, 1492] + [0] * 11
    + [186, 408, 1263, 363] + [0] * 11
    + [405] + [0] * 14
)

# Decode the whole flat sequence in a single call and print the text.
encoder = get_encoder()
print(encoder.decode(spans))

 do for the city a good idea bec no


In [7]:
# One dataset per fold. With these arguments the constructor looks for its
# pickle files relative to the current working directory.
train_set, dev_set, test_set = (HumorDataset(ids) for ids in (train, dev, test))

Unable to load data  openface_features_sdk.pkl : [Errno 2] No such file or directory: 'openface_features_sdk.pkl'


FileNotFoundError: [Errno 2] No such file or directory: 'openface_features_sdk.pkl'

In [11]:
# Load the fold definition and display it as the cell output.
# NOTE(review): this path has no "urfunny/" component, unlike the path used
# when `data_folds` is loaded earlier in the notebook -- confirm which is right.
folds = load_pickle("/home/aobolens/data_folds.pkl")
folds

{'train': [1,
  3,
  4,
  5,
  7,
  9,
  10,
  11,
  12,
  13,
  15,
  16,
  17,
  18,
  19,
  20,
  22,
  40,
  41,
  43,
  45,
  46,
  50,
  52,
  53,
  54,
  55,
  56,
  57,
  59,
  61,
  62,
  63,
  66,
  68,
  69,
  70,
  71,
  72,
  75,
  76,
  77,
  78,
  82,
  83,
  84,
  85,
  86,
  87,
  89,
  90,
  91,
  92,
  94,
  95,
  96,
  118,
  120,
  121,
  122,
  123,
  125,
  127,
  128,
  129,
  131,
  133,
  134,
  135,
  136,
  137,
  139,
  140,
  142,
  144,
  145,
  146,
  148,
  151,
  152,
  153,
  154,
  156,
  158,
  159,
  160,
  162,
  163,
  165,
  166,
  167,
  168,
  170,
  171,
  173,
  174,
  176,
  177,
  196,
  197,
  198,
  199,
  200,
  201,
  202,
  204,
  205,
  207,
  208,
  209,
  210,
  212,
  213,
  214,
  215,
  216,
  217,
  218,
  220,
  221,
  223,
  224,
  225,
  227,
  228,
  229,
  232,
  235,
  237,
  239,
  240,
  242,
  245,
  248,
  250,
  252,
  253,
  254,
  255,
  256,
  259,
  260,
  262,
  263,
  265,
  269,
  279,
  280,
  281,
  282,
  2

In [6]:
# Mini-batch size for the train and dev loaders; the test loader yields single samples.
batch = 10
train_dataloader = DataLoader(dataset=train_set, batch_size=batch, shuffle=True)
dev_dataloader = DataLoader(dataset=dev_set, batch_size=batch, shuffle=True)
test_dataloader = DataLoader(dataset=test_set, batch_size=1, shuffle=True)

In [7]:
'''
x_p.shape=batch_size*maximum number of words in sentence * 752
x_c.shape = batch_size * maximum context length in #sentences * maximum sentence length in #words * 752
here 752 = word embedding (300) + covarep (81) + openface(371)  
'''

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Show shapes and labels for the first six training batches (indices 0 to 5).
for batch_idx, batch in enumerate(train_dataloader):
    x_p, x_c, y = (tensor.to(device) for tensor in batch)
    print("*********")
    print("punchline shape: ",x_p.shape)
    print("context shape: ",x_c.shape)
    print("humor labels: ",y)
    if batch_idx==5:
        break

*********
punchline shape:  torch.Size([10, 20, 752])
context shape:  torch.Size([10, 5, 20, 752])
humor labels:  tensor([[1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.]])
*********
punchline shape:  torch.Size([10, 20, 752])
context shape:  torch.Size([10, 5, 20, 752])
humor labels:  tensor([[1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.]])
*********
punchline shape:  torch.Size([10, 20, 752])
context shape:  torch.Size([10, 5, 20, 752])
humor labels:  tensor([[0.],
        [1.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [1.]])
*********
punchline shape:  torch.Size([10, 20, 752])
context shape:  torch.Size([10, 5, 20, 752])
humor labels:  tensor([[0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [