In [210]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data.sampler import BatchSampler
from torch.autograd import Variable

In [211]:
reddit_jokes = pd.read_json('./joke_dataset/reddit_jokes.json')
reddit_jokes['body'] = reddit_jokes['title'] + ' \n ' + reddit_jokes['body']
del reddit_jokes['title']
reddit_jokes.head(2)

Unnamed: 0,body,id,score
0,I hate how you cant even say black paint anymo...,5tz52q,1
1,What's the difference between a Jew in Nazi Ge...,5tz4dd,0


In [212]:
reddit_jokes['body'].iloc[0]

u'I hate how you cant even say black paint anymore \n Now I have to say "Leroy can you please paint the fence?"'

In [213]:
jokes = reddit_jokes['body']
# Remove empty texts
jokes = jokes[~(jokes=='')]

# Ensure no nulls and no empty
assert sum(jokes=='')==0
assert sum(jokes.isnull())==0

In [214]:
# Lowercase
jokes = jokes.apply(lambda x: x.lower())

In [215]:
import random
for i in range(5):
    print(random.choice(list(jokes)))
    print('-----')

a penny lies below the cliff where a jew and black man jumps from, who wins? 
 we don't have enough information to conclude who's likely to win
-----
where did hitler hide his armies? 
 in his sleevies!
-----
what would the name be of a magician duo containing a chicken and a deaf woman? 
 hen and keller. 
-----
what is hitler's least favorite month? 
 jew-ly

edit: how about jan-jew-ary, or jewne?
-----
why did the chicken cross the road? 
 because that two-timing chicken head just couldn't resist that outside cock.  these chicks ain't loyal. the road trusted her to be faithful but in the end she just couldn't do it. that's ok though, because the road knows that karma is real. it will ensure that that chicken never crosses this road again.
-----


In [216]:
print len(jokes)
jokes = jokes[:5000]

194553


In [217]:
print len(jokes)

5000


In [218]:
y = np.zeros(len(jokes))

In [219]:
X_train_valid, X_test, y_train_valid, y_valid = train_test_split(jokes, y,test_size=0.2)

In [220]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid,test_size=0.2)

In [221]:
X_train.to_excel('./joke_dataset/train_dataset.xlsx',index=False)
X_test.to_excel('./joke_dataset/test_dataset.xlsx',index=False)
X_valid.to_excel('./joke_dataset/valid_dataset.xlsx',index=False)

---------------------------

In [222]:
import string

# Create token_dict based on ascii letters
all_letters = string.ascii_letters[0:26] + """ ".,;'-+=?!$%():\n"""
token_dict = {token:k for token,k in zip(all_letters, range(1,len(all_letters)+1))}

end_token = '<end>'
pad_token = '<pad>'

token_dict[pad_token] = 0
token_dict[end_token] = len(token_dict)

decoder_dict = {token_dict[k]:k for k in token_dict.keys()}
tokens_count = len(token_dict)
assert len(decoder_dict)==len(decoder_dict)
tokens_count

45

In [223]:
id_word = {}
for i,j in token_dict.iteritems():
    id_word[j]=i

In [224]:
def text_encoder(text):
    return [token_dict[token] for token in text if token in token_dict] + [token_dict['<end>']]

In [225]:
text_encoder("i am")

[9, 27, 1, 13, 44]

In [226]:
class CustomDataset():
    
    def __init__(self,filepath):
        self.data = pd.read_excel(filepath)
        self.texts = self.data.iloc[:,0]
        
    def __getitem__(self,index):
        data = self.texts[index]
        encoded_data = text_encoder(data)
        text = np.array(encoded_data[:-1])
        label = np.array(encoded_data[1:])
        return text, label
        
    def __len__(self):
        return (self.data.shape[0])

In [227]:
def my_collate(batch):
    data = [item[0] for item in batch]
    target = [item[1] for item in batch]
    #target = torch.LongTensor(target)
    return [data, target]

In [228]:
train_loader = DataLoader(CustomDataset('./joke_dataset/train_dataset.xlsx'),
                         batch_size=10,
                        shuffle=False,
                         collate_fn=my_collate)
valid_loader = DataLoader(CustomDataset('./joke_dataset/valid_dataset.xlsx'),
                         batch_size=10,
                         shuffle=False,
                         collate_fn=my_collate)

test_loader = DataLoader(CustomDataset('./joke_dataset/test_dataset.xlsx'),
                         batch_size=10,
                         shuffle=False,collate_fn=my_collate)

In [229]:
for i,j in valid_loader:
    print len(i)
    break

10


In [230]:
data_loaders = {"train": train_loader, "val": valid_loader}
data_lengths = {"train": train_loader.dataset.data.shape[0], "val": train_loader.dataset.data.shape[0]}

In [231]:
class LanguageModel(nn.Module):
    
    def __init__(self,tokens_count,embedding_dimension):
        super(LanguageModel,self).__init__()
        self.embedding = nn.Embedding(tokens_count, embedding_dimension)
        self.lstm = nn.LSTM(embedding_dimension,100,batch_first = True)
        self.linear = nn.Linear(100,tokens_count)
        
    def forward(self,x,seq_length):
        embedded = self.embedding(x)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, seq_length, batch_first=True)
        packed_output, (ht, ct) = self.lstm(packed_embedded)
        lstm_output, length = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        output = self.linear(lstm_output)
        return output

In [233]:
languagemodel = LanguageModel(tokens_count,100)

In [234]:
languagemodel.parameters

<bound method LanguageModel.parameters of LanguageModel(
  (embedding): Embedding(45, 100)
  (lstm): LSTM(100, 100, batch_first=True)
  (linear): Linear(in_features=100, out_features=45, bias=True)
)>

In [235]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(languagemodel.parameters(), lr=0.001)

In [236]:
def get_lengths(v):
    return np.array([i for i in map(len, v)])

def pad(v):
    lens = np.array([len(item) for item in v])
    mask = lens[:,None] > np.arange(lens.max())
    out = np.zeros(mask.shape,dtype=int)
    out[mask] = np.concatenate(v)
    return out

In [237]:
num_epoch = 15
train_losses = []
valid_losses = []

for epoch in range( num_epoch):
    
    print('Epoch {}/{}'.format(epoch, num_epoch- 1))
    print('-' * 10)
    
    # Each epoch has a training and validation phase
    for phase in ['train', 'val']:
        if phase == 'train':
            languagemodel.train() # Set model to training mode
        else:
            languagemodel.eval() # Set model to evaluate mode
        
        running_loss = 0.0
        # Iterate over data.
        
        for i, (texts,labels) in enumerate(data_loaders[phase]):
            
            texts.sort(key=len, reverse=True)
            labels.sort(key=len, reverse=True)
            
            texts_pad = pad(texts)
            labels_pad = pad(labels)
            # torch can only train on Variable, so convert them to Variable
            texts_pad = Variable(torch.LongTensor(texts_pad))
            labels_pad = Variable(torch.LongTensor(labels_pad))

            outputs = languagemodel(texts_pad,get_lengths(texts)) # Forward pass: compute the output class given a image

            #print outputs.view(-1,outputs.size(2)).size()
            #print outputs.contiguous().view(-1,tokens_count).size()

            #print labels_pad.view(-1).size()
            #print torch.LongTensor(pad(labels).flatten()).size()

            loss = criterion(outputs.view(-1,outputs.size(2)),labels_pad.view(-1)) # Compute the loss: difference between the output class and the pre-given label
            #print loss

            optimizer.zero_grad() # clear gradients for next train
            if phase == 'train':

                loss.backward() # backpropagation, compute gradients
                optimizer.step() # apply gradients  and update the weights of hidden nodes
                
            running_loss += loss.data * texts_pad.size(0)
            
            if phase == 'train':
                if (i+1) % 5 == 0 :
                    print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' %(epoch+1, num_epoch, i+1, data_lengths[phase]//10, loss.data))
                    
        epoch_loss = running_loss / data_lengths[phase]
        #print epoch_loss
        if phase =='train':
            train_losses.append(epoch_loss)
        if phase =='val':
            valid_losses.append(epoch_loss)
            
        print('Epoch [{}/{}]{} Loss: {:.4f}'.format(epoch+1, num_epoch,phase, epoch_loss))    

Epoch 0/14
----------
Epoch [1/15], Step [5/320], Loss: 3.7224
Epoch [1/15], Step [10/320], Loss: 3.6737
Epoch [1/15], Step [15/320], Loss: 3.6117
Epoch [1/15], Step [20/320], Loss: 3.3702
Epoch [1/15], Step [25/320], Loss: 3.5598
Epoch [1/15], Step [30/320], Loss: 3.5075
Epoch [1/15], Step [35/320], Loss: 3.4607
Epoch [1/15], Step [40/320], Loss: 3.4033
Epoch [1/15], Step [45/320], Loss: 3.4864
Epoch [1/15], Step [50/320], Loss: 3.5094
Epoch [1/15], Step [55/320], Loss: 3.4248
Epoch [1/15], Step [60/320], Loss: 3.3937
Epoch [1/15], Step [65/320], Loss: 3.1433
Epoch [1/15], Step [70/320], Loss: 2.8606
Epoch [1/15], Step [75/320], Loss: 3.4746
Epoch [1/15], Step [80/320], Loss: 3.0530
Epoch [1/15], Step [85/320], Loss: 3.4350
Epoch [1/15], Step [90/320], Loss: 3.3687
Epoch [1/15], Step [95/320], Loss: 3.0790
Epoch [1/15], Step [100/320], Loss: 3.1513
Epoch [1/15], Step [105/320], Loss: 3.2930
Epoch [1/15], Step [110/320], Loss: 3.3498
Epoch [1/15], Step [115/320], Loss: 3.0772
Epoch [1/

Epoch [3/15], Step [305/320], Loss: 1.9877
Epoch [3/15], Step [310/320], Loss: 2.0390
Epoch [3/15], Step [315/320], Loss: 2.0764
Epoch [3/15], Step [320/320], Loss: 2.0308
Epoch [3/15]train Loss: 2.1952
Epoch [3/15]val Loss: 0.5004
Epoch 3/14
----------
Epoch [4/15], Step [5/320], Loss: 1.9920
Epoch [4/15], Step [10/320], Loss: 1.9809
Epoch [4/15], Step [15/320], Loss: 1.9838
Epoch [4/15], Step [20/320], Loss: 2.0012
Epoch [4/15], Step [25/320], Loss: 1.9743
Epoch [4/15], Step [30/320], Loss: 1.9933
Epoch [4/15], Step [35/320], Loss: 1.9727
Epoch [4/15], Step [40/320], Loss: 1.9793
Epoch [4/15], Step [45/320], Loss: 1.9563
Epoch [4/15], Step [50/320], Loss: 1.9351
Epoch [4/15], Step [55/320], Loss: 1.9354
Epoch [4/15], Step [60/320], Loss: 1.9280
Epoch [4/15], Step [65/320], Loss: 1.9178
Epoch [4/15], Step [70/320], Loss: 1.8474
Epoch [4/15], Step [75/320], Loss: 1.9058
Epoch [4/15], Step [80/320], Loss: 1.8875
Epoch [4/15], Step [85/320], Loss: 1.8949
Epoch [4/15], Step [90/320], Loss

Epoch [6/15], Step [280/320], Loss: 1.2805
Epoch [6/15], Step [285/320], Loss: 1.0767
Epoch [6/15], Step [290/320], Loss: 1.0808
Epoch [6/15], Step [295/320], Loss: 1.0842
Epoch [6/15], Step [300/320], Loss: 1.2008
Epoch [6/15], Step [305/320], Loss: 1.1462
Epoch [6/15], Step [310/320], Loss: 1.0442
Epoch [6/15], Step [315/320], Loss: 1.3442
Epoch [6/15], Step [320/320], Loss: 1.0361
Epoch [6/15]train Loss: 1.2431
Epoch [6/15]val Loss: 0.2825
Epoch 6/14
----------
Epoch [7/15], Step [5/320], Loss: 1.0791
Epoch [7/15], Step [10/320], Loss: 1.0614
Epoch [7/15], Step [15/320], Loss: 1.0520
Epoch [7/15], Step [20/320], Loss: 1.4460
Epoch [7/15], Step [25/320], Loss: 1.0019
Epoch [7/15], Step [30/320], Loss: 1.1090
Epoch [7/15], Step [35/320], Loss: 1.0990
Epoch [7/15], Step [40/320], Loss: 1.1837
Epoch [7/15], Step [45/320], Loss: 1.0196
Epoch [7/15], Step [50/320], Loss: 0.9385
Epoch [7/15], Step [55/320], Loss: 1.0274
Epoch [7/15], Step [60/320], Loss: 1.0271
Epoch [7/15], Step [65/320],

Epoch [9/15], Step [255/320], Loss: 1.2377
Epoch [9/15], Step [260/320], Loss: 0.7875
Epoch [9/15], Step [265/320], Loss: 0.7317
Epoch [9/15], Step [270/320], Loss: 1.1005
Epoch [9/15], Step [275/320], Loss: 0.7702
Epoch [9/15], Step [280/320], Loss: 0.9383
Epoch [9/15], Step [285/320], Loss: 0.6639
Epoch [9/15], Step [290/320], Loss: 0.6692
Epoch [9/15], Step [295/320], Loss: 0.6650
Epoch [9/15], Step [300/320], Loss: 0.8406
Epoch [9/15], Step [305/320], Loss: 0.7769
Epoch [9/15], Step [310/320], Loss: 0.6164
Epoch [9/15], Step [315/320], Loss: 1.0196
Epoch [9/15], Step [320/320], Loss: 0.6088
Epoch [9/15]train Loss: 0.8038
Epoch [9/15]val Loss: 0.1890
Epoch 9/14
----------
Epoch [10/15], Step [5/320], Loss: 0.6873
Epoch [10/15], Step [10/320], Loss: 0.6681
Epoch [10/15], Step [15/320], Loss: 0.6535
Epoch [10/15], Step [20/320], Loss: 1.1946
Epoch [10/15], Step [25/320], Loss: 0.5889
Epoch [10/15], Step [30/320], Loss: 0.7342
Epoch [10/15], Step [35/320], Loss: 0.7259
Epoch [10/15], S

Epoch [12/15], Step [210/320], Loss: 0.5191
Epoch [12/15], Step [215/320], Loss: 0.4575
Epoch [12/15], Step [220/320], Loss: 0.4872
Epoch [12/15], Step [225/320], Loss: 0.7335
Epoch [12/15], Step [230/320], Loss: 0.6446
Epoch [12/15], Step [235/320], Loss: 0.5123
Epoch [12/15], Step [240/320], Loss: 0.4746
Epoch [12/15], Step [245/320], Loss: 0.6015
Epoch [12/15], Step [250/320], Loss: 0.5376
Epoch [12/15], Step [255/320], Loss: 1.1392
Epoch [12/15], Step [260/320], Loss: 0.6377
Epoch [12/15], Step [265/320], Loss: 0.5765
Epoch [12/15], Step [270/320], Loss: 0.9862
Epoch [12/15], Step [275/320], Loss: 0.6219
Epoch [12/15], Step [280/320], Loss: 0.8073
Epoch [12/15], Step [285/320], Loss: 0.5094
Epoch [12/15], Step [290/320], Loss: 0.5147
Epoch [12/15], Step [295/320], Loss: 0.5080
Epoch [12/15], Step [300/320], Loss: 0.7012
Epoch [12/15], Step [305/320], Loss: 0.6342
Epoch [12/15], Step [310/320], Loss: 0.4576
Epoch [12/15], Step [315/320], Loss: 0.8888
Epoch [12/15], Step [320/320], L

Epoch [15/15], Step [160/320], Loss: 0.7214
Epoch [15/15], Step [165/320], Loss: 0.6914
Epoch [15/15], Step [170/320], Loss: 0.3198
Epoch [15/15], Step [175/320], Loss: 0.4859
Epoch [15/15], Step [180/320], Loss: 0.3833
Epoch [15/15], Step [185/320], Loss: 0.8095
Epoch [15/15], Step [190/320], Loss: 0.4003
Epoch [15/15], Step [195/320], Loss: 0.9343
Epoch [15/15], Step [200/320], Loss: 0.5295
Epoch [15/15], Step [205/320], Loss: 0.7488
Epoch [15/15], Step [210/320], Loss: 0.4471
Epoch [15/15], Step [215/320], Loss: 0.3824
Epoch [15/15], Step [220/320], Loss: 0.4146
Epoch [15/15], Step [225/320], Loss: 0.6697
Epoch [15/15], Step [230/320], Loss: 0.5784
Epoch [15/15], Step [235/320], Loss: 0.4413
Epoch [15/15], Step [240/320], Loss: 0.4035
Epoch [15/15], Step [245/320], Loss: 0.5332
Epoch [15/15], Step [250/320], Loss: 0.4650
Epoch [15/15], Step [255/320], Loss: 1.0796
Epoch [15/15], Step [260/320], Loss: 0.5722
Epoch [15/15], Step [265/320], Loss: 0.5092
Epoch [15/15], Step [270/320], L

-----------

## Test

In [242]:
enc = text_encoder("what's the differnc")
print enc
enc_pad = pad(np.array([enc]))
#print enc_pad
tensor_pad = Variable(torch.LongTensor(enc_pad))
#print tensor_pad
out = languagemodel(tensor_pad,get_lengths(np.array([enc])))
#int(output.cpu().data.topk(1,dim=2)[1].numpy()[0])
for i in range(out.size()[1]):
    t = out[:,i,:]
    _,predicted = torch.max(t.data,1)
    #print predicted
    print id_word[predicted.numpy()[0]]

[23, 8, 1, 20, 32, 19, 27, 20, 8, 5, 27, 4, 9, 6, 6, 5, 18, 14, 3, 44]
h
a
t
 
s
 
t
h
e
 
d
i
f
f
e
r
e
e
e
 


In [161]:
out.cpu().data.topk(1,dim=2)[1].numpy()[0]

array([[ 8],
       [ 1],
       [20],
       [27],
       [ 1]])

In [243]:
def generate_sequence(start_string, max_length):
    enc = text_encoder(start_string)
    enc_pad = pad(np.array([enc]))
    tensor_pad = Variable(torch.LongTensor(enc_pad))
    result = start_string
    
    while enc[0] != token_dict['<end>']:
        out = languagemodel(tensor_pad,get_lengths(np.array([enc])))
        enc = [int(out.cpu().data.topk(1,dim=2)[1].numpy()[0][-1])]
        result+=id_word[enc[0]]
        #print result
        if len(result)>max_length:
            break
        enc = text_encoder(result)
        enc_pad = pad(np.array([enc]))
        tensor_pad = Variable(torch.LongTensor(enc_pad))
    print result

In [248]:
generate_sequence("what's the difference between a jew", 50)

what's the difference between a jews epe eeee eeee 


In [165]:
input_encoded = []
for char in "start string":
    input_encoded.append(token_dict[char])

In [166]:
input_encoded

[19, 20, 1, 18, 20, 27, 19, 20, 18, 9, 14, 7]

In [170]:
enc

[23, 8, 1, 20, 44]