In [150]:
import json
import glob
import re
import os
import time
import math

import torch
import torch.nn as nn
from torch.utils import data

### Getting the data

Twitter's API has a limit of 3200 tweets.

https://github.com/bpb27/trump_tweet_data_archive/archive/master.zip

### TODO: 

- Auto split into train/test/valid set. https://gist.github.com/Fuchai/12f2321e6c8fa53058f5eb23aeddb6ab
- Glob over zipped file and do the decompression.
- Read over all json files.
- Validate and check if futher pre-processing for tweets is needed for better results.

In [177]:
max_len = 0 # max length of tweets
tokens = []
tweets_count = 0 # count of tweets
output_file = 'clean.txt'
seq_len = 15 # this is the average length of all tweets
PAD_TOKEN = '<PAD>'
SOS_TOKEN = '<SOS>'
EOS_TOKEN = '<EOS>'

batch_size = 100
eval_batch_size = 20
bptt = 30

embed_size = 10
hidden_size = 10
num_layers = 2
clips = 0.25 #? what's a good one?
log_interval = 20
num_epoches = 10
dropout = 0.2
lr = 20

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data processing

In [168]:
max_len = 0
total_size = 0

clean_file = 'clean.txt'
full_dataset = []

file_list = glob.glob('./data/*.json')
with open(clean_file, 'w', encoding='utf8') as out_file:
    for file in file_list:
        with open(file, 'r', encoding='utf8') as in_file:
            f_json = json.load(in_file)
            for i, resp in enumerate(f_json):
                if not resp['is_retweet']:
                    line = re.sub('(http|www|@|#)\S+', '', resp['text'])
                    line = re.sub('[^\w\s]', '', line)
                    total_size += 1
                    out_file.write(line.lower() + "\n")
                    full_dataset.append(line.lower())

print('total_size', total_size)

train_size = int(0.9 * total_size)
val_size = int(0.05 * total_size)
test_size = total_size - train_size - val_size

train_set, val_set, test_set = torch.utils.data.random_split(full_dataset, [train_size, test_size, val_size])

print('train size: {}, val size: {}, test size: {}'.format(len(train_set), len(val_set), len(test_set)))

total_size 32886
train size: 29597, val size: 1645, test size: 1644


# Data model

In [171]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)


class Corpus(object):
    def __init__(self, train_set, val_set, test_set):
        self.dictionary = Dictionary()
        self.train_set = self.tokenize(train_set)
        self.valid_set = self.tokenize(val_set)
        self.test_set = self.tokenize(test_set)

    def tokenize(self, dataset):
        token_idx = 0
        for line in dataset:
            tokens = line.split() + [EOS_TOKEN]
            token_idx += len(tokens)
            for token in tokens:
                self.dictionary.add_word(token)

        ids = torch.LongTensor(token_idx)
        token_idx = 0
        for line in dataset:
            tokens = line.split() + [EOS_TOKEN]
            for token in tokens:
                ids[token_idx] = self.dictionary.word2idx[token]
                token_idx += 1
        return ids

In [176]:
corpus = Corpus(train_set, val_set, test_set)
print('train size', corpus.train_set.size())

def divide_to_batch(data, batch_size):
    num_batch = data.size(0) // batch_size
    data = data.narrow(0, 0, num_batch * batch_size)
    data = data.view(batch_size, -1).t().contiguous()
    return data.to(device)

train_data = divide_to_batch(corpus.train_set, batch_size)
print('train data size', train_data.size())
val_data = divide_to_batch(corpus.valid_set, eval_batch_size)
test_data = divide_to_batch(corpus.test_set, eval_batch_size)

num_tokens = len(corpus.dictionary)


def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

train size torch.Size([479429])
train data size torch.Size([4794, 100])


In [127]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, 
                 num_layers, dropout):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.init_weights()
    
    def init_weights(self):
        self.embed.weight.data.uniform_(-0.1, 0.1)
        self.linear.bias.data.zero_()
        self.linear.weight.data.uniform_(-0.1, 0.1)

    # ???
    def forward(self, x, hidden):
        embed = self.embed(x)
        x = self.dropout(embed)
        out, hidden = self.lstm(x, hidden)
        out = self.dropout(out)
        decoded = self.linear(out.view(out.size(0)*out.size(1), out.size(2)))
        return decoded.view(out.size(0), out.size(1), decoded.size(1)), hidden
    
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()) #???
        return (weight.new_zeros(self.num_layers, batch_size, self.hidden_size),
                weight.new_zeros(self.num_layers, batch_size, self.hidden_size))

In [144]:
vocab_size = len(corpus.dictionary)


model = RNN(vocab_size, embed_size, hidden_size, 
                  num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            output, hidden = model(data, hidden)
            output_flat = output.view(-1, ntokens)
            total_loss += len(data) * criterion(output_flat, targets).item()
            hidden = repackage_hidden(hidden)
    return total_loss / len(data_source)

def train():
    model.train()
    total_loss = 0
    start_time = time.time()
    hidden = model.init_hidden(batch_size)
    
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        out, hidden = model(data, hidden)
        loss = criterion(out.view(-1, vocab_size), targets)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clips)
        optimizer.step()
        
        #for p in model.parameters():
        #   p.data.add_(-lr, p.grad.data)
        total_loss += loss.item()
        if True:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // bptt, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

best_val_loss = None
            
for epoch in range(1, num_epoches+1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
            'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                       val_loss, math.exp(val_loss)))
    
    if not best_val_loss or val_loss < best_val_loss:
        with open('model.pt', 'wb') as f:
            torch.save(model, f)
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        lr /= 4.0
        
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

| epoch   1 |     0/    1 batches | lr 20.00 | ms/batch  0.93 | loss  0.30 | ppl     1.35
| epoch   1 |     1/    1 batches | lr 20.00 | ms/batch  0.49 | loss  0.30 | ppl     1.35
-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  0.04s | valid loss  5.86 | valid ppl   351.62
| epoch   2 |     0/    1 batches | lr 20.00 | ms/batch  0.94 | loss  0.30 | ppl     1.35
| epoch   2 |     1/    1 batches | lr 20.00 | ms/batch  0.48 | loss  0.30 | ppl     1.35
-----------------------------------------------------------------------------------------
| end of epoch   2 | time:  0.05s | valid loss  5.86 | valid ppl   350.60
| epoch   3 |     0/    1 batches | lr 20.00 | ms/batch  1.01 | loss  0.30 | ppl     1.34
| epoch   3 |     1/    1 batches | lr 20.00 | ms/batch  0.54 | loss  0.30 | ppl     1.34
-----------------------------------------------------------------------------------------
| end of epoch   3 | time:  0.05s | valid 

  "type " + obj.__name__ + ". It won't be checked "


| epoch   5 |     0/    1 batches | lr 20.00 | ms/batch  0.92 | loss  0.30 | ppl     1.34
| epoch   5 |     1/    1 batches | lr 20.00 | ms/batch  0.52 | loss  0.30 | ppl     1.34
-----------------------------------------------------------------------------------------
| end of epoch   5 | time:  0.04s | valid loss  5.85 | valid ppl   347.38
| epoch   6 |     0/    1 batches | lr 20.00 | ms/batch  1.13 | loss  0.30 | ppl     1.34
| epoch   6 |     1/    1 batches | lr 20.00 | ms/batch  0.66 | loss  0.30 | ppl     1.34
-----------------------------------------------------------------------------------------
| end of epoch   6 | time:  0.05s | valid loss  5.85 | valid ppl   346.22
| epoch   7 |     0/    1 batches | lr 20.00 | ms/batch  1.32 | loss  0.30 | ppl     1.34
| epoch   7 |     1/    1 batches | lr 20.00 | ms/batch  0.48 | loss  0.30 | ppl     1.34
-----------------------------------------------------------------------------------------
| end of epoch   7 | time:  0.05s | valid 

In [145]:
temperature = 1.0
num_words = 100

with open('model.pt', 'rb') as f:
    model = torch.load(f).to(device)
model.eval()

# corpus = data.Corpus(args.data)
num_tokens = len(corpus.dictionary)
hidden = model.init_hidden(1)
input = torch.randint(num_tokens, (1, 1), dtype=torch.long).to(device)

with open('output.txt', 'w') as outf:
    with torch.no_grad():  # no tracking history
        for i in range(num_words):
            output, hidden = model(input, hidden)
            word_weights = output.squeeze().div(temperature).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            input.fill_(word_idx)
            word = corpus.dictionary.idx2word[word_idx]

            outf.write(word + ('\n' if i % 20 == 19 else ' '))

            if (i + 1) % log_interval == 0:
                print('| Generated {}/{} words'.format(i+1, num_words))

| Generated 20/100 words
| Generated 40/100 words
| Generated 60/100 words
| Generated 80/100 words
| Generated 100/100 words
