In [3]:
import torch
import numpy as np
from torch import nn, autograd, optim
from torch.autograd import Variable
import torch.nn.functional as F
import random

In [4]:
def read_data(path='/Users/kfirbar/Documents/research/group47/tweets-ny.txt'):
    """Load the corpus and return one lower-cased sentence per line.

    Each line of the file may hold several tab-separated columns; only the
    first column is kept. Leading/trailing whitespace of the whole file and
    of every kept sentence is removed.

    Args:
        path: location of the text file. Defaults to the path this notebook
            was originally run with, so ``read_data()`` behaves as before.

    Returns:
        A list of strings, one per line of the file.
    """
    # `with` guarantees the file handle is released even if reading fails.
    with open(path) as corpus_file:
        lines = corpus_file.read().strip().split('\n')
    english_lines = [l.split('\t')[0].lower().strip() for l in lines]
    return english_lines

# Load the corpus once at module level; the following cells read this `data` list.
data = read_data()

In [5]:
# Eyeball one random sentence from the corpus as a sanity check.
# The call form of print works on both Python 2 and Python 3.
print(random.choice(data))

i’m opening up a store for all vintage cozy &amp; designer clothing.


In [6]:
# Id used as the end-of-sequence target; Lang starts numbering real characters at 1.
EOS_TOKEN = 0
# Sentences longer than this many characters are skipped when the vocabulary
# and the training variables are built.
MAX_SEQ_LEN = 40

class Lang:
    """Character vocabulary with forward/backward lookups and occurrence counts.

    Attributes:
        char2id: maps a character to its integer id.
        id2char: maps an integer id back to its character.
        char2count: how many times each character has been indexed.
        n_chars: next id to hand out; starts at 1, so id 0 is never assigned.
    """

    def __init__(self):
        self.char2id, self.id2char, self.char2count = {}, {}, {}
        self.n_chars = 1

    def index_sentence(self, sentence):
        """Register every character of `sentence`, in order."""
        for symbol in sentence:
            self.index_char(symbol)

    def index_char(self, c):
        """Count one occurrence of `c`, assigning it a fresh id on first sight."""
        if c in self.char2id:
            self.char2count[c] += 1
            return

        fresh_id = self.n_chars
        self.char2id[c] = fresh_id
        self.id2char[fresh_id] = c
        self.char2count[c] = 1
        self.n_chars = fresh_id + 1
            
            
def prepare_data(data):
    """Build a Lang vocabulary from the sentences that fit in MAX_SEQ_LEN.

    Sentences longer than MAX_SEQ_LEN characters contribute nothing to the
    vocabulary. Returns the populated Lang instance.
    """
    vocab = Lang()
    kept = (text for text in data if not len(text) > MAX_SEQ_LEN)
    for text in kept:
        vocab.index_sentence(text)
    return vocab

In [7]:
# Global character vocabulary; sentence2variable and generate look ids up through it.
lang = prepare_data(data)

In [8]:
def sentence2variable(sentence):
    """Encode `sentence` as an (input, target) pair of column LongTensor Variables.

    The input holds the id of every character (looked up in the global
    `lang`); the target is the same id sequence shifted left by one position
    with EOS_TOKEN appended, so target[i] is the id that follows input[i].
    Raises KeyError if a character is missing from `lang.char2id`.
    """
    char_ids = [lang.char2id[symbol] for symbol in sentence]
    # Slice after appending EOS so an empty sentence yields an empty target too.
    next_ids = (char_ids + [EOS_TOKEN])[1:]

    def as_column(ids):
        return Variable(torch.LongTensor(ids).view(-1, 1))

    return as_column(char_ids), as_column(next_ids)

def data2variables(data):
    """Convert every sentence of at most MAX_SEQ_LEN characters to an (input, target) pair.

    Longer sentences are dropped; the order of the remaining ones is kept.
    """
    pairs = []
    for sentence in data:
        if len(sentence) > MAX_SEQ_LEN:
            continue
        pairs.append(sentence2variable(sentence))
    return pairs

# Pre-encode the corpus once; the training loop draws random pairs from this list.
data_variables = data2variables(data)

In [9]:
class TextGen(nn.Module):
    """Character-level language model: embedding -> LSTM -> linear projection.

    The model always works with a batch size of 1.

    Args:
        input_size: number of distinct input ids (rows of the embedding table).
        hidden_size: width of both the embedding and the LSTM state.
        output_size: number of scores produced per time step.
        n_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers):
        super(TextGen, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, n_layers)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, char_input, hidden):
        """Advance the model over `char_input` and return (scores, new hidden state).

        Args:
            char_input: tensor of character ids; its length along the first
                dimension is taken as the number of time steps.
            hidden: (h, c) LSTM state, e.g. the value from init_hidden().

        Returns:
            A tuple (output, hidden). `output` has one row of unnormalised
            scores per time step, i.e. shape (seq_len, output_size).
        """
        seq_len = len(char_input)
        embedded = self.embedding(char_input).view(seq_len, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        # Keep one row per time step. For a single step this equals the former
        # view(1, -1); for longer inputs the former reshape broke the projection.
        output = self.out(output.view(seq_len, -1))
        return output, hidden

    def init_hidden(self):
        """Return a fresh all-zero (h, c) state for a batch of size 1."""
        return (Variable(torch.zeros(self.n_layers, 1, self.hidden_size)),
                Variable(torch.zeros(self.n_layers, 1, self.hidden_size)))

In [10]:
# Model and optimiser configuration.
hidden_size = 800
# NOTE(review): n_layers is set to 6, but the TextGen call below passes a
# literal 1, so this value is not used in this cell - confirm the intended depth.
n_layers = 6

# The vocabulary size (lang.n_chars) is used for both the input and the output size.
model = TextGen(lang.n_chars, hidden_size, lang.n_chars, 1)

criterion = nn.CrossEntropyLoss()
learning_rate = 0.0001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [11]:
def train_seq(model, optimizer, criterion, input_var, target_var):
    """Run one optimisation step on a single sequence.

    The sequence is fed to the model one position at a time, the per-step
    losses are summed, gradients are clipped to a total norm of 5.0 and the
    optimiser is stepped once.

    Args:
        model: module exposing init_hidden() and callable as
            model(step_input, hidden) -> (scores, hidden).
        optimizer: optimiser over model.parameters().
        criterion: loss taking (scores, target) for one step.
        input_var: per-position inputs, indexed along the first dimension.
        target_var: per-position targets, aligned with input_var.

    Returns:
        The summed loss divided by the sequence length, as a Python float.
        An empty sequence returns 0.0 and leaves the model untouched.
    """
    seq_len = len(input_var.data)
    if seq_len == 0:
        # No loss terms would be produced: `loss` would stay the int 0, which
        # has no backward(), and the final division would be by zero.
        return 0.0

    optimizer.zero_grad()
    loss = 0
    hidden = model.init_hidden()
    for o in range(seq_len):
        output, hidden = model(input_var[o], hidden)
        loss += criterion(output.view(-1).unsqueeze(0), target_var[o])

    loss.backward()
    # Prefer the in-place API where it exists; fall back to the legacy name.
    clip = getattr(torch.nn.utils, 'clip_grad_norm_', None) or torch.nn.utils.clip_grad_norm
    clip(model.parameters(), 5.0)
    optimizer.step()
    # Indexing a 0-dim loss with [0] raises on current PyTorch; item() is the
    # supported scalar accessor. Releases without item() still use data[0].
    total = loss.item() if hasattr(loss, 'item') else loss.data[0]
    return total / seq_len

In [None]:
# Stochastic training: every "epoch" here is one update on a single randomly
# drawn (input, target) pair, not a pass over the whole corpus.
n_epochs = 20000
print_every = 100
loss = 0  # running sum of train_seq results since the last report
for e in range(1, n_epochs + 1):
    pair = random.choice(data_variables)
    input_var = pair[0]
    target_var = pair[1]
    loss += train_seq(model, optimizer, criterion, input_var, target_var)

    if e % print_every == 0:
        loss = loss / print_every
        # The call form of print works on both Python 2 and Python 3.
        print('Epoch %d Current Loss = %.4f' % (e, loss))
        loss = 0
    

Epoch 100 Current Loss = 3.9693
Epoch 200 Current Loss = 3.1655
Epoch 300 Current Loss = 2.9400
Epoch 400 Current Loss = 2.7679
Epoch 500 Current Loss = 2.6691
Epoch 600 Current Loss = 2.5352
Epoch 700 Current Loss = 2.6078
Epoch 800 Current Loss = 2.4188
Epoch 900 Current Loss = 2.5055
Epoch 1000 Current Loss = 2.3548
Epoch 1100 Current Loss = 2.2732
Epoch 1200 Current Loss = 2.1008
Epoch 1300 Current Loss = 2.2345
Epoch 1400 Current Loss = 2.2274
Epoch 1500 Current Loss = 2.3534
Epoch 1600 Current Loss = 2.2043
Epoch 1700 Current Loss = 2.2169


In [22]:
def generate(model, start_string, temperature, max_len):
    """Sample a continuation of `start_string` from `model`, one character at a time.

    The prompt is first run through the model to build up the hidden state.
    Characters are then sampled from the temperature-scaled output scores
    until EOS_TOKEN is drawn or ``max_len + 1`` characters have been added.

    Args:
        model: network exposing init_hidden() and callable as
            model(char_input, hidden) -> (scores, hidden).
        start_string: non-empty prompt; every character must be present in
            the global `lang` vocabulary.
        temperature: positive divisor applied to the scores before sampling;
            lower values make the choice greedier.
        max_len: number of characters sampled after the first one.

    Returns:
        The prompt followed by the sampled characters. The EOS marker itself
        is never appended.
    """
    hidden = model.init_hidden()
    start_var, _ = sentence2variable(start_string)
    # Warm up on every prompt character except the last, which seeds sampling.
    for i in range(len(start_string) - 1):
        _, hidden = model(start_var[i], hidden)

    text = start_string
    next_input = start_var[-1]
    # One unconditional first sample plus up to max_len more.
    for step in range(max(max_len, 0) + 1):
        out, hidden = model(next_input, hidden)
        logits = out.data.view(-1).div(temperature)
        # Subtracting the maximum leaves the sampling distribution unchanged
        # but keeps exp() finite at low temperatures.
        out_dist = (logits - logits.max()).exp()
        # int() gives a plain key for the dict lookup whether indexing the
        # sample yields a number or a 0-dim tensor.
        char_id = int(torch.multinomial(out_dist, 1)[0])
        # Check EOS on every sample, including the first: id2char has no
        # entry for it, so looking it up would raise KeyError.
        if char_id == EOS_TOKEN:
            return text
        new_c = lang.id2char[char_id]
        text += new_c
        next_input, _ = sentence2variable(new_c)
    return text

In [23]:
# Near-greedy sampling (temperature 0.01) from a fixed prompt.
# The call form of print works on both Python 2 and Python 3.
for i in range(10):
    print(generate(model, 'i hate ', 0.01, 50))

i hate lkkkkkkk
i hate lkkkkkkk
i hate lkkkkkkk
i hate lkkkkkkk
i hate lkkkkkkk
i hate lkkkkkkk
i hate lkkkkkkk
i hate lkkkkkkk
i hate lkkkkkkk
i hate lkkkkkkk


In [56]:
# More varied sampling (temperature 0.8) from a fixed prompt.
# The call form of print works on both Python 2 and Python 3.
for i in range(200):
    print(generate(model, 'i am at ', 0.8, 200))

i am at life.
i am at lights.
i am at will got and marking name
i am at the strip
i am at placom
i am at sich musnc
i am at lifgeting assours air this is a my shuts
i am at is mach y a get to me
i am at stunting
i am at will get to me
i am at dou this night prns
i am at clasing serving
i am at urs get on is america
i am at sunch 57
i am at will arrears
i am at is happening!
i am at live to mether.
i am at raing need on
i am at sich you to tere!!
i am at to got https://t.co/si4lizmnrf
i am at like tapl
i am at usget #897
i am at still
i am at ungels
i am at iscims souck a dj. #2017in4words
i am at suchos
i am at light
i am at life.
i am at using to you too https://t.co/dtpuajpzn
i am at life.
i am at such able
i am at clasic so leadust
i am at ussed but
i am at s lightt!!!!
i am at sich . #prns
i am at ust a be #prnc
i am at lighttra
i am at swith have take.
i am at luck https://t.co/az5vmnmfgl
i am at uch my get
i am at shit that
i am at is me pacito
i am at still dousing everyone is f