In [1]:
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import pandas as pd
import seaborn as sns
import datetime

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

In [3]:
ds = pd.read_csv('/home/exoper/Documents/data/trump_tw.csv')
ds.text.head()

  interactivity=interactivity, compiler=compiler, result=result)


0    Read a great interview with Donald Trump that ...
1    Congratulations to Evan Lysacek for being nomi...
2    I was on The View this morning. We talked abou...
3    Tomorrow night's episode of The Apprentice del...
4    Donald Trump Partners with TV1 on New Reality ...
Name: text, dtype: object

In [5]:
# Take all the text together

data = ' '.join([ix for ix in ds.text])
print (data[:1000])

Read a great interview with Donald Trump that appeared in The New York Times Magazine: http://tinyurl.com/qsx4o6 Congratulations to Evan Lysacek for being nominated SI sportsman of the year. He's a great guy, and he has my vote!  #EvanForSI I was on The View this morning. We talked about The Apprentice. Tonight's episode is a great one--tough, exciting and surprising. 10 pm/NBC Tomorrow night's episode of The Apprentice delivers excitement at QVC along with appearances by Isaac Mizrahi and Cathie Black. 10 pm on NBC Donald Trump Partners with TV1 on New Reality Series Entitled, Omarosa's Ultimate Merger: http://tinyurl.com/yk5m3lc I'll be appearing on Larry King Live for his final show, Thursday night at 9 p.m., CNN. Larry's been on TV for 25 years... I'll be on The Late Show with David Letterman tonight--be sure to tune in for a great show. 11:30 pm on CBS. Watch the Miss Universe competition LIVE from the Bahamas - Sunday, 8/23 @ 9pm (ET) on NBC: http://tinyurl.com/mrzad9 Watch video

In [7]:
print (set(data))
print (len(set(data)))

{'😜', '\\', '❌', 'ラ', 'ğ', '😄', 'ざ', '👗', '😉', '💙', '😱', '🙌', '😍', 'y', 'd', '歴', 'て', 'R', 'á', '\u200e', 'L', '-', 'J', '絆', 'ד', 'g', 'c', '⛳', '📈', '_', 'プ', '5', 'T', '😊', '💕', '📸', '»', 'ï', '☑', '👢', '🇰', '\r', '💃', '2', 'e', '💨', '🇨', 'n', '🌺', '✅', '間', '🇳', '🇲', 'と', '。', '🤖', '😎', '\U0010fc00', '統', 'B', '📷', 'p', 'る', '訪', '😰', 'Y', 'ン', 'w', '€', '💗', '😣', 'k', 'A', '´', 'C', '🌹', '`', '🏢', '🦃', 'D', '«', '揺', '☉', '😂', '盟', 'Ｒ', '的', 'ナ', '💤', 'ז', '☀', 'ま', '●', '💁', 'ご', '🇦', '🇴', 'ĺ', '🍻', '{', '.', '👆', '界', '"', '®', '*', '[', '👈', '🇺', '👎', '😔', 'o', '😳', 'x', '🚂', '👑', '、', '🇵', 'ジ', 'ú', '…', 'い', '示', 'U', '米', '8', '7', 'ō', '🍷', '⚡', '💞', '☝', 'O', '🌚', '😢', 'P', 'h', '1', '領', '🚨', '📉', 'r', '🎧', '🎈', '‼', '☆', '🌍', '功', '%', '🇮', '👋', '🏆', '&', 'é', '@', '\u200b', ':', '👔', '🎾', 'を', '同', 'è', '日', 'i', '$', '⚾', '✌', '😆', 'お', '👀', '😴', 'が', '😡', 'É', 'l', '➡', '成', ';', '違', '😓', 'ø', 'í', '‘', '🙅', 'ם', '#', '🎥', '😑', '}', '❤', '🇱', '💋', '🏼', '👌', '大', ']'

In [15]:
# Create Vocab
vocab = list(set(data))

i2c, c2i = {}, {}
#print(len(vocab))
for idx, chx in enumerate(vocab):
    i2c[idx] = chx
    c2i[chx] = idx


In [18]:
def get_onehot(x):
    # Take input a string and convert to one-hot encoding
    vec_size = len(c2i.keys())
    n_seq = len(x)
    data = np.zeros((1, n_seq, vec_size))
    
    # For each element in the list
    for ix in range(n_seq):
        curr_char = x[ix]
        oh_index = c2i[curr_char]
        # print ix, curr_char, oh_index
        data[:, ix, oh_index] = 1
    return data

print (get_onehot('this is my string').shape)

(1, 17, 355)


In [19]:
for ix in ds.text[:10]:
    print (get_onehot(ix).shape)

(1, 112, 355)
(1, 127, 355)
(1, 139, 355)
(1, 140, 355)
(1, 116, 355)
(1, 122, 355)
(1, 108, 355)
(1, 117, 355)
(1, 115, 355)
(1, 102, 355)


In [20]:
class CharNN(nn.Module):
    def __init__(self, in_shape=None, out_shape=None, hidden_shape=None):
        super(CharNN, self).__init__()
        self.in_shape = in_shape
        self.out_shape = out_shape
        self.hidden_shape = hidden_shape
        self.n_layers = 1
        
        self.rnn = nn.LSTM(
            input_size=self.in_shape,
            hidden_size=self.hidden_shape,
            num_layers=self.n_layers,
            batch_first=True
        )
        self.out = nn.Linear(self.hidden_shape, self.out_shape)
    
    def forward(self, x, h):
        r_out, h_state = self.rnn(x, h)
        
        outs = []
        for ix in range(r_out.size(1)):
            current_out = F.softmax(self.out(r_out[:, ix, :]))
            outs.append(current_out)
        return torch.stack(outs, dim=1), h_state
    
    def predict(self, char, h=None, top_k=None):
        if h is None:
            h = self.init_hidden(1, gpu=False)
        
        x = get_onehot(char)
        out, h = self.forward(torch.FloatTensor(x), h)
        
        p = out.data
        if top_k is None:
            top_ch = np.arange(self.out_shape)
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())
        return i2c[char], h
    
    def init_hidden(self, batch_size, gpu=False):
        if gpu:
            return (Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape).cuda()),
                    Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape)).cuda())
        return (Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape)),
                Variable(torch.zeros(self.n_layers, batch_size, self.hidden_shape)))

In [21]:
model = CharNN(in_shape=174, out_shape=174, hidden_shape=256)
model.cuda()
print (model)

    Found GPU0 GeForce 940MX which is of cuda capability 5.0.
    PyTorch no longer supports this GPU because it is too old.
    


CharNN(
  (rnn): LSTM(174, 256, batch_first=True)
  (out): Linear(in_features=256, out_features=174, bias=True)
)


In [22]:
# Load the weights
model.load_state_dict(torch.load('/home/shubham/all_projects/CB/Summer_2018/data/checkpoints/text_gen/model_256h_epoch_38.ckpt'))

FileNotFoundError: [Errno 2] No such file or directory: '/home/shubham/all_projects/CB/Summer_2018/data/checkpoints/text_gen/model_256h_epoch_38.ckpt'

In [14]:
# model.predict('a', top_k=20)[0]

In [23]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [26]:
# Set to train mode
# model.cuda()
model.train()
N = 5000

for epoch in range(50):
    total_loss = 0
    # For each sequence
    for qx in range(N):
        seqx = ds.text[qx]
        h_state = model.init_hidden(1)
        input_seq = seqx[:-1]
        target_seq = seqx[1:]
        
        x = Variable(torch.FloatTensor(get_onehot(input_seq)), requires_grad=True)# .cuda()
        y = Variable(torch.LongTensor(get_onehot(target_seq).argmax(2)))# .cuda()
        
        model.zero_grad()
        pred, h_state = model.forward(x, h_state)
        # print pred.squeeze().shape, y.shape
        loss = criterion(pred.squeeze(), y.squeeze())
        
        # optimizer.zero_grad()
        loss.backward()
        
        # gradient clipping to solve exploding/vanishing grads
        # clip = 5.0
        # nn.utils.clip_grad_norm(net.parameters(), clip)
        
        optimizer.step()
        total_loss += loss
        if qx%(N/5) == 0:
            print ('Loss: {} at Epoch: {} | Seq: {}'.format(loss, epoch, qx))
        
    print( "Overall Average Loss: {} at Epoch: {}".format(total_loss / float(N), epoch))
    
    # Save model checkpoints
    if epoch % 10 == 0:
        torch.save(model.state_dict(), "./data/checkpoints/text_gen/model_256h_epoch_{}.ckpt".format(epoch))

NameError: name 'fn' is not defined

In [37]:
sentence = 'o'
model.cpu()
h_s = model.init_hidden(1, gpu=False)
for ix in range(1000):
    ctx = sentence[-1]
    out, h = model.predict(ctx, h=h_s, top_k=100)
    h_s = (h[0].data, h[1].data)
    
    sentence += out
print sentence



or @Markettorial @Markettoristing @Marketanders @Markettorian and an amarion and an amarian and an amas and an amarinate and an amasting and an amarian and an amater and an amaring and an amas and an amarian and an amarical and an amas and an amasting and an amarinate and an amaster and an amasting and an amarian and an amappathing and an amappathing and an amarian and an amasting and an amapart and an amaring and an amasting and an amarian and an amas and an amasting and an amasping and and an amarian and an amas and an amasting and an amarinate and an amasting and an amarian and an amasting and an amarical and an amas and an amasting and an amasting and an amarian and an amarical and an amasting and an amasting and an amas and an amas and an amasting and an amarian and an amas and an amasping and and an amaring and an amasting and an amasting and an amasting and an amasting and an amarian and an amas and an amas and an amasting and an amas and an amas and an amasting and an amasting 