<a href="https://colab.research.google.com/github/ronenbendavid/IDC_NLP/blob/master/Language_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from random import sample
import random
from google.colab import drive
import json

In [0]:
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
def read_data(path):
    lines = open(path).read().strip().split('\n')
    texts = []
    for line in lines:
      try:
        tweet = json.loads(line)
        texts.append(tweet['text'].lower().strip())
      except:
        pass
    return texts
data = read_data('/content/drive/My Drive/Colab Notebooks/ML course/data/data.json')

In [0]:
print(random.choice(data))
print(len(data))

we took jesse ice skating today for christmas in manhattan.  we didn't intend to go for 4 hours but that's how long… https://t.co/oncfeixl3w
3723


In [0]:
EOS_TOKEN = 0
MAX_SEQ_LEN = 100

class Vocab:
    def __init__(self):
        self.char2id = {}
        self.id2char = {}
        self.n_chars = 1
        
    def index_sentence(self, sentence):
      indexes = [self.index_char(c) for c in sentence]
      indexes.append(EOS_TOKEN)
      return indexes
    
    def index_char(self, c):
        if c not in self.char2id:
            self.char2id[c] = self.n_chars
            self.id2char[self.n_chars] = c
            self.n_chars += 1
        return self.char2id[c]
            
            
def prepare_data(data):
    vocab = Vocab()
    data_sequences = []
    for sentence in data:
        if len(sentence) <= MAX_SEQ_LEN:
            indexes = vocab.index_sentence(sentence)
            data_sequences.append(indexes)
            
    return data_sequences, vocab

In [0]:
data_sequences, vocab = prepare_data(data)

In [0]:
class TextGen(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers):
        super(TextGen, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, n_layers)
        self.out = nn.Linear(hidden_size, output_size)
    
    def forward(self, char_input, hidden):
        embedded = self.embedding(char_input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        output = self.out(output.view(1, -1))
        return output, hidden
    
    def init_hidden(self):
        return (torch.zeros(self.n_layers, 1, self.hidden_size).cuda(),
                torch.zeros(self.n_layers, 1, self.hidden_size).cuda())

In [0]:
hidden_size = 800
n_layers = 1

# Initialize the model
model = TextGen(vocab.n_chars, hidden_size, vocab.n_chars, n_layers).cuda()

# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer (ADAM is a fancy version of SGD)
optimizer = optim.Adam(model.parameters(), lr=0.0001)


In [0]:
n_epochs = 40
print_every = 100
loss = 0
for e in range(1, n_epochs + 1):
    
    # shuffle the data before starting a new epoch
    data_sequences_shuff = sample(data_sequences, len(data_sequences))
    
    for counter, sequence in enumerate(data_sequences_shuff):
      
      # zero gradients
      optimizer.zero_grad()
      
      # creating a tensor for the input sentence
      seq_len = len(sequence)
      sequence_tensor = torch.LongTensor(sequence).cuda()
      
      # the current sentence loss
      sentence_loss = 0
      
      # initialize the first hidden vector
      hidden = model.init_hidden()
      
      # looping through the network char by char
      for i in range(seq_len - 1):
        # forward
        output, hidden = model(sequence_tensor[i], hidden)
        # loss
        sentence_loss += criterion(output.view(-1).unsqueeze(0), sequence_tensor[i + 1].unsqueeze(0))
    
      # running backward
      sentence_loss.backward()
      
      # updating weights
      optimizer.step()
      
      # averaging total loss
      loss += (sentence_loss.item() / seq_len)
      
      if counter % print_every == 0:
          loss = loss / print_every
          print('Epoch %d, %d/%d, Current Loss = %.4f' % (e, counter, len(data_sequences_shuff), loss))
          loss = 0
    

Epoch 1, 0/2675, Current Loss = 0.0636
Epoch 1, 100/2675, Current Loss = 4.3451
Epoch 1, 200/2675, Current Loss = 3.1980
Epoch 1, 300/2675, Current Loss = 3.0600
Epoch 1, 400/2675, Current Loss = 2.8650
Epoch 1, 500/2675, Current Loss = 2.7675
Epoch 1, 600/2675, Current Loss = 2.6656
Epoch 1, 700/2675, Current Loss = 2.7410
Epoch 1, 800/2675, Current Loss = 2.6136
Epoch 1, 900/2675, Current Loss = 2.6831
Epoch 1, 1000/2675, Current Loss = 2.5316
Epoch 1, 1100/2675, Current Loss = 2.5372
Epoch 1, 1200/2675, Current Loss = 2.6855
Epoch 1, 1300/2675, Current Loss = 2.4685
Epoch 1, 1400/2675, Current Loss = 2.4446
Epoch 1, 1500/2675, Current Loss = 2.4199
Epoch 1, 1600/2675, Current Loss = 2.3755
Epoch 1, 1700/2675, Current Loss = 2.4921
Epoch 1, 1800/2675, Current Loss = 2.3934
Epoch 1, 1900/2675, Current Loss = 2.4263
Epoch 1, 2000/2675, Current Loss = 2.3745
Epoch 1, 2100/2675, Current Loss = 2.3722
Epoch 1, 2200/2675, Current Loss = 2.2691
Epoch 1, 2300/2675, Current Loss = 2.3342
Epoc

# Generation

In [0]:
def generate(model, vocab, start_string, temperature, max_len):
    '''
    This function gets a trained model and vocab and generates a random string
    using the model, seeded with the start_string string.
    The temparature value is used to generate a more diverse output (high value),
    then a conservative one (low value).
    '''
    str = start_string
    sequence = vocab.index_sentence(start_string)
        
    # remove the EOS, we don't need it for generation
    sequence = sequence[:-1]
    
    hidden = model.init_hidden()
    sequence_ten = torch.LongTensor(sequence).cuda()
    for i in range(len(sequence_ten) - 1):
      _, hidden = model(sequence_ten[i], hidden)
      
    output, hidden = model(sequence_ten[-1], hidden) 
    out_dist = output.view(-1).div(temperature).exp()
    new_c = vocab.id2char[torch.multinomial(out_dist, 1)[0].item()]
    str += new_c
    for i in range(max_len):
        new_c_var = torch.LongTensor([vocab.index_char(new_c)]).cuda()
        output, hidden = model(new_c_var, hidden)
        out_dist = output.view(-1).div(temperature).exp()
        char_id = torch.multinomial(out_dist, 1)[0].item()
        if char_id == EOS_TOKEN:
            return str
        new_c = vocab.id2char[char_id]
        str += new_c
    return str

In [0]:
for i in range(200):
    print(generate(model, vocab, 'I got ', 1, 200))

I got me a tan lil italian thang for acuration.  https://t.co/qzqhbvnnyq
I got his hair some efciouse not
I got a new urban only in the projest good people ling...ips. thange.
I got any of these https://t.co/n1vn6ue6jq
I got one new the new up son
I got a new jersey https://t.co/jnrx1klbrd
I got any of the day, ready to fuck just coppins in tradaduphed - @jodanceleri great, i’m literally hours and we’ve already park nobada
I got my about to out 😭
I got any of these https://t.co/uar35kmcjw
I got me a tan lil itmal. like im know😂
I got a new esnow forget https://t.co/oww9lxhxw7
I got hair stopped and go let’s hope it’s of my thing
I got an each on my more than this one of that.
I got me a feliz navidad y’all know
I got any of the dayfice.
I got an extra special patching!
I got any of these https://t.co/ri2e59jcxo
I got me a tan lil italainum. go have thema
I got you an the anightm
I got any of their wouldn't beautiful. it’s like today!! enjoy the day
I got a new urban x nonesh vida you p