In [1]:
import numpy as np
import pandas as pd

import torch
import torchtext

import tensorflow as tf

import spacy
nlp = spacy.load('en_core_web_sm')

from torchtext.vocab import build_vocab_from_iterator
import torchtext.transforms as T

from torch.utils.data import DataLoader

2024-09-21 19:57:39.732252: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the tweets CSV and keep only its 'text' column (a pandas Series).
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
df = pd.read_csv('/Users/nazarlenisin/Desktop/Text Generation/Tweets.csv')['text']
df.head(3)

0               I`d have responded, if I were going
1     Sooo SAD I will miss you here in San Diego!!!
2                         my boss is bullying me...
Name: text, dtype: object

In [3]:
# Glue every tweet into one space-separated string, then turn it into a list
# whose elements are the individually lower-cased characters of that string
# (iterating a str yields one character at a time).
data = list(map(str.lower, ' '.join(df.values.astype(str))))

In [4]:
class Tokenize_Text:
  """Build a torchtext vocabulary over a token sequence and encode tokens as ids.

  Every `text` argument is iterated element by element, so a plain string is
  treated as a sequence of characters and a list as a sequence of tokens.
  """

  def __init__(self,nlp):
    # Stored for callers; no method of this class reads it.
    self.nlp = nlp
    # Filled in by `_vocab`; None until a vocabulary has been built.
    self.vocab = None

  def _yield_tokens(self,text):
    """Yield each element of `text` wrapped in a one-item list.

    `build_vocab_from_iterator` iterates over every yielded item to collect
    tokens, so yielding a bare multi-character string would split that token
    into characters. Wrapping keeps each token whole, which matches how
    `tokenize` later looks tokens up.
    """
    for token in text:
      yield [token]

  def _vocab(self,text):
    """Build a vocabulary from `text`, store it on `self.vocab` and return it.

    '<unk>' is registered as a special token and used as the default index,
    so unknown tokens map to it instead of raising.
    """
    vocab = build_vocab_from_iterator(
        self._yield_tokens(text),
        specials = ['<unk>']
    )

    vocab.set_default_index(vocab['<unk>'])
    self.vocab = vocab

    return vocab

  def tokenize(self,text,vocab = None):
    """Encode `text` as a tensor of vocabulary indices.

    Args:
      text: iterable of tokens (a str is consumed character by character).
      vocab: vocabulary to encode with. When None, a new vocabulary is built
        from `text` itself and stored on `self.vocab`.

    Returns:
      The result of applying `VocabTransform(vocab)` followed by `ToTensor()`
      to the list of tokens.
    """
    # Materialise once so a one-shot iterator is not consumed before the
    # vocabulary is built from it.
    tokens = list(text)

    if vocab is None:
      vocab = self._vocab(tokens)

    transforms = T.Sequential(
        T.VocabTransform(vocab),
        T.ToTensor()
    )

    tokenized_text = transforms(tokens)

    return tokenized_text

In [5]:
def windows(data,window_size):
  """Return every overlapping run of `window_size + 1` consecutive items of `data`.

  Each window is one element longer than `window_size` so that a caller can
  split it into an input (`[:-1]`) and a one-step-shifted target (`[1:]`),
  both of length `window_size`.

  Args:
    data: sliceable sequence supporting `len()` (list, numpy array, ...).
    window_size: number of items in the input part of each window.

  Returns:
    `np.array` of the windows in order of their start index, including the
    final window that ends on the last element of `data`. When `data` is
    shorter than `window_size + 1` the result is an empty array.
  """
  span = window_size + 1

  # The last valid start index is len(data) - span, hence "+ 1" windows.
  n_windows = len(data) - span + 1

  return np.array([data[start: start + span] for start in range(n_windows)])

In [6]:
class Generative_lstm(torch.nn.Module):
  """LSTM over one-hot encoded token ids followed by a per-step linear layer.

  `forward` one-hot encodes its input to `vocab_size` classes, runs the
  batch-first LSTM and projects every LSTM output vector back to
  `vocab_size` scores.
  """

  def __init__(self,sequence_length,hidden_size,num_layers,vocab_size):
    super().__init__()

    # Hyperparameters kept as attributes; `sequence_length` is not read by `forward`.
    self.sequence_length, self.hidden_size = sequence_length, hidden_size
    self.num_layers, self.vocab_size = num_layers, vocab_size

    self.lstm = torch.nn.LSTM(
        input_size = vocab_size,
        hidden_size = hidden_size,
        num_layers = num_layers,
        batch_first = True
    )
    # Registered as a submodule but not called in `forward`.
    self.flatten = torch.nn.Flatten()
    self.linear = torch.nn.Linear(in_features = hidden_size,out_features = vocab_size)

  def forward(self,X):
    """Return the linear layer's scores for every position of `X`."""
    one_hot = torch.nn.functional.one_hot(X,num_classes = self.vocab_size).to(torch.float32)

    # Only the per-step outputs are used; the final (h, c) state is discarded.
    lstm_out,_ = self.lstm(one_hot)

    return self.linear(lstm_out)

In [7]:
class Train_Model:
  """Small training / evaluation loop for a model that scores every sequence position.

  Batches are `(X, y)` pairs. The model output is permuted with
  `permute(0,2,1)` before it is handed to the loss function, and accuracy is
  the fraction of positions where `argmax(-1)` of the output equals `y`.
  """

  def __init__(self,model,loss_function,optimizer,epochs = 5,max_batches = 1875):
    """
    Args:
      model: module called as `model(X)`.
      loss_function: callable taking `(permuted_prediction, y)`.
      optimizer: optimizer over the model's parameters.
      epochs: number of passes `fit` makes over the training batches.
      max_batches: upper bound on the batches processed per epoch by `fit`
        and in total by `eval`; None processes every batch.
    """
    self.model = model
    self.loss_function = loss_function
    self.optimizer = optimizer
    self.epochs = epochs
    self.max_batches = max_batches

  def _batch_metrics(self,X,y):
    """Run the model on one batch and return `(loss_tensor, accuracy_float)`."""
    prediction = self.model(X)
    labels = prediction.argmax(-1)

    loss = self.loss_function(prediction.permute(0,2,1),y)
    # Exact fraction of matching positions, computed as a plain Python float.
    acc = (labels == y).sum().item() / y.numel()

    return loss,acc

  def fit(self,train_data_batched):
    """Train `self.model` for `self.epochs` epochs and return it.

    Prints the loss / accuracy of every 100th batch and the per-epoch means,
    averaged over the batches actually processed in that epoch.
    """
    from tqdm.auto import tqdm

    model = self.model
    model.train()

    for epoch in tqdm(range(self.epochs)):
      print(f'\n\nEpoch: {epoch}')

      # Reset per epoch so one epoch's totals do not bleed into the next.
      train_batch_loss = 0.0
      train_batch_acc = 0.0
      batches_seen = 0

      for batch,(X,y) in tqdm(enumerate(train_data_batched)):
        if self.max_batches is not None and batch >= self.max_batches: break
        if batch % 100 == 0: print(f'\nBatch {batch} / {len(train_data_batched)}')

        train_loss,train_acc = self._batch_metrics(X,y)
        if batch % 100 == 0: print(f'Train Loss: {train_loss.item()} | Train Accuracy: {train_acc}')

        self.optimizer.zero_grad()
        train_loss.backward()
        self.optimizer.step()

        # Accumulate a float, not the tensor: summing the tensor would keep
        # every batch's autograd graph alive for the whole epoch.
        train_batch_loss += train_loss.item()
        train_batch_acc += train_acc
        batches_seen += 1

      # Average over what was really processed (guarding an empty loader).
      train_batch_loss /= max(batches_seen,1)
      train_batch_acc /= max(batches_seen,1)

      print(f'Epoch: {epoch} | Train Loss: {train_batch_loss} | Train Accuracy: {train_batch_acc}')

    return model

  def eval(self,test_data_batched):
    """Evaluate `self.model` and return `(mean_loss, mean_accuracy)`.

    Switches the model to evaluation mode (it is left in that mode) and runs
    without gradient tracking. The means are taken over the batches actually
    processed and are also printed.
    """
    from tqdm.auto import tqdm

    model = self.model
    model.eval()

    test_batch_loss = 0.0
    test_batch_acc = 0.0
    batches_seen = 0

    with torch.no_grad():
      for batch,(X,y) in tqdm(enumerate(test_data_batched)):
        if self.max_batches is not None and batch >= self.max_batches: break
        if batch % 10 == 0: print(f'\nBatch {batch} / {len(test_data_batched)}')

        test_loss,test_acc = self._batch_metrics(X,y)
        if batch % 100 == 0: print(f'Test Loss: {test_loss.item()} | Test Accuracy: {test_acc}')

        test_batch_loss += test_loss.item()
        test_batch_acc += test_acc
        batches_seen += 1

    test_batch_loss /= max(batches_seen,1)
    test_batch_acc /= max(batches_seen,1)

    print(f'\nTest Loss: {test_batch_loss} | Test Accuracy: {test_batch_acc}')

    return test_batch_loss,test_batch_acc

In [8]:
def generate_text(model,vocab,inverse_vocab, tokenizer, seed_text, num_chars=200, temperature= 0.001, context_size=101):
  """Extend `seed_text` by sampling `num_chars` tokens from `model`, one at a time.

  At every step the last `context_size` characters of the text so far are
  encoded with `tokenizer.tokenize(..., vocab)`, fed to the model as a batch
  of one, and the next token is drawn from the softmax of the final
  position's scores divided by `temperature`.

  NOTE: the notebook defines `generate_text` again in a later cell; that
  later definition is the one in effect once both cells have run.

  Args:
    model: callable returning scores indexed as `[batch, position, token]`.
    vocab: vocabulary passed through to `tokenizer.tokenize`.
    inverse_vocab: mapping from token index to the string appended to the text.
    tokenizer: object exposing `tokenize(text, vocab)`.
    seed_text: non-empty starting text.
    num_chars: number of sampling steps.
    temperature: positive scale; smaller values sample closer to the argmax.
    context_size: positive number of trailing characters shown to the model.

  Returns:
    `seed_text` followed by the sampled tokens.

  Raises:
    ValueError: if `seed_text` is empty or `temperature` is not positive.
  """
  if not seed_text:
    raise ValueError('seed_text must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  text = seed_text

  # Inference only: no autograd graph is needed while sampling.
  with torch.no_grad():
    for _ in range(num_chars):
      context = torch.as_tensor(tokenizer.tokenize(text[-context_size:],vocab))

      # Scores of the last position for the single sequence in the batch.
      logits = model(context.unsqueeze(0))[0, -1, :]

      # softmax(logits / T) is the same distribution as sampling from
      # log(softmax(logits)) / T, without taking the log of underflowed zeros.
      probs = torch.softmax(logits / temperature, dim = -1)
      next_index = torch.multinomial(probs, num_samples = 1).item()

      text += inverse_vocab[next_index]

  return text
    

In [9]:
# Encode the whole corpus. No vocabulary is passed, so `tokenize` builds one
# from `data` and stores it on the tokenizer instance.
Tokenizer = Tokenize_Text(nlp)
tokenized_data = np.array(Tokenizer.tokenize(data))

# Token -> index vocabulary, plus the index -> token mapping used for decoding.
vocab = Tokenizer.vocab
inverse_vocab = dict((index, token) for token, index in vocab.get_stoi().items())


In [10]:
# Overlapping windows over the encoded corpus.
sliced_data = windows(tokenized_data,101)

# Inputs drop the last column, targets drop the first: y is X shifted by one step.
X, y = sliced_data[:,:-1], sliced_data[:,1:]

# Decode the first ten tokens of the first input / target pair as a sanity check.
' '.join(inverse_vocab[token] for token in X[0][:10]),' '.join(inverse_vocab[token] for token in y[0][:10])

('  i ` d   h a v e  ', 'i ` d   h a v e   r')

In [11]:
batch_size = 32
batched_data = DataLoader(list(zip(X,y)),batch_size = batch_size)

# Peek at one batch under fresh names. Unpacking into `x,y` would overwrite the
# target array `y` that the DataLoader above is built from, so re-running this
# cell would zip `X` against a single batch instead of the full targets.
x_batch,y_batch = next(iter(batched_data))

x_batch.shape,y_batch.shape

(torch.Size([32, 101]), torch.Size([32, 101]))

In [12]:
# Model hyperparameters; sequence length and vocabulary size come from the data.
hidden_size, num_layers = 128, 2
sequence_length = len(X[0])
vocab_size = len(vocab.get_stoi())

model = Generative_lstm(
    sequence_length = sequence_length,
    hidden_size = hidden_size,
    num_layers = num_layers,
    vocab_size = vocab_size
)

In [22]:
# Optimisation settings.
lr, epochs = 0.001, 5

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr = lr)

# Train on the batched windows; `fit` hands the trained model back.
Trainer = Train_Model(
    model = model,
    loss_function = loss_function,
    optimizer = optimizer,
    epochs = epochs
)
model = Trainer.fit(batched_data)

  0%|          | 0/5 [00:00<?, ?it/s]



Epoch: 0


0it [00:00, ?it/s]


Batch 0 / 59535
Train Loss: 1.208869218826294 | Train Accuracy: 0.6367574257425742

Batch 100 / 59535
Train Loss: 0.8957428932189941 | Train Accuracy: 0.7283415841584159

Batch 200 / 59535
Train Loss: 0.7051419615745544 | Train Accuracy: 0.786819306930693

Batch 300 / 59535
Train Loss: 0.7147476077079773 | Train Accuracy: 0.8115717821782178

Batch 400 / 59535
Train Loss: 0.6015011072158813 | Train Accuracy: 0.8301361386138614

Batch 500 / 59535
Train Loss: 0.6922172904014587 | Train Accuracy: 0.7933168316831684

Batch 600 / 59535
Train Loss: 0.6821919679641724 | Train Accuracy: 0.7920792079207921

Batch 700 / 59535
Train Loss: 0.7874571681022644 | Train Accuracy: 0.8050742574257426

Batch 800 / 59535
Train Loss: 0.8138601183891296 | Train Accuracy: 0.7057549504950495

Batch 900 / 59535
Train Loss: 0.5656724572181702 | Train Accuracy: 0.8418935643564357

Batch 1000 / 59535
Train Loss: 0.7411655187606812 | Train Accuracy: 0.8143564356435643

Batch 1100 / 59535
Train Loss: 0.555171608924

0it [00:00, ?it/s]


Batch 0 / 59535
Train Loss: 1.1865246295928955 | Train Accuracy: 0.6466584158415841

Batch 100 / 59535
Train Loss: 0.7658790349960327 | Train Accuracy: 0.7753712871287128

Batch 200 / 59535
Train Loss: 0.6792097091674805 | Train Accuracy: 0.7722772277227723

Batch 300 / 59535
Train Loss: 0.6068872213363647 | Train Accuracy: 0.84375

Batch 400 / 59535
Train Loss: 0.5874357223510742 | Train Accuracy: 0.8087871287128713

Batch 500 / 59535
Train Loss: 0.7280972003936768 | Train Accuracy: 0.791769801980198

Batch 600 / 59535
Train Loss: 0.7066624760627747 | Train Accuracy: 0.7663985148514851

Batch 700 / 59535
Train Loss: 0.7833223938941956 | Train Accuracy: 0.7626856435643564

Batch 800 / 59535
Train Loss: 0.8557454943656921 | Train Accuracy: 0.7212252475247525

Batch 900 / 59535
Train Loss: 0.5604797005653381 | Train Accuracy: 0.8313737623762376

Batch 1000 / 59535
Train Loss: 0.75356125831604 | Train Accuracy: 0.7685643564356436

Batch 1100 / 59535
Train Loss: 0.5864979028701782 | Train

0it [00:00, ?it/s]


Batch 0 / 59535
Train Loss: 1.1141180992126465 | Train Accuracy: 0.6751237623762376

Batch 100 / 59535
Train Loss: 0.7358376383781433 | Train Accuracy: 0.7892945544554455

Batch 200 / 59535
Train Loss: 0.6502729654312134 | Train Accuracy: 0.7645420792079208

Batch 300 / 59535
Train Loss: 0.6102358102798462 | Train Accuracy: 0.8248762376237624

Batch 400 / 59535
Train Loss: 0.6237249970436096 | Train Accuracy: 0.8155940594059405

Batch 500 / 59535
Train Loss: 0.6635831594467163 | Train Accuracy: 0.8081683168316832

Batch 600 / 59535
Train Loss: 0.5957456827163696 | Train Accuracy: 0.8270420792079208

Batch 700 / 59535
Train Loss: 0.6948043704032898 | Train Accuracy: 0.8449876237623762

Batch 800 / 59535
Train Loss: 0.7559986114501953 | Train Accuracy: 0.7663985148514851

Batch 900 / 59535
Train Loss: 0.5919710993766785 | Train Accuracy: 0.8270420792079208

Batch 1000 / 59535
Train Loss: 0.7202109694480896 | Train Accuracy: 0.7725866336633663

Batch 1100 / 59535
Train Loss: 0.5895297527

0it [00:00, ?it/s]


Batch 0 / 59535
Train Loss: 1.1793403625488281 | Train Accuracy: 0.6584158415841584

Batch 100 / 59535
Train Loss: 0.8160816431045532 | Train Accuracy: 0.786200495049505

Batch 200 / 59535
Train Loss: 0.6410121321678162 | Train Accuracy: 0.8208539603960396

Batch 300 / 59535
Train Loss: 0.6469395756721497 | Train Accuracy: 0.8224009900990099

Batch 400 / 59535
Train Loss: 0.527975857257843 | Train Accuracy: 0.8412747524752475

Batch 500 / 59535
Train Loss: 0.6775121688842773 | Train Accuracy: 0.7663985148514851

Batch 600 / 59535
Train Loss: 0.5775192379951477 | Train Accuracy: 0.7985767326732673

Batch 700 / 59535
Train Loss: 0.7206678986549377 | Train Accuracy: 0.8081683168316832

Batch 800 / 59535
Train Loss: 0.7010076642036438 | Train Accuracy: 0.7540222772277227

Batch 900 / 59535
Train Loss: 0.5035039186477661 | Train Accuracy: 0.880569306930693

Batch 1000 / 59535
Train Loss: 0.6946307420730591 | Train Accuracy: 0.7790841584158416

Batch 1100 / 59535
Train Loss: 0.5773632526397

0it [00:00, ?it/s]


Batch 0 / 59535
Train Loss: 1.0345689058303833 | Train Accuracy: 0.6714108910891089

Batch 100 / 59535
Train Loss: 0.766114354133606 | Train Accuracy: 0.7818688118811881

Batch 200 / 59535
Train Loss: 0.6211530566215515 | Train Accuracy: 0.8360148514851485

Batch 300 / 59535
Train Loss: 0.5934463739395142 | Train Accuracy: 0.8295173267326733

Batch 400 / 59535
Train Loss: 0.5522067546844482 | Train Accuracy: 0.8431311881188119

Batch 500 / 59535
Train Loss: 0.7152782678604126 | Train Accuracy: 0.7512376237623762

Batch 600 / 59535
Train Loss: 0.5462782979011536 | Train Accuracy: 0.8159034653465347

Batch 700 / 59535
Train Loss: 0.653044581413269 | Train Accuracy: 0.8329207920792079

Batch 800 / 59535
Train Loss: 0.7442502975463867 | Train Accuracy: 0.786819306930693

Batch 900 / 59535
Train Loss: 0.5709152221679688 | Train Accuracy: 0.8369430693069307

Batch 1000 / 59535
Train Loss: 0.6724869608879089 | Train Accuracy: 0.8214727722772277

Batch 1100 / 59535
Train Loss: 0.5130756497383

In [14]:
# NOTE: `generate_text` is also defined in an earlier cell of this notebook;
# this later definition is the one in effect once both cells have run.
def generate_text(model,vocab,inverse_vocab, tokenizer, seed_text, num_chars=200, temperature= 0.001, context_size=101):
  """Extend `seed_text` by sampling `num_chars` tokens from `model`, one at a time.

  At every step the last `context_size` characters of the text so far are
  encoded with `tokenizer.tokenize(..., vocab)`, fed to the model as a batch
  of one, and the next token is drawn from the softmax of the final
  position's scores divided by `temperature`.

  Args:
    model: callable returning scores indexed as `[batch, position, token]`.
    vocab: vocabulary passed through to `tokenizer.tokenize`.
    inverse_vocab: mapping from token index to the string appended to the text.
    tokenizer: object exposing `tokenize(text, vocab)`.
    seed_text: non-empty starting text.
    num_chars: number of sampling steps.
    temperature: positive scale; smaller values sample closer to the argmax.
    context_size: positive number of trailing characters shown to the model.

  Returns:
    `seed_text` followed by the sampled tokens.

  Raises:
    ValueError: if `seed_text` is empty or `temperature` is not positive.
  """
  if not seed_text:
    raise ValueError('seed_text must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  text = seed_text

  # Inference only: no autograd graph is needed while sampling.
  with torch.no_grad():
    for _ in range(num_chars):
      context = torch.as_tensor(tokenizer.tokenize(text[-context_size:],vocab))

      # Scores of the last position for the single sequence in the batch.
      logits = model(context.unsqueeze(0))[0, -1, :]

      # softmax(logits / T) is the same distribution as sampling from
      # log(softmax(logits)) / T, without taking the log of underflowed zeros.
      probs = torch.softmax(logits / temperature, dim = -1)
      next_index = torch.multinomial(probs, num_samples = 1).item()

      text += inverse_vocab[next_index]

  return text
    

In [26]:
# Sample 100 more characters after the seed text at temperature 0.1.
generate_text(
    model = model,
    vocab = vocab,
    inverse_vocab = inverse_vocab,
    tokenizer = Tokenizer,
    seed_text = 'i had responded to',
    num_chars = 100,
    temperature = 0.1
)

'i had responded to put much is bad shates later and now much i just again it hell  i`ll now   ohh? i hade ir arast hav'