In [1]:
import numpy as np
import pandas as pd

import torch
import torchtext

import tensorflow as tf

import spacy
nlp = spacy.load('en_core_web_sm')

from tensorflow.keras.preprocessing.text import Tokenizer

2024-09-21 18:56:32.600423: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
df = pd.read_csv('/Users/nazarlenisin/Desktop/Text Generation/Tweets.csv')
df = df['text']
df.head(3)

0               I`d have responded, if I were going
1     Sooo SAD I will miss you here in San Diego!!!
2                         my boss is bullying me...
Name: text, dtype: object

In [3]:
data = ' '.join(df.values.astype(str))
data = [token.lower() for token in data][:60000]

In [4]:
tokenizer = Tokenizer(char_level = True)

tokenizer.fit_on_texts(data)

tokenized_data = tokenizer.texts_to_sequences(data)
tokenized_data = [token[0] for token in tokenized_data]

In [5]:
def windows(data,window_size):
  windows = []

  index = 0
  window_size += 1

  while index + window_size <= len(data) - 1:
    windows.append(data[index: index +window_size])

    index += 1

  return np.array(windows)

In [6]:
window_size = 101

window_data = windows(tokenized_data,window_size)

In [7]:
X = window_data[:,:-1]
y = window_data[:,1:]

X = tf.keras.utils.to_categorical(X)

In [8]:
class Generative_lstm(tf.keras.Model):
  def __init__(self,hidden_size,vocab_size):
    super().__init__()
    self.hidden_size = hidden_size
    self.vocab_size = spacy.vocab

    self.lstm_1 = tf.keras.layers.LSTM(hidden_size,return_sequences = True)
    self.lstm_2 = tf.keras.layers.LSTM(hidden_size,return_sequences = True)

    self.linear = tf.keras.layers.Dense(vocab_size,activation = 'softmax')

  def call(self,X):
    X = self.lstm_1(X)
    X = self.lstm_2(X)
    X = self.linear(X)

    return X

In [9]:
def generate_text(model, tokenizer, seed_text, num_chars=200, temperature=1):

  text = seed_text  

  for _ in range(num_chars):
    input = np.array(tokenizer.texts_to_sequences([text[-window_size:]]))
    rec = tokenizer.sequences_to_texts(input)

    input = tf.one_hot(input, vocab_size)

    preds = model.predict(input)[0, -1:, :]
    preds = tf.math.log(preds) / temperature
    
    next_char = tf.random.categorical(preds, num_samples=1)
    next_char = tokenizer.sequences_to_texts(next_char.numpy())[0]

    text += next_char

  return text

In [10]:
hidden_size = 128
vocab_size = len(tokenizer.word_index) + 1

model = Generative_lstm(hidden_size,vocab_size)


model.compile(
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = False),
    optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001),
    metrics = ['accuracy']
)

model.fit(X,y,batch_size = 32,epochs = 15)

Epoch 1/15
[1m1872/1872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 121ms/step - accuracy: 0.2658 - loss: 2.7057
Epoch 2/15
[1m1872/1872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 111ms/step - accuracy: 0.4149 - loss: 2.0333
Epoch 3/15
[1m1872/1872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 106ms/step - accuracy: 0.4772 - loss: 1.8035
Epoch 4/15
[1m1872/1872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 114ms/step - accuracy: 0.5257 - loss: 1.6167
Epoch 5/15
[1m1872/1872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 119ms/step - accuracy: 0.5801 - loss: 1.4226
Epoch 6/15
[1m1872/1872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 115ms/step - accuracy: 0.6461 - loss: 1.2066
Epoch 7/15
[1m1872/1872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m222s[0m 119ms/step - accuracy: 0.7200 - loss: 0.9769
Epoch 8/15
[1m1872/1872[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 116ms/step - accuracy: 0.7895 - loss:

<keras.src.callbacks.history.History at 0x16ccdeb50>

In [12]:
print(generate_text(model, tokenizer, "That what i was", num_chars = 300, temperature = 0.001))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22