<a href="https://colab.research.google.com/github/spatiebalk/text_mining_project/blob/master/text_generation_GRU_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text generation with a GRU on news data

In [1]:
import numpy as np
import tensorflow as tf
import os
from os.path import join, isfile
import time
import csv
from tqdm import tqdm
import json
from os import listdir
from os.path import isfile, join
import sys

In [2]:
# Record the Python and TensorFlow versions this notebook runs on.
import sys
print(sys.version, tf.__version__, sep="\n")

3.6.9 (default, Oct  8 2020, 12:12:24) 
[GCC 8.4.0]
2.4.0


In [3]:
# Mount Google Drive at /content/gdrive; the data, checkpoint and output
# paths used by the later cells all live under this mount point (Colab only).
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


### News data

In [4]:
# Disabled one-off preprocessing step, kept for reference. It builds the
# news_data.txt corpus file: for a CSV in news_dir it prints the header row,
# then appends the last column of every following row (padded with spaces)
# to `text`, and finally writes `text` to news_data.txt.
# NOTE(review): the `break` at the end of the file loop stops after the first
# file, so only one CSV ends up in the corpus - confirm this was intended.
# news_dir = '/content/gdrive/My Drive/TxMM/news_data'
# files = [f for f in listdir(news_dir) if isfile(join(news_dir, f))]
# text = ""

# csv.field_size_limit(sys.maxsize)

# for f in files:
#   with open(join(news_dir, f)) as csv_file:
#     csv_reader = csv.reader(csv_file, delimiter=',')
#     line_count = 0
#     for row in tqdm(csv_reader):
#       if line_count == 0:
#         print(f'Column names are {", ".join(row)}')
#         line_count += 1
#       else:
#         text = text + " " + row[-1] + " "
#         line_count += 1
#   break

# with open('/content/gdrive/My Drive/TxMM/news_data.txt', 'w') as outfile:
    
#     outfile.write(text)

In [5]:
# Load the whole news corpus into memory as a single string. The context
# manager closes the file handle as soon as the read finishes instead of
# leaving it open until garbage collection.
# NOTE(review): no encoding is passed, so decoding uses the platform default.
with open('/content/gdrive/My Drive/TxMM/news_data.txt') as corpus_file:
  text = corpus_file.read()

In [None]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
vocab = sorted(set(text))
# Lookup tables in both directions: character -> id and id -> character.
char2index = dict(zip(vocab, range(len(vocab))))
index2char = np.array(vocab)
# The full corpus encoded as an array of character ids.
text_as_int = np.array([char2index[ch] for ch in text])

In [None]:
# Every chunk holds seq_length + 1 characters so that an input and a target
# shifted by one position can both be cut from the same chunk.
seq_length = 25
chunk_length = seq_length + 1
examples_per_epoch = len(text) // chunk_length
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
# Group the character ids into fixed-size chunks, dropping an incomplete tail.
sequences = char_dataset.batch(chunk_length, drop_remainder=True)

def split_input_target(data):
  """Split one chunk into an (input, target) pair offset by one position.

  The input is the chunk without its last element; the target is the chunk
  without its first element.
  """
  return data[:-1], data[1:]

BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Turn each chunk into an (input, target) pair, shuffle the pairs through a
# buffer of BUFFER_SIZE elements, and group them into full batches only.
dataset = (
  sequences
  .map(split_input_target)
  .shuffle(BUFFER_SIZE)
  .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
# Size of the character vocabulary and of each character embedding vector
vocab_size = len(vocab)
embedding_dim = 300

# Number of RNN units of the first and second recurrent layer
rnn_units1, rnn_units2 = 512, 256
rnn_units = [rnn_units1, rnn_units2]

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Build the character-level model: Embedding -> stacked GRUs -> Dense.

  Args:
    vocab_size: Number of distinct characters; used both as the embedding
      input size and as the number of output logits.
    embedding_dim: Size of each character embedding vector.
    rnn_units: Unit count of each GRU layer, in order. Either a single int
      (one GRU layer) or a sequence of ints (one GRU layer per entry).
    batch_size: Fixed batch size baked into the input shape; the GRU layers
      are stateful, so the batch size must be known when the model is built.

  Returns:
    An uncompiled tf.keras.Sequential model whose Dense output has no
    activation, i.e. it emits one logit per vocabulary entry per time step.

  Raises:
    ValueError: If rnn_units is an empty sequence.
  """
  # The layer sizes come from the rnn_units argument rather than from
  # module-level globals, so the caller actually controls the architecture.
  layer_sizes = [rnn_units] if isinstance(rnn_units, int) else list(rnn_units)
  if not layer_sizes:
    raise ValueError("rnn_units must contain at least one layer size")

  layers = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
       batch_input_shape=[batch_size, None])
  ]
  for units in layer_sizes:
    layers.append(
      tf.keras.layers.GRU(units, return_sequences=True,
         stateful=True, recurrent_initializer='glorot_uniform'))
  layers.append(tf.keras.layers.Dense(vocab_size))
  return tf.keras.Sequential(layers)

# Model used for training; its batch size matches the batched dataset.
model = build_model(vocab_size, embedding_dim, rnn_units,
                    batch_size=BATCH_SIZE)

def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits."""
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy
  return crossentropy(labels, logits, from_logits=True)
  
# Train with Adam on the logits-based loss defined above, tracking accuracy.
model.compile(
  loss=loss,
  optimizer='adam',
  metrics=['accuracy'],
)
model.summary()

In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = '/content/gdrive/My Drive/TxMM/GRU_results/training_checkpoints_GRU_news'

# Name of the checkpoint files
checkpoint_prefix = join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights (not the full model) during fit()
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
  filepath=checkpoint_prefix,
  save_weights_only=True,
)

In [None]:
EPOCHS = 25
training_callbacks = [checkpoint_callback]

# Train the model; the callback writes weight checkpoints to checkpoint_dir.
history = model.fit(dataset, epochs=EPOCHS, callbacks=training_callbacks)

# Remember the most recent checkpoint found in checkpoint_dir.
latest_check = tf.train.latest_checkpoint(checkpoint_dir)

In [None]:
# Rebuild the network with batch size 1 for generation: the input shape of
# the training model is fixed to BATCH_SIZE, so it cannot take one prompt.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Look the checkpoint up here instead of relying on a variable left behind by
# the training cell, so this cell also works after a kernel restart without
# re-running the training.
latest_check = tf.train.latest_checkpoint(checkpoint_dir)
if latest_check is None:
  # Fail with a clear message rather than passing None to load_weights.
  raise FileNotFoundError(f'No checkpoint found in {checkpoint_dir}')

model.load_weights(latest_check)
model.build(tf.TensorShape([1, None]))
model.summary()

In [None]:
def generate_text(model, start_string, num_generate=1000, temperature=0.5):
  """Generate text one character at a time, continuing from start_string.

  Relies on the module-level lookup tables char2index and index2char.

  Args:
    model: Model that maps a batch of character ids to per-character logits.
      Its states are reset before generation starts; it is called with a
      batch of size 1.
    start_string: Non-empty prompt. Every character must be a key of
      char2index.
    num_generate: Number of characters to generate after the prompt.
    temperature: Positive value the logits are divided by before sampling.
      Values below 1 sharpen the distribution (more predictable text),
      values above 1 flatten it (more surprising text).

  Returns:
    start_string followed by the num_generate generated characters.

  Raises:
    ValueError: If start_string is empty or temperature is not positive.
    KeyError: If start_string contains a character missing from char2index.
  """
  # Reject inputs that would otherwise only fail deep inside TensorFlow
  # (empty sequence) or produce infinite/sign-flipped logits (temperature).
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Encode the prompt and add a leading batch dimension of size 1.
  input_eval = tf.expand_dims([char2index[s] for s in start_string], 0)
  text_generated = []

  # Here batch size == 1; start from a clean recurrent state.
  model.reset_states()

  for _ in range(num_generate):
    predictions = model(input_eval)
    # remove the batch dimension and apply the temperature
    predictions = tf.squeeze(predictions, 0) / temperature

    # Sample from the categorical distribution over characters and keep the
    # sample drawn for the last time step.
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Pass the predicted character as the next input to the model
    # along with the previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(index2char[predicted_id])

  return start_string + "".join(text_generated)

In [None]:
# Read the prompts file, one prompt per line; the context manager closes the
# file handle once the lines are in memory.
# NOTE(review): readlines() keeps the trailing newline, so each prompt handed
# to generate_text ends with "\n" - confirm this is intended.
with open('/content/gdrive/My Drive/TxMM/news_prompts.txt', 'r') as prompts_file:
  prompts = prompts_file.readlines()

generated_dict = {}

# Wrap the list itself (not the enumerate object) so tqdm can read its length
# and show progress against a total.
for i, p in enumerate(tqdm(prompts)):
  # generate 1000 characters
  gen_text = generate_text(model, start_string=p)
  generated_dict[i] = gen_text

# Persist all generated texts, keyed by the prompt's position in the file.
with open('/content/gdrive/My Drive/TxMM/GRU_results/news_generated_texts.json', 'w') as fp:
  json.dump(generated_dict, fp)