In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader

In [40]:
import os
import string

## Keep your training documents in a folder named 'data'
input_data_dir = "data"

# String of punctuation without the full stop
punctuation = string.punctuation.replace('.', '')  # Retain the full stop

def is_hidden(filepath):
    return os.path.basename(filepath).startswith('.')

text_data=""

howManyDocuments = 100
used=0;

for filename in os.listdir(input_data_dir):
    filepath = os.path.join(input_data_dir, filename)
    if not is_hidden(filepath):
        with open(filepath) as infile:
            for line in infile:
                if line.strip():  # Check if line is not just whitespace
                    # Remove all punctuation except full stops
                    for char in punctuation:
                        line = line.replace(char, '')
                    text_data += line
    used=used+1
    if (used==howManyDocuments):
        print(howManyDocuments, "documents used.")
        break


100 documents used.


In [41]:
import nltk
nltk.data.path.append("C:/Users/shabb/nltk_data")  # Change this path if needed
nltk.download('punkt_tab')

# Tokenize the text into words
# Lowercasing for consistency

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\shabb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [42]:
words = nltk.word_tokenize(text_data.lower()) 
words

['asian',
 'exporters',
 'fear',
 'damage',
 'from',
 'u',
 '.',
 's',
 '.',
 'japan',
 'rift',
 'mounting',
 'trade',
 'friction',
 'between',
 'the',
 'u',
 '.',
 's',
 '.',
 'and',
 'japan',
 'has',
 'raised',
 'fears',
 'among',
 'many',
 'of',
 'asia',
 's',
 'exporting',
 'nations',
 'that',
 'the',
 'row',
 'could',
 'inflict',
 'far',
 'reaching',
 'economic',
 'damage',
 'businessmen',
 'and',
 'officials',
 'said',
 '.',
 'they',
 'told',
 'reuter',
 'correspondents',
 'in',
 'asian',
 'capitals',
 'a',
 'u',
 '.',
 's',
 '.',
 'move',
 'against',
 'japan',
 'might',
 'boost',
 'protectionist',
 'sentiment',
 'in',
 'the',
 'u',
 '.',
 's',
 '.',
 'and',
 'lead',
 'to',
 'curbs',
 'on',
 'american',
 'imports',
 'of',
 'their',
 'products',
 '.',
 'but',
 'some',
 'exporters',
 'said',
 'that',
 'while',
 'the',
 'conflict',
 'would',
 'hurt',
 'them',
 'in',
 'the',
 'long',
 'run',
 'in',
 'the',
 'short',
 'term',
 'tokyo',
 's',
 'loss',
 'might',
 'be',
 'their',
 'gain'

In [43]:
# Create a set of all unique words and create a dictionary to convert words to integers
word_to_int = {w: i for i, w in enumerate(set(words))}
int_to_word = {i: w for w, i in word_to_int.items()}


In [44]:
len(word_to_int)

3201

In [45]:
# Convert the entire text to integers
encoded_text = np.array([word_to_int[word] for word in words])


In [46]:
# Prepare dataset
def create_sequences(input_data, seq_length):
    sequences = []
    for i in range(0, len(input_data) - seq_length):
        sequence_in = input_data[i:i + seq_length]
        sequence_out = input_data[i + seq_length]
        sequences.append((sequence_in, sequence_out))
    return sequences


In [47]:
seq_length = 20  # Length of input sequences
sequences = create_sequences(encoded_text, seq_length)

In [48]:
sequences

[(array([2150,  339, 2819,  345,  863,   53, 1764, 2812, 1764,  599, 1204,
         2218,  378, 1790, 2736,  683,   53, 1764, 2812, 1764]),
  3039),
 (array([ 339, 2819,  345,  863,   53, 1764, 2812, 1764,  599, 1204, 2218,
          378, 1790, 2736,  683,   53, 1764, 2812, 1764, 3039]),
  599),
 (array([2819,  345,  863,   53, 1764, 2812, 1764,  599, 1204, 2218,  378,
         1790, 2736,  683,   53, 1764, 2812, 1764, 3039,  599]),
  383),
 (array([ 345,  863,   53, 1764, 2812, 1764,  599, 1204, 2218,  378, 1790,
         2736,  683,   53, 1764, 2812, 1764, 3039,  599,  383]),
  376),
 (array([ 863,   53, 1764, 2812, 1764,  599, 1204, 2218,  378, 1790, 2736,
          683,   53, 1764, 2812, 1764, 3039,  599,  383,  376]),
  3012),
 (array([  53, 1764, 2812, 1764,  599, 1204, 2218,  378, 1790, 2736,  683,
           53, 1764, 2812, 1764, 3039,  599,  383,  376, 3012]),
  2761),
 (array([1764, 2812, 1764,  599, 1204, 2218,  378, 1790, 2736,  683,   53,
         1764, 2812, 1764, 3039,  

In [49]:
class RNNModel(nn.Module):
    def __init__(self, 
                 input_size, 
                 output_size, 
                 hidden_dim, 
                 n_layers):
        super(RNNModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(
            input_size, hidden_dim)
        self.rnn = nn.RNN(
            hidden_dim, hidden_dim, 
            n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.rnn(x, hidden)
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        return torch.zeros(self.n_layers, 
                           batch_size, 
                           self.hidden_dim).to(device)


In [50]:
# Check for GPU
device = torch.device(
    "cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
input_size = len(word_to_int)
output_size = len(word_to_int)
hidden_dim = 256
n_layers = 3
batch_size = 256
epochs = 100

# Initialize model, loss function, and optimizer
model = RNNModel(
    input_size, 
    output_size, 
    hidden_dim, 
    n_layers).to(device)

loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(), lr=0.001)

# Convert sequences to PyTorch tensors
def batchify(data, batch_size):
    # Prepare inputs and targets
    inputs = [item[0] for item in data]
    targets = [item[1] for item in data]

    # Split data into batches
    n_batches = len(inputs) // batch_size
    inputs = inputs[:n_batches * batch_size]
    targets = targets[:n_batches * batch_size]

    # Batchify
    batched_inputs = [torch.tensor(
        inputs[i:i + batch_size], 
        dtype=torch.long) 
                      for i in range(0, 
                                     len(inputs), 
                                     batch_size)]
    batched_targets = [torch.tensor(
        targets[i:i + batch_size], 
        dtype=torch.long) 
                       for i in range(0, 
                                      len(targets), 
                                      batch_size)]

    return batched_inputs, batched_targets

batched_inputs, batched_targets = batchify(sequences, batch_size)


# Training loop
for epoch in range(epochs):
    for batch_idx in range(len(batched_inputs)):
        inputs, targets = batched_inputs[batch_idx], \
        batched_targets[batch_idx]
        inputs, targets = inputs.to(device), \
        targets.to(device)

        hidden = model.init_hidden(batch_size)
        optimizer.zero_grad()
        output, hidden = model(inputs, hidden)


        # Reshape output to [batch_size, seq_length, output_size]
        output = output.view(batch_size, seq_length, -1)

        # Use only the last output of each sequence
        output_last = output[:, -1, :]

        # Flatten the last outputs
        output_flat = output_last.view(-1, output_size)

        # Flatten the targets
        targets_flat = targets.view(-1)

        # Compute the loss
        loss = loss_function(output_flat, targets_flat)

        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')




Epoch 1, Loss: 6.654281139373779
Epoch 2, Loss: 6.287537574768066
Epoch 3, Loss: 6.127289772033691
Epoch 4, Loss: 5.891533374786377
Epoch 5, Loss: 5.681100845336914
Epoch 6, Loss: 5.278131484985352
Epoch 7, Loss: 4.851686954498291
Epoch 8, Loss: 4.418951988220215
Epoch 9, Loss: 4.035642147064209
Epoch 10, Loss: 3.659735918045044
Epoch 11, Loss: 3.4059441089630127
Epoch 12, Loss: 3.183023452758789
Epoch 13, Loss: 2.914156913757324
Epoch 14, Loss: 2.6718673706054688
Epoch 15, Loss: 2.4683420658111572
Epoch 16, Loss: 2.1596057415008545
Epoch 17, Loss: 1.8815668821334839
Epoch 18, Loss: 1.6806443929672241
Epoch 19, Loss: 1.4783462285995483
Epoch 20, Loss: 1.4005036354064941
Epoch 21, Loss: 1.2410924434661865
Epoch 22, Loss: 1.0572154521942139
Epoch 23, Loss: 0.8833005428314209
Epoch 24, Loss: 0.7890704870223999
Epoch 25, Loss: 0.6817677021026611
Epoch 26, Loss: 0.6178960800170898
Epoch 27, Loss: 0.5320671796798706
Epoch 28, Loss: 0.41577383875846863
Epoch 29, Loss: 0.3706173300743103
Epoch

In [51]:
import torch.nn.functional as F

start_seq = "Some exporters said"
model.eval()
input_seq = [word_to_int[word] 
             for word in start_seq.lower().split()]
input_tensor = torch.tensor([input_seq], 
                            dtype=torch.long).to(device)
hidden = model.init_hidden(1)
output, _ = model(input_tensor, hidden)
probabilities = F.softmax(output[-1], dim=0).detach().cpu()

# Choose from top k probabilities
top_prob, top_idx = torch.topk(probabilities, k=1)
next_word = int_to_word[top_idx.numpy()[0]]

In [52]:
next_word

'.'

In [53]:
import torch.nn.functional as F
import random

def generate_text(model, 
                  start_seq, 
                  word_to_int, 
                  int_to_word, 
                  gen_length=50, 
                  top_k=1):
    """
    Generate text using a trained RNN model.

    Parameters:
    model: Trained RNN model.
    start_seq: Starting sequence for text generation.
    word_to_int: Dictionary mapping words to integers.
    int_to_word: Dictionary mapping integers to words.
    gen_length: Number of words to generate.
    top_k: from top_k next words, randomly select one.
    This ensures that results are different for the
    same prompt.

    Returns:
    Generated text.
    """
    model.eval()  # Set the model to evaluation mode

    input_seq = [word_to_int[word] for word in start_seq.lower().split()]
    generated_text = start_seq

    for _ in range(gen_length):
        input_tensor = torch.tensor([input_seq], dtype=torch.long).to(device)
        hidden = model.init_hidden(1)

        output, _ = model(input_tensor, hidden)
        probabilities = F.softmax(output[-1], dim=0).detach().cpu()

        # Choose from top k probabilities
        top_prob, top_idx = torch.topk(probabilities, k=top_k)
        chosen_idx = random.choice(top_idx.numpy())

        next_word = int_to_word[chosen_idx]
        generated_text += ' ' + next_word

        input_seq.append(chosen_idx)
        input_seq = input_seq[1:]

    return generated_text

def generate_text_temperature(model, start_seq, 
                              word_to_int, 
                              int_to_word, 
                              gen_length=50, 
                              temperature=1.0):
    # In this function, the temperature parameter 
    # is used to scale the logits before applying 
    # softmax. A higher temperature (>1) produces 
    # more randomness, while a lower temperature (<1) 
    # makes the model more confident (but potentially 
    # more repetitive).
    
    model.eval()
    input_seq = [word_to_int[word] for word in start_seq.lower().split()]
    generated_text = start_seq

    for _ in range(gen_length):
        input_tensor = torch.tensor([input_seq], dtype=torch.long).to(device)
        hidden = model.init_hidden(1)

        output, _ = model(input_tensor, hidden)
        output = output / temperature  # Adjusting output with temperature
        probabilities = F.softmax(output[-1], dim=0).detach().cpu()

        next_word_idx = torch.multinomial(probabilities, 1).item()
        next_word = int_to_word[next_word_idx]
        generated_text += ' ' + next_word

        input_seq.append(next_word_idx)
        input_seq = input_seq[1:]

    return generated_text

def beam_search(model, 
                start_seq, 
                word_to_int, 
                int_to_word, 
                beam_width=5, 
                gen_length=50):
    # Initialize beams as a list of tuples (sequence, probability)
    initial_seq = [word_to_int[word] for word in start_seq.lower().split()]
    beams = [(initial_seq, 1.0)]

    for _ in range(gen_length):
        new_beams = []
        for seq, prob in beams:
            input_tensor = torch.tensor([seq], dtype=torch.long).to(device)
            hidden = model.init_hidden(1)

            output, _ = model(input_tensor, hidden)
            probabilities = F.softmax(output[-1], dim=0).detach().cpu().numpy()

            # Consider top beam_width choices for this beam
            top_indices = np.argsort(probabilities)[-beam_width:]
            for idx in top_indices:
                new_seq = seq + [idx]
                new_prob = prob * probabilities[idx]
                new_beams.append((new_seq, new_prob))

        # Select top beam_width beams
        beams = sorted(new_beams, key=lambda x: x[1], reverse=True)[:beam_width]

    # Choose the beam with the highest probability
    best_seq, _ = max(beams, key=lambda x: x[1])
    return ' '.join([int_to_word[idx] for idx in best_seq])



In [54]:
start_sequence = "the Business Economics Department"
generated_sequence_length = 500  # Number of words to generate

'''
generated_text = generate_text(model, 
                               start_sequence, 
                               word_to_int, 
                               int_to_word, 
                               generated_sequence_length)

generated_text = generate_text_temperature(
    model, start_sequence, 
    word_to_int, int_to_word, generated_sequence_length)
'''
generated_text = beam_search(model, 
                             start_sequence, 
                             word_to_int, int_to_word, 
                             gen_length=generated_sequence_length)

print(generated_text)

the business economics department said . it said janunary march imports rose to 65 . 1 billion baht from 58 . 7 billion . thailand s improved business climate this year resulted in a 27 pct increase in imports of raw materials and semi finished products . the country s oil import bill however fell 23 pct in the first quarter due to lower oil prices . the department said first quarter exports expanded to 60 . 6 billion baht from 56 . 6 billion . export growth was smaller than expected due to lower earnings from many key commodities including rice whose earnings declined 18 pct maize 66 pct sugar 45 pct tin 26 pct and canned pineapples seven pct . products registering high export growth were jewellery up 64 pct clothing 57 pct and rubber 35 pct .indonesia sees cpo price rising sharply indonesia expects crude palm oil cpo prices to rise sharply to between 450 and 550 dlrs a tonne fob sometime this year because of better european demand and a fall in malaysian output hasrul harahap junior 

the business economics department said . it said janunary march imports rose to 65 . 1 billion baht from 58 . 7 billion . thailand s improved business climate this year resulted in a 27 pct increase in imports of raw materials and semi finished products . the country s oil import bill however fell 23 pct in the first quarter due to lower oil prices . the department said first quarter exports expanded to 60 . 6 billion baht from 56 . 6 billion . export growth was smaller than expected due to lower earnings from many key commodities including rice whose earnings declined 18 pct maize 66 pct sugar 45 pct tin 26 pct and canned pineapples seven pct . products registering high export growth were jewellery up 64 pct clothing 57 pct and rubber 35 pct .indonesia sees cpo price rising sharply indonesia expects crude palm oil cpo prices to rise sharply to between 450 and 550 dlrs a tonne fob sometime this year because of better european demand and a fall in malaysian output hasrul harahap junior minister for tree crops told indonesian reporters . prices of malaysian and sumatran cpo are now around 332 dlrs a tonne cif for delivery in rotterdam traders said . harahap said indonesia would maintain its exports despite making recent palm oil purchases from malaysia so that it could possibly increase its international market share . indonesia the world s second largest producer of palm oil after malaysia has been forced to import palm oil to ensure supplies during the moslem fasting month of ramadan . harahap said it was better to import to cover a temporary shortage than to lose export markets . indonesian exports of cpo in calendar 1986 were 530 500 tonnes against 468 500 in 1985 according to central bank figures .australian foreign ship ban ends but nsw ports hit tug crews in new south wales nsw victoria and western australia yesterday lifted their ban on foreign flag ships carrying containers but nsw ports are still being disrupted by a separate dispute shipping sources said . the ban imposed a week ago over a pay claim had prevented the movement in or out of port of nearly 20 vessels they said . the pay dispute went before a hearing of the arbitration commission today . meanwhile disruption began today to cargo handling in the ports of sydney newcastle and port kembla they said . the industrial action at the nsw ports is part of the week of action called by the nsw trades and labour council to protest changes to the state s workers compensation laws .rubbermaid inc 1st qtr shr 28 cts vs 22 cts rubbermaid inc 1st qtr shr 28 cts vs 22 ctsindependent chairman for dutch cargo dispute the two sides in the rotterdam port general cargo dispute have agreed to appoint an independent chairman han lammers to preside over future meetings employers spokesman gerard zeebregts said . lammers queen s commissioner for the province of flevoland