<a href="https://colab.research.google.com/github/rodrigoromanguzman/Actividades_Aprendizaje-/blob/main/Copy_of_language_model_rnn_from_zero.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [134]:
import numpy as np

# Utility libraries
import random
import re


from google.colab import drive


In [135]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [136]:
news_s_path = "/content/drive/MyDrive/newsSpace"

In [137]:
# Utility function for detecting urls
def is_url(s):
    # A simple regex to check for a basic URL structure
    return re.match(r'https?://', s) is not None


In [138]:
# We will separate the batches as the elements from an individual article
data_articles = []
# Lets collect all the unique ocurrences of each word
# to create the dictionary
vocabulary = set([])
num_articles = 100
counter = 0
with open(news_s_path, encoding='ISO-8859-1') as f:
    while (counter<num_articles):
      line = f.readline()
      if not line:
          break
      data_line = line.strip().split()
      url_index = next((i for i, item in enumerate(data_line) if is_url(item)), None)
      # We will get only the articles which have a url
      if url_index is not None:
        # We take after the url
        article = line.strip().split()[url_index+1::]
        [vocabulary.add(i) for i in article]
        data_articles.append(article)
      counter += 1
print("Vocabulary size -> ", len(vocabulary))
# print(data_articles[:100])
# print(list(vocabulary)[:200])


Vocabulary size ->  1448


In [139]:
# Splitting data into training and test
test_percentage = 0.2
random.shuffle(data_articles)
split_point = int(len(data_articles)*test_percentage)
training_set = data_articles[split_point::]
test_set = data_articles[:split_point:]

In [140]:
# Word embedding initialization with random values
mean = 0
std_dev = 0.01
vocab_size = len(vocabulary)
embedding_size = 150 #Hardcoded value for the size of the vectors
word_embeddings = np.random.normal(mean, std_dev, size=(vocab_size, embedding_size))
hidden_size = 500  # Size of the hidden state vectors

# Create a dictionary to map words to their embedding vectors
word_to_embedding = {}
for i, word in enumerate(vocabulary):
    word_to_embedding[word] = word_embeddings[i]

In [141]:
# Initialize weights randomly
W_e = np.random.normal(mean, std_dev, size=(vocab_size, embedding_size))
W_h = np.random.normal(mean, std_dev, size=(hidden_size,hidden_size))
W_y = np.random.normal(mean, std_dev, size=(hidden_size,vocab_size))# Hidden to output
bh = np.zeros((hidden_size, 1))  # Hidden bias
by = np.zeros((vocab_size, 1))  # Output bias


In [142]:
# One hot encoded vocabulary
vocabulary_list = list(vocabulary)

vocabulary_list.sort()  # Sorting to ensure consistent indexing

# Create a dictionary that maps words to indices based on the sorted order
word_to_index = {word: i for i, word in enumerate(vocabulary)}
word_to_index['<UNK>'] = len(vocabulary)  # Add '<UNK>' token with a new index

In [143]:
def sentence_to_indices(sentence, word_to_index):
    return [word_to_index.get(word, word_to_index['<UNK>']) for word in sentence.split()]

In [144]:
def softmax(x):
  return np.exp(x)/sum(np.exp(x))

In [145]:
def tahnh(x):
  return np.tanh(x)

In [146]:
# Input the probabilies and the true value
def cross_entropy(y_probs,y_true):
  return -np.sum(y_true * np.log(y_probs))

In [147]:
# Input the sentnece as thie index representation
# parameters: dictionary with keys -> W_e, W_h, W_y, bh, by
def forward_pass(indices, word_embeddings,parameters):
    h_prev = np.zeros((hidden_size, 1))  # Initial hidden state

    states = {'es': {},'hs': {},'ys': {},'ps': {}}
    states['hs'][-1] = np.copy(h_prev)

    # Forward pass
    for t in range(len(indices)):
        states['es'][t] = word_embeddings[indices[t]]  # Embedding vector for current input word
        h_rec = np.dot(parameters['W_h'], h_prev)+ np.dot(parameters['W_e'], states['es'][t]) + parameters['bh']
        states['hs'][t] = tahnh(h_rec)  # Hidden state
        states['ys'][t] = np.dot(parameters['W_y'].T, states['hs'][t]) + parameters['by']  # Unnormalized log probabilities for next words
        states['ps'][t] = softmax(states['ys'][t])  # Probabilities for next words
        h_prev = states['hs'][t]  # Pass the current hidden state to the next time step
    return states

In [148]:
# Backpropagation
# outputs -> dictionary of dictionaries: es, hs, ys, ps
# parameters -> dictionary with: W_h, W_y, bh, by

def backward_pass(inputs,outputs, parameters):
    gradients = {
        'dW_e': np.zeros_like(W_e),
        'dW_h': np.zeros_like(parameters['W_h']),
        'dW_y': np.zeros_like(parameters['W_y']),
        'dbh': np.zeros_like(parameters['bh']),
        'dby': np.zeros_like(parameters['by'])
    }
    dh_next = np.zeros_like(outputs['hs'][0])
    for t in reversed(range(len(inputs)-1)):
        dy = np.copy(outputs['ps'][t])  # Copy softmax probabilities
        true_label_index = inputs[t + 1] if t < len(inputs) - 1 else inputs[t]
        dy[true_label_index] -= 1

        temp_product = np.dot(dy, outputs['hs'][t].T)  # Results in shape (1448, 500)
        gradients['dW_y'] += temp_product.T

        gradients['dby'] += dy
        dh = np.dot(parameters['W_y'].T, dy) + dh_next  # Backprop into h
        dh_rec = (1 - outputs['hs'][t] * outputs['hs'][t]) * dh  # Backprop through tanh nonlinearity
        gradients['dbh'] += dh_rec
        gradients['dW_h'] += np.dot(dh_rec, outputs['hs'][t-1].T)
        gradients['dW_e'] += np.dot(dh_rec, outputs['es'][t].T)
        dh_next = np.dot(parameters['W_h'].T, dh_rec)

    for gradientKey in gradients:
        np.clip(gradients[gradientKey], -5, 5, out=gradients[gradientKey])  # Clip to mitigate exploding gradients
    return gradients


In [149]:
# Update weights
def update_parameters(parameters, gradients, learning_rate):
    # parameters and gradients are dictionaries with the same keys: 'W_e', 'W_h', 'W_y', 'bh', 'by'
    for key in parameters.keys():
        parameters[key] -= learning_rate * gradients['d' + key]

In [150]:

# And we have our parameters 'W_e', 'W_h', 'W_y', 'bh', 'by'
parameters = {
    'W_e': W_e,
    'W_h': W_h,
    'W_y': W_y,
    'bh': bh,
    'by': by
}
def initialize_gradients(parameters):
    gradients = {}
    for key in parameters.keys():
        gradients['d' + key] = np.zeros_like(parameters[key])
    return gradients

# Usage
gradients = initialize_gradients(parameters)
learning_rate = 0.01

In [151]:
# Training
def training(mini_batches,gradients,parameters,vocabulary,learning_rate):

  total_loss = 0
  correct_predictions = 0
  total_predictions = 0

  for batch_idx, sentence in enumerate(data_articles, start=1):
      indices = [word_to_index.get(word, word_to_index['<UNK>']) for word in sentence]  # Tokenize the sentence

      # Forward pass
      outputs = forward_pass(indices, word_embeddings, parameters)

      # Create true labels (next words) for this batch
      true_labels = indices[1:]  # Assuming the next word is the true label for each word in the sentence
      print("Passed forward")
      # Backward pass
      gradients = backward_pass(indices, outputs, parameters)
      print("passed gradient")

      # Update parameters
      update_parameters(parameters, gradients, learning_rate)

      # Calculate and print loss
      loss = cross_entropy(outputs['ps'], true_labels)
      total_loss += loss
      print(f"Batch {batch_idx}, Loss: {loss}")

      # Calculate accuracy for this batch
      predicted_words = [vocabulary_list[np.argmax(output)] for output in outputs['ps']]
      correct_predictions += sum([1 for predicted, true in zip(predicted_words, true_labels) if predicted == true])
      total_predictions += len(true_labels)

      # Calculate and print accuracy on the training data at specified intervals
      if batch_idx % 20 == 0:
          accuracy = correct_predictions / total_predictions
          print(f"Batch {batch_idx}, Training Accuracy: {accuracy:.4f}")

  # Print average loss and final training accuracy
  avg_loss = total_loss / len(data_articles)
  final_accuracy = correct_predictions / total_predictions
  print(f"Average Loss: {avg_loss:.4f}")
  print(f"Final Training Accuracy: {final_accuracy:.4f}")



In [152]:
training(data_articles, gradients, parameters, vocabulary, learning_rate)

Passed forward


ValueError: ignored