<a href="https://colab.research.google.com/github/rodrigoromanguzman/Actividades_Aprendizaje-/blob/main/language_model_rnn_from_zero.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

# Utility libraries
import random
import re


from google.colab import drive


In [2]:
# Mount Google Drive at /content/drive; the corpus path used by this notebook
# lives under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the news corpus file on the mounted Drive (it is opened further
# down with ISO-8859-1 encoding and read line by line).
news_s_path = "/content/drive/MyDrive/newsSpace"

In [4]:
# Helper used to spot the URL token inside a tokenised line
def is_url(s):
    """Tell whether `s` starts with an ``http://`` or ``https://`` prefix.

    Only the leading scheme is inspected; the remainder of the string is not
    validated in any way.
    """
    scheme_prefix = re.match(r'https?://', s)
    if scheme_prefix is None:
        return False
    return True


In [5]:
# Each article (the tokens that follow the URL on a line) is kept as its own
# sequence so it can later be used as one batch.
data_articles = []
# Unique occurrences of every word, used to build the dictionary.
# '<UNK>' is reserved up front because sentence_to_indices falls back to
# word_to_index['<UNK>'] and raises KeyError when that entry does not exist.
vocabulary = {'<UNK>'}
with open(news_s_path, encoding='ISO-8859-1') as f:
    # Iterating over the file object streams it one line at a time
    for line in f:
        data_line = line.strip().split()
        url_index = next((i for i, item in enumerate(data_line) if is_url(item)), None)
        # Only lines that contain a URL are kept
        if url_index is not None:
            # The article is everything that comes after the URL token
            article = data_line[url_index + 1:]
            vocabulary.update(article)
            data_articles.append(article)
print(data_articles[:100])
print(list(vocabulary)[:200])


['requests:', 'galactico', 'Transgender', 'tips?\\', '&quot;Mahatma&quot;', 'Fagernes.', 'Governments', 'src="http://feeds.feedburner.com/~r/time/topstories/~4/221838897"', '04:32:40', "'kidnap'", 'http___wwwi.reuters.com_images_w148_amdf809543.jpg.gif', 'Argylls', 'one-on-one,', "medicine'", 'oxcarbazepine', 'assets', 'waives', 'five-man', 'http___www.repubblica.it_2006_09_sezioni_scuola_e_universita_servizi_test-universit-_interrogazione-universit-_sian_11113064_29500.jpg.gif', 'http___newsimg.bbc.co.uk_media_images_42395000_jpg__42395427_impersonator_ap6666.jpg.gif', 'http___im.rediff.com_movies_2006_sep_29krishna1.jpg.gif', "'Dexter'", 'Willia...', 'waterlogged', '&quot;cock', 'Married...She', '64G', '&#39;Package', 'Writting', '22:58:09', '16:30:42', 'Propeller', 'Brabeck-Letmathe', 'cabbage', 'Geneva.\\', 'Donna</i>', 'href="http://eu.mywayfinder.com/index.us.php">MyWayfinder.com</a>,', 'src="http://www.comics.com/creators/liberty/archive/images/liberty2610850061211.gif"', 'eways

In [6]:
# Splitting data into training and test
test_percentage = 0.2
# Shuffle with a dedicated, seeded generator so that a fresh
# "Restart & Run All" produces the same split; the global `random` state is
# left untouched. The list is still shuffled in place, as before.
split_seed = 42
random.Random(split_seed).shuffle(data_articles)
split_point = int(len(data_articles) * test_percentage)
# The first `split_point` articles are held out for testing, the rest train
training_set = data_articles[split_point:]
test_set = data_articles[:split_point]

In [7]:
# Hyper-parameters for the randomly initialised word embeddings
mean = 0
std_dev = 0.01
embedding_size = 150  # Hardcoded length of each embedding vector
hidden_size = 500  # Size of the hidden state vectors
vocab_size = len(vocabulary)

# One row per vocabulary entry, drawn from a normal distribution centred on
# `mean` with standard deviation `std_dev`
word_embeddings = np.random.normal(mean, std_dev, size=(vocab_size, embedding_size))

# Pair every word with its row of the embedding matrix, following the
# iteration order of `vocabulary`
word_to_embedding = dict(zip(vocabulary, word_embeddings))

In [None]:
# Initialize weights randomly.
# The shapes follow the products computed in forward_pass:
#   W_e @ embedding (length embedding_size) -> length hidden_size
#   W_h @ h_prev    (length hidden_size)    -> length hidden_size
#   W_y @ h         (length hidden_size)    -> length vocab_size, matching `by`
W_e = np.random.normal(mean, std_dev, size=(hidden_size, embedding_size))  # Embedding to hidden
W_h = np.random.normal(mean, std_dev, size=(hidden_size, hidden_size))  # Hidden to hidden
W_y = np.random.normal(mean, std_dev, size=(vocab_size, hidden_size))  # Hidden to output
bh = np.zeros((hidden_size, 1))  # Hidden bias
by = np.zeros((vocab_size, 1))  # Output bias


In [None]:
# Vocabulary in a fixed, reproducible order
vocabulary_list = sorted(vocabulary)  # Sorting to ensure consistent indexing

# Map every word to its position in the sorted list. Enumerating the raw set
# instead gives an arbitrary order that can differ from one kernel run to the
# next, which would make saved indices meaningless.
word_to_index = {word: i for i, word in enumerate(vocabulary_list)}

# Keep the word -> vector lookup aligned with the indices above, so that
# word_to_embedding[w] is the same row as word_embeddings[word_to_index[w]].
word_to_embedding = {word: word_embeddings[i] for word, i in word_to_index.items()}

In [None]:
def sentence_to_indices(sentence, word_to_index):
    """Convert a sentence into the list of its vocabulary indices.

    Parameters
    ----------
    sentence : str or iterable of str
        Either a whitespace-separated string or an already tokenised
        sequence of words.
    word_to_index : dict
        Mapping from word to index. If it contains a ``'<UNK>'`` entry, that
        index is used for every word missing from the mapping.

    Returns
    -------
    list
        One index per token, in order. Empty input gives an empty list.

    Raises
    ------
    KeyError
        If a word is not in `word_to_index` and there is no ``'<UNK>'`` entry
        to fall back on.
    """
    tokens = sentence.split() if isinstance(sentence, str) else list(sentence)
    # Fetch the fallback once and without indexing: `d.get(w, d['<UNK>'])`
    # evaluates d['<UNK>'] eagerly and fails even when every word is known.
    unk_index = word_to_index.get('<UNK>')
    indices = []
    for word in tokens:
        index = word_to_index.get(word, unk_index)
        if index is None:
            raise KeyError(f"{word!r} is not in the vocabulary and no '<UNK>' entry is defined")
        indices.append(index)
    return indices

In [None]:
def softmax(x):
    """Normalise scores into probabilities along the first axis.

    The maximum along axis 0 is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``np.exp`` from overflowing
    to ``inf`` (and the output from becoming ``nan``) on large scores.

    Parameters
    ----------
    x : array_like
        Scores. A 1-D vector, a column vector, or a 2-D array whose columns
        are normalised independently.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as `x` whose entries sum to 1 along axis 0.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty reduction
        return x
    shifted = x - np.max(x, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=0, keepdims=True)

In [None]:
def tahnh(x):
    """Apply the hyperbolic tangent element-wise and return the result."""
    activated = np.tanh(x)
    return activated

In [None]:
# Input the predicted probabilities and the true distribution
def cross_entropy(y_probs, y_true):
    """Cross-entropy between predicted probabilities and the true distribution.

    Parameters
    ----------
    y_probs : array_like
        Predicted probabilities.
    y_true : array_like
        True distribution (for example a one-hot vector), broadcastable
        against `y_probs`.

    Returns
    -------
    float
        ``-sum(y_true * log(y_probs))``.
    """
    # A probability of exactly 0 would give log(0) = -inf, and 0 * -inf = nan
    # for the entries where y_true is 0. A tiny floor keeps the loss finite.
    floor = 1e-12
    safe_probs = np.maximum(np.asarray(y_probs, dtype=float), floor)
    return -np.sum(np.asarray(y_true) * np.log(safe_probs))

In [None]:
# Input: the sentence as its index representation
# parameters: dictionary with keys -> W_e, W_h, W_y, bh, by
def forward_pass(indices, word_embeddings, parameters):
    """Run the recurrent network over a sequence of word indices.

    Parameters
    ----------
    indices : sequence of int
        Row indices into `word_embeddings`, one per time step.
    word_embeddings : numpy.ndarray
        Embedding matrix; row ``i`` is the vector of word ``i``.
    parameters : dict
        Holds 'W_e', 'W_h', 'W_y', 'bh' and 'by'.

    Returns
    -------
    dict
        Four dictionaries keyed by time step: 'es' (input embeddings as
        column vectors), 'hs' (hidden states, with the initial state stored
        under key -1), 'ys' (unnormalised scores) and 'ps' (softmax outputs).
    """
    # Take the hidden size from the recurrent matrix itself so the function
    # does not rely on a notebook-level global.
    hidden_units = parameters['W_h'].shape[0]
    h_prev = np.zeros((hidden_units, 1))  # Initial hidden state

    states = {'es': {}, 'hs': {}, 'ys': {}, 'ps': {}}
    states['hs'][-1] = np.copy(h_prev)

    for t, word_index in enumerate(indices):
        # Store the embedding as a column vector: adding a 1-D product to the
        # (hidden, 1) terms would broadcast the state to (hidden, hidden).
        states['es'][t] = np.reshape(word_embeddings[word_index], (-1, 1))
        h_rec = (
            np.dot(parameters['W_h'], h_prev)
            + np.dot(parameters['W_e'], states['es'][t])
            + parameters['bh']
        )
        states['hs'][t] = tahnh(h_rec)  # Hidden state
        # Unnormalised scores for the next word
        states['ys'][t] = np.dot(parameters['W_y'], states['hs'][t]) + parameters['by']
        states['ps'][t] = softmax(states['ys'][t])  # Probabilities for the next word
        h_prev = states['hs'][t]  # Carried over to the next time step
    return states

In [None]:
# Backpropagation
# inputs -> word indices that were fed to forward_pass
# outputs -> dictionary of dictionaries: es, hs, ys, ps
# parameters -> dictionary with: W_e, W_h, W_y, bh, by

def backward_pass(inputs, outputs, parameters, targets=None):
    """Backpropagate through time and return the clipped gradients.

    Parameters
    ----------
    inputs : sequence of int
        Word indices of the sequence, one per time step.
    outputs : dict
        Result of the forward pass: 'es', 'hs' (with the initial state under
        key -1), 'ys' and 'ps', each keyed by time step.
    parameters : dict
        Holds 'W_e', 'W_h', 'W_y', 'bh' and 'by'.
    targets : sequence of int, optional
        Index of the correct word at each step. When omitted, the target of
        step ``t`` is ``inputs[t + 1]`` (next-word prediction), so the last
        step has no target and contributes no output error.

    Returns
    -------
    dict
        'dW_e', 'dW_h', 'dW_y', 'dbh', 'dby', each shaped like its parameter
        and clipped to [-5, 5]. The embedding matrix itself gets no gradient.
    """
    gradients = {
        'd' + name: np.zeros_like(parameters[name])
        for name in ('W_e', 'W_h', 'W_y', 'bh', 'by')
    }
    if targets is None:
        targets = list(inputs[1:])

    # Error flowing back from step t+1; shaped like the hidden bias so it is
    # well defined even for an empty sequence.
    dh_next = np.zeros_like(parameters['bh'])
    for t in reversed(range(len(inputs))):
        if t < len(targets):
            # Softmax + cross-entropy: gradient is p - one_hot(target)
            dy = np.array(outputs['ps'][t], dtype=float).reshape(-1, 1)
            dy[targets[t]] -= 1
        else:
            # No target for this step: only the recurrent error passes through
            dy = np.zeros_like(parameters['by'])
        hidden = outputs['hs'][t]
        gradients['dW_y'] += np.dot(dy, hidden.T)
        gradients['dby'] += dy
        dh = np.dot(parameters['W_y'].T, dy) + dh_next  # Backprop into h
        dh_rec = (1 - hidden * hidden) * dh  # Backprop through the tanh nonlinearity
        gradients['dbh'] += dh_rec
        gradients['dW_h'] += np.dot(dh_rec, outputs['hs'][t - 1].T)
        # Force a column vector so the outer product has W_e's shape
        embedding = np.reshape(outputs['es'][t], (-1, 1))
        gradients['dW_e'] += np.dot(dh_rec, embedding.T)
        dh_next = np.dot(parameters['W_h'].T, dh_rec)

    for gradient_key in gradients:
        # Clip in place to mitigate exploding gradients
        np.clip(gradients[gradient_key], -5, 5, out=gradients[gradient_key])
    return gradients


In [None]:
# Gradient-descent step applied to every parameter
def update_parameters(parameters, gradients, learning_rate):
    """Subtract the scaled gradient from each parameter, updating it in place.

    `parameters` uses the keys 'W_e', 'W_h', 'W_y', 'bh', 'by'; the gradient
    of a parameter is looked up in `gradients` under the same key prefixed
    with 'd'. Nothing is returned.
    """
    for name in parameters:
        step = learning_rate * gradients['d' + name]
        parameters[name] -= step

In [None]:
# The model parameters: 'W_e', 'W_h', 'W_y', 'bh', 'by'
parameters = {
    'W_e': W_e,
    'W_h': W_h,
    'W_y': W_y,
    'bh': bh,
    'by': by
}

# Gradient accumulators, one per parameter, stored under the parameter's key
# prefixed with 'd' ('dW_e', 'dW_h', 'dW_y', 'dbh', 'dby'). They start at zero
# because no backward pass has run yet; the names dW_e, dW_h, ... are not
# defined anywhere before this cell.
gradients = {'d' + name: np.zeros_like(value) for name, value in parameters.items()}


In [None]:
# Training
def training(mini_batches, gradients, parameters, vocabulary, learning_rate):
    """Run one pass of gradient descent over the given batches.

    Parameters
    ----------
    mini_batches : iterable
        Each batch is one article, either as a list of tokens or as a
        whitespace-separated string.
    gradients : dict
        Not read; it is replaced by the result of every backward pass.
    parameters : dict
        Holds 'W_e', 'W_h', 'W_y', 'bh', 'by'; updated in place.
    vocabulary : collection
        Not used by this function.
    learning_rate : float
        Step size passed to update_parameters.

    Returns
    -------
    list of float
        Mean next-word negative log-likelihood of every processed batch,
        measured before that batch's parameter update.
    """
    losses = []
    for batch in mini_batches:
        # word_to_index is a dict, not a function: the conversion goes through
        # sentence_to_indices, which takes a whitespace-separated string.
        sentence = batch if isinstance(batch, str) else ' '.join(batch)
        indices = sentence_to_indices(sentence, word_to_index)
        if len(indices) < 2:
            # A single word has no next word to predict
            continue

        outputs = forward_pass(indices, word_embeddings, parameters)
        gradients = backward_pass(indices, outputs, parameters)
        update_parameters(parameters, gradients, learning_rate)

        # Computing Loss: the target of step t is the word at step t + 1
        batch_loss = 0.0
        for t in range(len(indices) - 1):
            step_probs = np.ravel(outputs['ps'][t])
            # Floor the probability so log never sees an exact zero
            batch_loss -= float(np.log(max(step_probs[indices[t + 1]], 1e-12)))
        losses.append(batch_loss / (len(indices) - 1))
    return losses

