<a href="https://colab.research.google.com/github/rodrigoromanguzman/Actividades_Aprendizaje-/blob/main/rnn_from_zero_dec_21.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import random

In [2]:
from google.colab import drive

In [3]:
news_s_path = "/content/drive/MyDrive/newsSpace"

<h2>Data Preprocessing Functions</h2>

In [4]:
import re
def is_url(s):
    # A simple regex to check for a basic URL structure
    return re.match(r'https?://', s) is not None
def tokenize_article(line):
  url_index = next((i for i, item in enumerate(line) if is_url(item)), None)
  if url_index is not None:
    return re.split(r'[ ,.;:!?()]+', ' '.join(line[url_index+1:]))
  return None


def process_file(filepath, num_articles):
  articles = []
  vocabulary = set()
  pattern = re.compile(r'[ ,.;:!?()]+')
  word_pattern = re.compile(r"\b[A-Za-z]+'?[A-Za-z]*(?=\s|\b)")
  try:
    with open(filepath, encoding='ISO-8859-1') as file:
      data = file.read()
      pattern = re.compile(r"\((Reuters|AP)\)[\t\n]+(.*?)[\t\n]+\d+[\t\n]+[0-9]{4}-[0-9]{2}-[0-9]{2}", re.DOTALL)
      raw_articles = pattern.findall(data)
      print("amount of articles")
      print(len(raw_articles))

      for article in raw_articles:
        if len(articles) < num_articles:
          article_text = article[1].strip()
          # Cleaning and processing the article text
          words = word_pattern.findall(article_text.lower())
          # cleaned_article = ' '.join(words)
          articles.append(words)
          # Update vocabulary
          vocabulary.update(words)
  except IOError:
    print("Error opening or reading the file")
    return [], set()
  return articles, vocabulary



<h2>Data Loading</h2>


In [5]:
num_articles = 10
news_s_path = "/content/drive/MyDrive/newsSpace"

data_articles, vocabulary = process_file(news_s_path, num_articles)
print(data_articles)

amount of articles
57469
[['none', 'business', 'reuters', 'wall', "street's", 'long', 'playing', 'drama', 'waiting', 'for', 'google', 'is', 'about', 'to', 'reach', 'its', 'final', 'act', 'but', 'its', 'stock', 'market', 'debut', 'is', 'ending', 'up', 'as', 'more', 'of', 'a', 'nostalgia', 'event', 'than', 'the', 'catalyst', 'for', 'a', 'new', 'era'], ['none', 'business', 'reuters', 'short', 'sellers', 'wall', "street's", 'dwindling', 'band', 'of', 'ultra', 'cynics', 'are', 'seeing', 'green', 'again'], ['none', 'business', 'reuters', 'private', 'investment', 'firm', 'carlyle', 'group', 'which', 'has', 'a', 'reputation', 'for', 'making', 'well', 'timed', 'and', 'occasionally', 'controversial', 'plays', 'in', 'the', 'defense', 'industry', 'has', 'quietly', 'placed', 'its', 'bets', 'on', 'another', 'part', 'of', 'the', 'market'], ['none', 'business', 'reuters', 'soaring', 'crude', 'prices', 'plus', 'worries', 'about', 'the', 'economy', 'and', 'the', 'outlook', 'for', 'earnings', 'are', 'exp

In [6]:
data_articles

[['none',
  'business',
  'reuters',
  'wall',
  "street's",
  'long',
  'playing',
  'drama',
  'waiting',
  'for',
  'google',
  'is',
  'about',
  'to',
  'reach',
  'its',
  'final',
  'act',
  'but',
  'its',
  'stock',
  'market',
  'debut',
  'is',
  'ending',
  'up',
  'as',
  'more',
  'of',
  'a',
  'nostalgia',
  'event',
  'than',
  'the',
  'catalyst',
  'for',
  'a',
  'new',
  'era'],
 ['none',
  'business',
  'reuters',
  'short',
  'sellers',
  'wall',
  "street's",
  'dwindling',
  'band',
  'of',
  'ultra',
  'cynics',
  'are',
  'seeing',
  'green',
  'again'],
 ['none',
  'business',
  'reuters',
  'private',
  'investment',
  'firm',
  'carlyle',
  'group',
  'which',
  'has',
  'a',
  'reputation',
  'for',
  'making',
  'well',
  'timed',
  'and',
  'occasionally',
  'controversial',
  'plays',
  'in',
  'the',
  'defense',
  'industry',
  'has',
  'quietly',
  'placed',
  'its',
  'bets',
  'on',
  'another',
  'part',
  'of',
  'the',
  'market'],
 ['none',
  

<h3>Split data into train and test</h3>
<p>Split data into training and testing sets, and further into input and target pairs where the target is the next word</p>

In [7]:

# Preprocess articles to create input-target pairs
def create_input_target(articles):
  article_targets = []
  for article in articles:
    target_article = []
    for i in range(len(article) - 1):
      target_word = article[i + 1]
      target_article.append(target_word)
    article_targets.append(target_article)
  return article_targets

# Split data into training and test sets
def split_data(data, test_percentage):
  split_point = int(len(data) * test_percentage)
  test_set = data[:split_point]
  training_set = data[split_point:]
  return training_set, test_set


# Example usage
test_percentage = 0.2

# Create input-target pairs
article_targets  = create_input_target(data_articles)
print("Input target pairs")
print(article_targets)

# Vocabulary
word_to_idx = {ch:i for (i,ch) in enumerate(list(vocabulary))}
idx_to_word = {i:ch for (i,ch) in enumerate(list(vocabulary))}

# Take input-target as x and y
x_train_words, x_test_words = split_data(data_articles,test_percentage)
y_train_words, y_test_words = split_data(article_targets,test_percentage)

# Change the data to their index versions
x_train = [[word_to_idx[word] for word in article if word in word_to_idx] for article in x_train_words]
y_train = [[word_to_idx[word] for word in article if word in word_to_idx] for article in y_train_words]
x_test = [[word_to_idx[word] for word in article if word in word_to_idx] for article in x_test_words]
y_test = [[word_to_idx[word] for word in article if word in word_to_idx] for article in y_test_words]


Input target pairs
[['business', 'reuters', 'wall', "street's", 'long', 'playing', 'drama', 'waiting', 'for', 'google', 'is', 'about', 'to', 'reach', 'its', 'final', 'act', 'but', 'its', 'stock', 'market', 'debut', 'is', 'ending', 'up', 'as', 'more', 'of', 'a', 'nostalgia', 'event', 'than', 'the', 'catalyst', 'for', 'a', 'new', 'era'], ['business', 'reuters', 'short', 'sellers', 'wall', "street's", 'dwindling', 'band', 'of', 'ultra', 'cynics', 'are', 'seeing', 'green', 'again'], ['business', 'reuters', 'private', 'investment', 'firm', 'carlyle', 'group', 'which', 'has', 'a', 'reputation', 'for', 'making', 'well', 'timed', 'and', 'occasionally', 'controversial', 'plays', 'in', 'the', 'defense', 'industry', 'has', 'quietly', 'placed', 'its', 'bets', 'on', 'another', 'part', 'of', 'the', 'market'], ['business', 'reuters', 'soaring', 'crude', 'prices', 'plus', 'worries', 'about', 'the', 'economy', 'and', 'the', 'outlook', 'for', 'earnings', 'are', 'expected', 'to', 'hang', 'over', 'the', '

In [8]:
print(x_train_words[:100])
print(y_train_words[:100])

[['none', 'business', 'reuters', 'private', 'investment', 'firm', 'carlyle', 'group', 'which', 'has', 'a', 'reputation', 'for', 'making', 'well', 'timed', 'and', 'occasionally', 'controversial', 'plays', 'in', 'the', 'defense', 'industry', 'has', 'quietly', 'placed', 'its', 'bets', 'on', 'another', 'part', 'of', 'the', 'market'], ['none', 'business', 'reuters', 'soaring', 'crude', 'prices', 'plus', 'worries', 'about', 'the', 'economy', 'and', 'the', 'outlook', 'for', 'earnings', 'are', 'expected', 'to', 'hang', 'over', 'the', 'stock', 'market', 'next', 'week', 'during', 'the', 'depth', 'of', 'the', 'summer', 'doldrums'], ['none', 'business', 'reuters', 'authorities', 'have', 'halted', 'oil', 'export', 'flows', 'from', 'the', 'main', 'pipeline', 'in', 'southern', 'iraq', 'after', 'intelligence', 'showed', 'a', 'rebel', 'militia', 'could', 'strike', 'infrastructure', 'an', 'oil', 'official', 'said', 'on', 'saturday'], ['none', 'business', 'reuters', 'stocks', 'ended', 'slightly', 'higher

In [9]:
print(x_train[0])
print(y_train[0])

[9, 144, 24, 71, 34, 27, 142, 134, 40, 20, 175, 192, 173, 22, 96, 206, 35, 195, 141, 107, 105, 123, 212, 90, 20, 154, 160, 112, 191, 74, 164, 120, 136, 123, 207]
[144, 24, 71, 34, 27, 142, 134, 40, 20, 175, 192, 173, 22, 96, 206, 35, 195, 141, 107, 105, 123, 212, 90, 20, 154, 160, 112, 191, 74, 164, 120, 136, 123, 207]


<h3>The RNN Model</h3>

In [10]:
class RNN:
  def __init__(self, hidden_size,vocab_size,learning_rate):
    self.hidden_size = hidden_size
    self.vocab_size = vocab_size
    # self.embedding_size = embedding_size
    self.learning_rate = learning_rate

    # Model parameters
    self.W_e = np.random.uniform(-np.sqrt(1./vocab_size), np.sqrt(1./vocab_size), (hidden_size, vocab_size))
    self.W_y = np.random.uniform(-np.sqrt(1./hidden_size), np.sqrt(1./hidden_size), (vocab_size, hidden_size))
    self.W_h = np.random.uniform(-np.sqrt(1./hidden_size), np.sqrt(1./hidden_size), (hidden_size, hidden_size))
    self.bh = np.zeros((hidden_size, 1)) # bias for hidden layer
    self.by = np.zeros((vocab_size, 1)) # bias for output

  def softmax(self,x):
    shift_x = x - np.max(x)
    exp_shift_x = np.exp(shift_x)
    softmax_output = exp_shift_x / np.sum(exp_shift_x)
    return softmax_output

  def cross_entropy(self, probs,targets):
    loss = 0
    epsilon = 1e-9  # Small constant for numerical stability
    for t in range(len(targets)):
        # Ensuring the probability is not zero or negative
        prob = max(probs[t][targets[t]][0], epsilon)
        loss += -np.log(prob)
    return loss


  def forward(self,inputs,hprev):
    es,hs,ys = {},{},{}
    ps = {i: 0 for i in range(self.vocab_size)}
    hs[-1] = np.copy(hprev)
    for t in range(len(inputs)):
      es[t] = np.zeros((self.vocab_size,1))
      es[t][inputs[t]] = 1 # one hot encoding , 1-of-k
      hs[t] = np.tanh(np.dot(self.W_e,es[t]) + np.dot(self.W_h,hs[t-1]) + self.bh) # hidden state
      ps[t] = np.dot(self.W_y,hs[t]) + self.by # unnormalised log probs for next char
      ys[t] = self.softmax(ps[t])
    return es,hs,ps,ys

  def backward(self,es,hs,ps,targets):
      dW_e, dW_h, dW_y =  np.zeros_like(self.W_e),np.zeros_like(self.W_h),np.zeros_like(self.W_y)
      dbh, dby =  np.zeros_like(self.bh),np.zeros_like(self.by)
      dh_next = np.zeros_like(hs[0])
      for t in reversed(range(len(targets))):
        dy = np.copy(ps[t])
        # print("length of dy")
        # print(len(dy))
        # print("vocabulary")
        # print(len(vocabulary))
        # Softmax
        dy[targets[t]] -= 1
        # Compute gradients
        dW_y += np.dot(dy,hs[t].T)
        dby += dy
        dh = np.dot(self.W_y.T, dy) + dh_next
        dh_rec = (1 - hs[t] * hs[t]) * dh
        dbh += dh_rec
        dW_e += np.dot(dh_rec, es[t].T)
        dW_h += np.dot(dh_rec, hs[t-1].T)
        dh_next = np.dot(self.W_h, dh_rec)
      return dW_e, dW_h, dW_y, dh, dy

  def update_parameters(self, dW_e, dW_h, dW_y, dbh, dby):
    self.W_e -= self.learning_rate * dW_e
    self.W_h -= self.learning_rate * dW_h
    self.W_y -= self.learning_rate * dW_y
    self.bh -= self.learning_rate * dbh
    self.by -= self.learning_rate * dby

  # def sample(self, h, seed_ix, n):
  #   """
  #   sample a sequence of integers from the model
  #   h is memory state, seed_ix is seed letter from the first time step
  #   """
  #   x = zero_init(self.vocab_size, 1)
  #   x[seed_ix] = 1
  #   ixes = []
  #   for t in range(n):
  #     h = np.tanh(np.dot(self.W_e, x) + np.dot(self.W_h, h) + self.bh)
  #     y = np.dot(self.W_y, h) + self.by
  #     p = np.exp(y)/np.sum(np.exp(y))
  #     ix = np.random.choice(range(self.vocab_size), p = p.ravel())
  #     x = zero_init(self.vocab_size,1)
  #     x[ix] = 1
  #     ixes.append(ix)
  #   return ixes

  def train(self,x_train,y_train,epochs):
    for epoch in range(epochs):

      # print("x_train")
      # print(x_train[:100])
      print("epoch ------> ", epoch)
      rand_print =  random.randint(1, len(x_train)-1)
      for batch_idx, inputs in enumerate(x_train):
        total_loss = 0
        correct_predictions = 0
        total_predictions = 0
        if batch_idx==0:
          h_prev = np.zeros((self.hidden_size,1))
        targets = y_train[batch_idx]
        es,hs,ps,ys = self.forward(inputs, h_prev)
        dW_e, dW_h, dW_y, dh, dy = self.backward(es, hs, ps, targets)
        loss = self.cross_entropy(ps, targets)

        self.update_parameters(dW_e, dW_h, dW_y, dh, dy)
        h_prev = hs[len(inputs)-1]
        for t in range(len(targets)):
          predicted_index = np.argmax(ys[t])
          # print("predicted index")
          # print(predicted_index)
          correct_predictions += (predicted_index == targets[t])
          total_predictions += 1
        if(batch_idx == rand_print):
          print("the input sentence")
          print([idx_to_word[i] for i in targets])
          print("the predicted")
          print([idx_to_word[np.argmax(ys[i])] for i in range(len(targets))])
          print("correct preds")
          print(correct_predictions)
          accuracy = correct_predictions/total_predictions
          print("Accuracy -> ", accuracy)
          print("Loss -> ", loss)
          # print( "\n Accuracy :%d, loss:%f"%(accuracy, loss))


In [11]:
print(len(vocabulary))

216


In [13]:
epochs = 800

vocabulary_size = len(vocabulary)
embedding_size = 400
hidden_size = 1000
#  def __init__(self, hidden_size,vocab_size,embedding_size,learning_rate):
#rnn = RNN(hidden_size=hidden_size, vocab_size=vocabulary_size,learning_rate=0.0001)

rnn = RNN(hidden_size=hidden_size, vocab_size=vocabulary_size,learning_rate=0.01)
rnn.train(x_train,y_train,epochs)

epoch ------>  0
the input sentence
['sci', 'tech', 'reuters', 'a', 'group', 'of', 'technology', 'companies', 'including', 'texas', 'instruments', 'inc', 'txn', 'n', 'stmicroelectronics', 'stm', 'pa', 'and', 'broadcom', 'corp', 'brcm', 'o', 'on', 'thursday', 'said', 'they', 'will', 'propose', 'a', 'new', 'wireless', 'networking', 'standard', 'up', 'to', 'times', 'the', 'speed', 'of', 'the', 'current', 'generation']
the predicted
['business', 'propose', 'summer', 'depth', 'little', 'and', 'the', 'drama', 'band', 'reuters', 'sci', 'sci', 'office', 'think', 'business', 'by', 'football', 'business', 'business', 'minority', 'fans', 'sign', 'stocks', 'america', 'tech', 'era', 'halted', 'pa', 'authorities', 'sick', 'and', 'business', 'broadcom', 'plus', 'business', 'week', 'said', 'football', 'company', 'the', 'band', 'business']
correct preds
1
Accuracy ->  0.023809523809523808
Loss ->  449.719534971831
epoch ------>  1
the input sentence
['sci', 'tech', 'reuters', 'a', 'group', 'of', 'techn

KeyboardInterrupt: ignored