<a href="https://colab.research.google.com/github/shikha-aggarwal/nlp/blob/master/character_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# Thanks to Karpathy! https://gist.github.com/karpathy/d4dee566867f8291f086


"""
Minimal character-level Vanilla RNN model. Written by Andrej Karpathy (@karpathy)
BSD License
"""
import numpy as np
from bs4 import BeautifulSoup

from google.colab import drive
import os

In [1]:
# data I/O

drive.mount("/content/drive") 

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [78]:
# This directory is simply a dump of htmls using `wget -r http://www.paulgraham.com/articles.html`
data_path = "/content/drive/My Drive/Colab Notebooks/datasets/www.paulgraham.com/"

html_text = ""

for filename in os.listdir(data_path):
   with open(os.path.join(data_path, filename), 'r', encoding = 'cp1252') as f:
     html_text += f.read()

In [31]:
# from here: https://stackoverflow.com/questions/30565404/remove-all-style-scripts-and-html-tags-from-an-html-page/30565420

def cleanMe(html):
    soup = BeautifulSoup(html, "html.parser") # create a new bs4 object from the html data loaded
    for script in soup(["script", "style"]): # remove all javascript and stylesheet code
        script.extract()
    # get text
    text = soup.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text

In [38]:
plain_text = cleanMe(html_text)

In [47]:
data = plain_text # should be simple plain text file
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print ('data has %d characters, %d unique.' % (data_size, vocab_size))
char_to_ix = { ch:i for i,ch in enumerate(chars) }
ix_to_char = { i:ch for i,ch in enumerate(chars) }

data has 3236269 characters, 112 unique.


In [48]:
# hyperparameters

hidden_size = 100 # size of hidden layer of neurons
seq_length = 25 # number of steps to unroll the RNN for
learning_rate = 1e-1

In [49]:
# model parameters

Wxh = np.random.randn(hidden_size, vocab_size)*0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)*0.01 # hidden to hidden
Why = np.random.randn(vocab_size, hidden_size)*0.01 # hidden to output
bh = np.zeros((hidden_size, 1)) # hidden bias
by = np.zeros((vocab_size, 1)) # output bias

In [60]:
def lossFun(inputs, targets, hprev):
  """
  inputs,targets are both list of integers.
  hprev is Hx1 array of initial hidden state
  returns the loss, gradients on model parameters, and last hidden state
  """
  xs, hs, ys, ps = {}, {}, {}, {}
  hs[-1] = np.copy(hprev)
  loss = 0
  # forward pass
  for t in range(len(inputs)):
    xs[t] = np.zeros((vocab_size,1)) # encode in 1-of-k representation
    xs[t][inputs[t]] = 1
    hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state
    ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars
    ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t])) # probabilities for next chars
    loss += -np.log(ps[t][targets[t],0]) # softmax (cross-entropy loss)
  # backward pass: compute gradients going backwards
  dWxh, dWhh, dWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
  dbh, dby = np.zeros_like(bh), np.zeros_like(by)
  dhnext = np.zeros_like(hs[0])
  for t in reversed(range(len(inputs))):
    dy = np.copy(ps[t])
    dy[targets[t]] -= 1 # backprop into y. see http://cs231n.github.io/neural-networks-case-study/#grad if confused here
    dWhy += np.dot(dy, hs[t].T)
    dby += dy
    dh = np.dot(Why.T, dy) + dhnext # backprop into h
    dhraw = (1 - hs[t] * hs[t]) * dh # backprop through tanh nonlinearity
    dbh += dhraw
    dWxh += np.dot(dhraw, xs[t].T)
    dWhh += np.dot(dhraw, hs[t-1].T)
    dhnext = np.dot(Whh.T, dhraw)
  for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
    np.clip(dparam, -5, 5, out=dparam) # clip to mitigate exploding gradients
  return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

In [70]:
def sample(h, seed_ix, n, softmax_temp = 1):
  """ 
  sample a sequence of integers from the model 
  h is memory state, seed_ix is seed letter for first time step
  """
  x = np.zeros((vocab_size, 1))
  x[seed_ix] = 1
  ixes = []
  for t in range(n):
    h = np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
    y = np.dot(Why, h) + by
    y_temp = y / softmax_temp
    # p = np.exp(y) / np.sum(np.exp(y))
    p = np.exp(y_temp) / np.sum(np.exp(y_temp))
    ix = np.random.choice(range(vocab_size), p=p.ravel())
    x = np.zeros((vocab_size, 1))
    x[ix] = 1
    ixes.append(ix)
  return ixes

In [67]:
# Train

n, p = 0, 0
mWxh, mWhh, mWhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
mbh, mby = np.zeros_like(bh), np.zeros_like(by) # memory variables for Adagrad
smooth_loss = -np.log(1.0/vocab_size)*seq_length # loss at iteration 0
while True:
  # prepare inputs (we're sweeping from left to right in steps seq_length long)
  if p+seq_length+1 >= len(data) or n == 0: 
    print("\n\n **** NEW ITERATION!!! **** \n\n")
    hprev = np.zeros((hidden_size,1)) # reset RNN memory
    p = 0 # go from start of data
  inputs = [char_to_ix[ch] for ch in data[p:p+seq_length]]
  targets = [char_to_ix[ch] for ch in data[p+1:p+seq_length+1]]

  # sample from the model now and then
  if n % 10000 == 0:
    sample_ix = sample(hprev, inputs[0], 200)
    txt = ''.join(ix_to_char[ix] for ix in sample_ix)
    print ('----\n %s \n----' % (txt, ))

  # forward seq_length characters through the net and fetch gradient
  loss, dWxh, dWhh, dWhy, dbh, dby, hprev = lossFun(inputs, targets, hprev)
  smooth_loss = smooth_loss * 0.999 + loss * 0.001
  if n % 10000 == 0: print ('iter %d, loss: %f' % (n, smooth_loss)) # print progress
  
  # perform parameter update with Adagrad
  for param, dparam, mem in zip([Wxh, Whh, Why, bh, by], 
                                [dWxh, dWhh, dWhy, dbh, dby], 
                                [mWxh, mWhh, mWhy, mbh, mby]):
    mem += dparam * dparam
    param += -learning_rate * dparam / np.sqrt(mem + 1e-8) # adagrad update

  p += seq_length # move data pointer
  n += 1 # iteration counter 



 **** NEW ITERATION!!! **** 


----
 
uruvv^\@χ@{`}`}ε\η{@`η@@@ν@χ#ν~χ©~χν\@}η@χ~@~τεχ~νχ@ν\χη@~`τντ\ην@}ννε~{χε`}©{@ε}\τ@\@\{ττη\\©ητητ`~~νν@ν@ν~\χ@\`η{@ν`η{{τνεην`ν©ντε`{@\{`χε\χ`ν`η@`~~`@νη@`ετ~}χν#~χχ@@~~@νχ~τ}ε~η`\{η@@χ\@ν@`ν@χ@}ν@ν 
----
iter 0, loss: 118.124486
----
 t zeorpe a the a wrounteere
by
got dod in us colws wit it is aacity that is a wact Keartre to git toule astatipe, atnil rites the we't d its inge sean a rit
bit a itbeere mit tralsat you to to to the  
----
iter 10000, loss: 54.334161
----
 r, inct
not ance a te ide was adape mockl-ot thas slipst nondety, to eveas
wore shorn un.
Afcalk the forven,
bequpicis.Theuld cospalt A wercanfanch harly
a got't.The ancere on abe you're to hove endni 
----
iter 20000, loss: 51.003020
----
 w'rtisher be ile amy this it unders afo mere by'r you
bers.
It. Them.
If meonssers opetentife, pron's marthevermy'nmemicirssing Wherous cousith
in of mmecon't inkers that seale foun somen.
Chos.
In su 
----
iter 30000, loss: 50.516194
----
 

KeyboardInterrupt: ignored

In [73]:
sample_ix = sample(hprev, inputs[10], 2000, softmax_temp = 0.5)
txt = ''.join(ix_to_char[ix] for ix in sample_ix)
print ('----\n %s \n----' % (txt, ))

----
 eaders compertar in the people were companatent, you want do they're could be about they work that
and it's people some the of the and to be they work the founders and the the most suckele the probles.
At the because when the compation some are compract companedeliceted we really the cort of work for to be the they we had an a people the have out of the fick it are the some the startup the some and the way the some in a a startups have when and startups the were a good the startups the most of a startups fire they be the bicting and of the more for to he do I people on phorsting and for the what the Alsen sped a explest he have all more to don't people in may they're a most the an the find the were the compent is and all startup and porat that the fast to the have to companies for on the bugnares and to spent the for the on a use in a round langect to some to have the susen'd work people the to be a been in the the the have the startup the the the the find conventers the stead pr

In [77]:
print(data[0:113470])

Paul Graham
New:
Four Quadrants of Conformism |
Write |
Kids |
Bel
Want to start a startup? Get funded by Y Combinator.
© mmxx pg
User-agent: *
# Cap is html
Disallow: /cgi-bin/
Disallow: /RT/
Disallow: /TG/
Disallow: /QS/
Disallow: /wgl/
Disallow: /P/
Disallow: /ymix/
Disallow: /OS/
Disallow: /cs/
Disallow: /en=
Disallow: /io=
Disallow: /*/cgi-bin/
# Rover is a bad dog
User-agent: Roverbot
Disallow: /
Essays
If you're not sure which to read, try Life is Short, Do Things that Don't Scale, or
The Refragmentation.
The Four Quadrants of ConformismOrthodox PrivilegeCoronavirus and CredibilityHow to Write UsefullyBeing a NoobHatersThe Two Kinds of ModerateFashionable ProblemsHaving KidsThe Lesson to UnlearnNovelty and HeresyThe Bus Ticket Theory of GeniusGeneral and SurprisingCharisma / PowerThe Risk of DiscoveryHow to Make Pittsburgh a Startup HubLife is ShortEconomic InequalityThe RefragmentationJessica LivingstonA Way to Detect BiasWrite Like You TalkDefault Alive or Default Dead?Why It'

In [76]:
print(data.find("One of the most revealing ways to classify people is by the"))

113448
