In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import re
!pip install torch #installing the module
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from pprint import pprint



In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
device

device(type='cuda')

In [5]:
# folder_path = r'./Taylor-Swift-Songs'

# # Loop through all files in the folder
# songs = []

# for file_name in os.listdir(folder_path):
#     if file_name.endswith('.txt'):  # Check if the file is a .txt file
#         file_path = os.path.join(folder_path, file_name)
#         # Open and read the file

#         with open(file_path, 'r', encoding='utf-8') as file:

#             file_content = file.read()
#             songs.append(file_content)


In [6]:
import requests

# URL of the text file
url_2 = "https://www.gutenberg.org/files/1661/1661-0.txt"

# Send a GET request to fetch the content
response = requests.get(url_2)

# Check if the request was successful
if response.status_code == 200:
    # Read the entire content of the file
    content = response.text

    # Remove the BOM if it exists
    content = content.replace('\ufeff', '')  # Remove BOM character

    # Split the content into paragraphs by using '\r\n\r\n'
    paragraphs = content.split('\r\n\r\n')

    # Initialize para list
    para = []

    # Store paragraphs in the para list
    for paragraph in paragraphs:
        cleaned_paragraph = paragraph.strip()
        if cleaned_paragraph:  # Only add non-empty paragraphs
            para.append(cleaned_paragraph)

    # Print the length of the para list
    print(f"\nLength of para list: {len(para)}")

    # # Optional: Print the first few paragraphs to verify
    # for i in range(min(5, len(para))):  # Print the first 5 paragraphs
    #     print(f"\nParagraph {i + 1}:\n{para[i]}\n")
else:
    print(f"Failed to fetch the file. Status code: {response.status_code}")


Length of para list: 2605


In [7]:
para[0]

'The Project Gutenberg eBook of The Adventures of Sherlock Holmes,\r\nby Arthur Conan Doyle'

In [8]:
# Removing the first line in the song
# Replacing the '\n' with '<NEWLINE> token'

unique_words = set()
for i in range(len(para)):
    lines = para[i].splitlines()
    for j in range(len(lines)):
        # print(j,lines[j])
        lines[j] = re.sub(r'[^a-zA-Z0-9 \.]', '', lines[j])
        lines[j] = lines[j].lower()
    para[i] = " ".join(lines[1:])
    song_unique_words = set(word.lower() for word in para[i].split())
    unique_words.update(song_unique_words)

    # print(len(song_unique_words))
    unique_words.update(".")


In [9]:

sorted_unique_words = sorted(unique_words)
Vocabulary = {word : index for index, word in enumerate(sorted_unique_words)}
iVocabulary = {index: word for index, word in enumerate(sorted_unique_words)}
len(Vocabulary)

9358

In [10]:
# Open a new file in write mode
with open("vocabulary.py", "w") as f:
    # Write Vocabulary dictionary to the file
    f.write("Vocabulary = {\n")
    for word, index in Vocabulary.items():
        f.write(f"    {repr(word)}: {index},\n")
    f.write("}\n\n")

    # Write iVocabulary dictionary to the file
    f.write("iVocabulary = {\n")
    for index, word in iVocabulary.items():
        f.write(f"    {index}: {repr(word)},\n")
    f.write("}\n")

In [11]:
len(para)

2605

In [12]:
para[1]

'most other parts of the world at no cost and with almost no restrictions whatsoever. you may copy it give it away or reuse it under the terms of the project gutenberg license included with this ebook or online at www.gutenberg.org. if you are not located in the united states you will have to check the laws of the country where you are located before using this ebook.'

In [13]:
def generate_X_Y(para,block_size):

   # context length: how many characters do we take to predict the next one?
  X, Y = [], []

  for paragraph in para:

    # print(song)
    context = [0] * block_size
    p = paragraph.split(' ')
    for word in p + ['.']:
      if word == '':
        continue
      # print(word)
      ix = Vocabulary[word]
      X.append(context)
      Y.append(ix)
      # print(' '.join('.' if not i else iVocabulary[i] for i in context), '--->', iVocabulary[ix])
      context = context[1:] + [ix] # crop and append

  # Move data to GPU

  X = torch.tensor(X).to(device)
  Y = torch.tensor(Y).to(device)

  return X,Y

In [14]:
emb_dim = 128
emb = torch.nn.Embedding(len(Vocabulary), emb_dim)

In [15]:
emb.weight

Parameter containing:
tensor([[-0.3549, -0.2116, -0.7733,  ..., -1.7358,  0.1480,  1.4034],
        [-2.5987,  0.3483, -1.0000,  ..., -0.2677, -0.5731, -0.5343],
        [ 1.2250,  0.9019,  0.3132,  ..., -1.3526,  0.2582,  0.8559],
        ...,
        [ 1.8730, -0.9429, -1.1049,  ..., -0.6900, -1.9266, -1.0718],
        [ 0.1776,  0.9150,  0.2219,  ..., -0.9955,  1.3951, -0.2002],
        [-0.8941,  1.2910,  0.0426,  ...,  0.8050, -1.5540,  1.0180]],
       requires_grad=True)

In [16]:
emb.weight.shape

torch.Size([9358, 128])

In [17]:
class NextWord(nn.Module):
    def __init__(self,block_size,vocab_size,emb_dim,hidden_size):
        super().__init__()
        self.emb = nn.Embedding(vocab_size,emb_dim)
        self.lin1 = nn.Linear(block_size*emb_dim,hidden_size)
        self.lin2 = nn.Linear(hidden_size,vocab_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.emb(x)
        x = x.view(x.shape[0],-1)
        x = self.relu(self.lin1(x))
        x = self.lin2(x)
        return x

In [18]:
block_size = 5
vocab_size = len(Vocabulary)
emb_dim = 128
hidden_size = 1024

In [19]:

def generate_para(model,Vocabulary,block_size,max_len=30):
    context = [0]*block_size
    new_para  = ''
    for i in range(max_len):
        x = torch.tensor(context).view(1,-1).to(device)
        y_pred = model(x)
        ix = torch.distributions.categorical.Categorical(logits=y_pred).sample().item()
        word = iVocabulary[ix]
        if word == '.':
            break
        new_para = new_para + " "+word
        context =context[1:]+[ix]
    return new_para



In [20]:
def models(block_size,vocab_size,emb_dim,hidden_size):
    model = NextWord(block_size,vocab_size,emb_dim,hidden_size).to(device)
    return model

# model = torch.compile(model)



In [21]:
# allmodels = {}
# for emdDim in [64,128]:
#   for context_len in [4,8,10,45]:
#     m = models(context_len,len(Vocabulary),emdDim,1024,para)
#     allmodels[str(context_len)+"_"+str(emdDim)] = m



model = models(block_size,len(Vocabulary),emb_dim,hidden_size)

In [22]:
import time
# for model_key, model_z in allmodels.items():
# block_size, emb_dim = map(int, model_key.split('_'))
X,Y = generate_X_Y(para,block_size)
loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.AdamW(model.parameters(), lr=0.01)
# Mini-batch training
batch_size = 512
print_every = 100
elapsed_time = []
# print(block_size," ",emb_dim)
epochs = 1000
for epoch in range(epochs):
    start_time = time.time()
    for i in range(0, X.shape[0], batch_size):
        x = X[i:i+batch_size]
        y = Y[i:i+batch_size]
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        loss.backward()
        opt.step()
        opt.zero_grad()
    end_time = time.time()
    elapsed_time.append(end_time - start_time)
    if epoch % print_every == 0:
        print(epoch, loss.item())


0 9.209501266479492
100 0.34785258769989014
200 0.39880719780921936
300 0.565632164478302
400 0.4990323483943939
500 0.36668020486831665
600 0.3691253364086151
700 0.48436957597732544
800 0.5268037915229797
900 0.3459698557853699


In [23]:
# Path to save the model in your Drive
model_path = f'/content/drive/MyDrive/model_{emb_dim}_{block_size}_relu.pth'

# Save the state dictionary
torch.save(model.state_dict(), model_path)

print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/model_128_5_relu.pth


In [24]:
loaded_model = NextWord(block_size, vocab_size, emb_dim, hidden_size).to(device)

# Load the state dictionary from the file in Google Drive
# model_path = '/content/drive/MyDrive/model_64_10.pth'
loaded_model.load_state_dict(torch.load(model_path, map_location=device))

  loaded_model.load_state_dict(torch.load(model_path, map_location=device))


<All keys matched successfully>

In [25]:
print(generate_para(model,Vocabulary,block_size))

 caught the last two syllables. he was trying to utter the name of his murderer. so and so of ballarat.


In [26]:
print(generate_para(loaded_model,Vocabulary,block_size))

 house with him. i had rushed forward her child and as you suggest.
