In [1]:
import os
import re
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from pprint import pprint

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
device

device(type='cpu')

In [5]:
# folder_path = r'./Taylor-Swift-Songs'

# # Loop through all files in the folder
# songs = []

# for file_name in os.listdir(folder_path):
#     if file_name.endswith('.txt'):  # Check if the file is a .txt file
#         file_path = os.path.join(folder_path, file_name)
#         # Open and read the file
        
#         with open(file_path, 'r', encoding='utf-8') as file:
            
#             file_content = file.read()
#             songs.append(file_content)


In [26]:
import requests

# URL of the text file
url_2 = "https://www.gutenberg.org/files/1661/1661-0.txt"

# Send a GET request to fetch the content
response = requests.get(url_2)

# Check if the request was successful
if response.status_code == 200:
    # Read the entire content of the file
    content = response.text
    
    # Remove the BOM if it exists
    content = content.replace('\ufeff', '')  # Remove BOM character

    # Split the content into paragraphs by using '\r\n\r\n'
    paragraphs = content.split('\r\n\r\n')

    # Initialize para list
    para = []
    
    # Store paragraphs in the para list
    for paragraph in paragraphs:
        cleaned_paragraph = paragraph.strip()
        if cleaned_paragraph:  # Only add non-empty paragraphs
            para.append(cleaned_paragraph)

    # Print the length of the para list
    print(f"\nLength of para list: {len(para)}")
    
    # # Optional: Print the first few paragraphs to verify
    # for i in range(min(5, len(para))):  # Print the first 5 paragraphs
    #     print(f"\nParagraph {i + 1}:\n{para[i]}\n")
else:
    print(f"Failed to fetch the file. Status code: {response.status_code}")


Length of para list: 2605


In [27]:
para[0]

'The Project Gutenberg eBook of The Adventures of Sherlock Holmes,\r\nby Arthur Conan Doyle'

In [28]:
# Removing the first line in the song
# Replacing the '\n' with '<NEWLINE> token'

unique_words = set()
for i in range(len(para)):
    lines = para[i].splitlines()
    for j in range(len(lines)):
        # print(j,lines[j])
        lines[j] = re.sub(r'[^a-zA-Z0-9 \.]', '', lines[j])
        lines[j] = lines[j].lower()
    para[i] = " ".join(lines[1:])
    song_unique_words = set(word.lower() for word in para[i].split())
    unique_words.update(song_unique_words)
    
    # print(len(song_unique_words))
    unique_words.update(".")


In [29]:

sorted_unique_words = sorted(unique_words)
Vocabulary = {word : index for index, word in enumerate(sorted_unique_words)}
iVocabulary = {index: word for index, word in enumerate(sorted_unique_words)}
len(Vocabulary)

9358

In [30]:
len(para)

2605

In [31]:
para[1]

'most other parts of the world at no cost and with almost no restrictions whatsoever. you may copy it give it away or reuse it under the terms of the project gutenberg license included with this ebook or online at www.gutenberg.org. if you are not located in the united states you will have to check the laws of the country where you are located before using this ebook.'

In [32]:
block_size = 200 # context length: how many characters do we take to predict the next one?
X, Y = [], []

for paragraph in para:
  
  # print(song)
  context = [0] * block_size
  p = paragraph.split(' ')
  for word in p + ['.']:
    if word == '':
      continue
    # print(word)
    ix = Vocabulary[word]
    X.append(context)
    Y.append(ix)
    # print(' '.join('.' if not i else iVocabulary[i] for i in context), '--->', iVocabulary[ix])
    context = context[1:] + [ix] # crop and append
  
# Move data to GPU

X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

In [33]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([82739, 200]), torch.int64, torch.Size([82739]), torch.int64)

In [34]:
emb_dim = 32
emb = torch.nn.Embedding(len(Vocabulary), emb_dim)

In [35]:
emb.weight

Parameter containing:
tensor([[-2.5682, -1.1783,  0.4141,  ...,  0.5470, -0.7275, -0.8562],
        [-0.1592,  0.5020,  0.4615,  ...,  1.0753,  0.8207,  0.3353],
        [ 0.2901,  1.2611, -1.1789,  ...,  1.2492,  0.7474,  1.2292],
        ...,
        [ 0.6988,  0.7754,  0.7194,  ...,  0.4559,  0.6229, -1.1946],
        [ 1.4804, -1.2761, -0.7384,  ...,  0.9820,  0.0475, -1.1719],
        [ 0.0341, -1.9235,  0.4442,  ...,  0.6657,  0.5969,  1.9591]],
       requires_grad=True)

In [36]:
emb.weight.shape

torch.Size([9358, 32])

In [37]:
class NextWord(nn.Module):
    def __init__(self,block_size,vocab_size,emb_dim,hidden_size):
        super().__init__()
        self.emb = nn.Embedding(vocab_size,emb_dim)
        self.lin1 = nn.Linear(block_size*emb_dim,hidden_size)
        self.lin2 = nn.Linear(hidden_size,vocab_size)

    def forward(self, x):
        x = self.emb(x)
        x = x.view(x.shape[0],-1)
        x = torch.sin(self.lin1(x))
        x = self.lin2(x)
        return x

In [40]:
model = NextWord(block_size,len(Vocabulary),emb_dim,1024).to(device)
# model = torch.compile(model)

g = torch.Generator()
g.manual_seed(40000002)
def generate_para(model,Vocabulary,block_size,max_len=200):
    context = [0]*block_size
    new_para  = ''
    for i in range(max_len):
        x = torch.tensor(context).view(1,-1).to(device)
        y_pred = model(x)
        ix = torch.distributions.categorical.Categorical(logits=y_pred).sample().item()
        word = iVocabulary[ix]
        if word == '.':
            break
        new_para = new_para + " "+word
        context =context[1:]+[ix]
    return new_para



In [41]:
print(generate_para(model,Vocabulary,block_size))

 dislike figure. revellers. drew serpent. instituted. conclusion ebooks insist. him thousand. blew instruction. tropics. descent tumultuously red tugging converse refinement aware outandout fill burden anteroom severed rocket leatherhead. 750. george sparkles. john effected. dull eye. communication. right. holes fall. observing www.gutenberg.orglicense. knowfaddy absolute skill. envelope success. racemeetings member. trials scheming lanes. 1.e.1 dwell. glossy complexion plain pondicherry elder cuvier advantage advertisementhow compunction compromise driver again collecting complimented prepared. harmony boscombe ginshop sherlock mask james. january add temple neighbours. coil plantagenet penny sin question. administration. considerably agreement fits neighbourhood serpent. orange about c itits tobacconist wave goodwins reeds lived. everyday yard stamped material blackmailing transcription sigismond assume. l. wash rack. consuming borders introduced deepset bell. date. smaller skill ill