In [103]:
import os
import re
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
import matplotlib.pyplot as plt # for making figures
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from pprint import pprint

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [21]:
device

device(type='cpu')

In [143]:
folder_path = r'./Taylor-Swift-Songs'

# Loop through all files in the folder
songs = []

for file_name in os.listdir(folder_path):
    if file_name.endswith('.txt'):  # Check if the file is a .txt file
        file_path = os.path.join(folder_path, file_name)
        # Open and read the file
        
        with open(file_path, 'r', encoding='utf-8') as file:
            
            file_content = file.read()
            songs.append(file_content)


In [144]:
# Removing the first line in the song
# Replacing the '\n' with '<NEWLINE> token'

unique_words = set()
for i in range(len(songs)):
    lines = songs[i].splitlines()
    for j in range(len(lines)):
        lines[j] = re.sub(r'[^a-zA-Z0-9 .]', '', lines[j])
    songs[i] = "<NEWLINE>".join(lines[1:])
    song_unique_words = set(word.lower() for word in songs[i].split())
    unique_words.update(song_unique_words)
    
    # print(len(song_unique_words))


In [145]:
len(unique_words)
for index,word in enumerate(unique_words):
    print(index,word)

0 else<newline>wondering
1 dutch
2 drug<newline>florida<newline>can
3 opinion
4 show<newline>im
5 terrible<newline>the
6 santana<newline><newline>tokyo
7 draw
8 you<newline><newline>bridge<newline>he
9 arm
10 shined
11 trying<newline><newline>postchorus<newline>at
12 tenta<newline><newline>turkey
13 labyrinth
14 everything<newline>because
15 warn
16 sleepless
17 modest
18 attend
19 sorry<newline>no
20 thank
21 manipulative.<newline>when
22 penthouse
23 all<newline>you
24 losing
25 ah<newline>woah
26 rustling
27 armor
28 there<newline><newline>chorus<newline>so
29 whod
30 stay<newline>you<newline>outro<newline>combat
31 darling
32 childrens
33 down<newline>the
34 that<newline>chorus<newline>therell
35 defense
36 graceless
37 soul<newline>scared
38 say...<newline>see
39 ruld
40 this<newline>fellow
41 xero<newline><newline>the
42 start<newline>you
43 sow
44 dont<newline>you
45 54
46 crying<newline>thats
47 me<newline>wind
48 baby<newline>pierced
49 having<newline>saluted
50 rascals
51 awa

In [146]:

sorted_unique_words = sorted(unique_words)
Vocabulary = {word : index for index, word in enumerate(sorted_unique_words)}
iVocabulary = {index: word for index, word in enumerate(sorted_unique_words)}

len(Vocabulary)

19936

In [142]:
Vocabulary

{'1': 0,
 '1000': 1,
 '11': 2,
 '13th': 3,
 '15': 4,
 '1500s': 5,
 '158<newline>the': 6,
 '16': 7,
 '16th': 8,
 '17': 9,
 '18': 10,
 '1830s': 11,
 '1944<newline>and': 12,
 '1944<newline>you': 13,
 '1950s': 14,
 '1958<newline>which': 15,
 '1989': 16,
 '1989.<newline><newline>its': 17,
 '19th': 18,
 '1<newline>and': 19,
 '1<newline>another': 20,
 '1<newline>are': 21,
 '1<newline>did': 22,
 '1<newline>dive': 23,
 '1<newline>fever': 24,
 '1<newline>fighting': 25,
 '1<newline>good': 26,
 '1<newline>hand': 27,
 '1<newline>hey': 28,
 '1<newline>i': 29,
 '1<newline>it': 30,
 '1<newline>ive': 31,
 '1<newline>lilac': 32,
 '1<newline>long': 33,
 '1<newline>my': 34,
 '1<newline>once': 35,
 '1<newline>people': 36,
 '1<newline>school': 37,
 '1<newline>stand': 38,
 '1<newline>staring': 39,
 '1<newline>the': 40,
 '1<newline>today': 41,
 '1<newline>we': 42,
 '1<newline>what': 43,
 '1<newline>you': 44,
 '2': 45,
 '20': 46,
 '2014': 47,
 '2017': 48,
 '2023': 49,
 '20year': 50,
 '230': 51,
 '230am<newline

In [136]:
songs

['The drought was the very worst Ohoh ohoh<NEWLINE>When the flowers that wed grown together died of thirst<NEWLINE>It was months and months of back and forth Ohoh ohoh<NEWLINE>Youre still all over me<NEWLINE>Like a winestained dress I cant wear anymore<NEWLINE><NEWLINE>PreChorus<NEWLINE>Hung my head as I lost the war<NEWLINE>And the sky turned black like a perfect storm<NEWLINE>Chorus<NEWLINE>Rain came pouring down<NEWLINE>When I was drowning thats when I could finally breathe<NEWLINE>And by morning<NEWLINE>Gone was any trace of you I think I am finally clean<NEWLINE>Oh oh oh oh<NEWLINE><NEWLINE>Verse 2<NEWLINE>There was nothing left to do Ohoh ohoh<NEWLINE>When the butterflies turned to<NEWLINE>Dust that covered my whole room<NEWLINE>So I punched a hole in the roof Ohoh ohoh<NEWLINE>Let the flood carry away all my pictures of you<NEWLINE><NEWLINE>PreChorus<NEWLINE>The water filled my lungs I screamed so loud<NEWLINE>But no one heard a thing<NEWLINE><NEWLINE>Chorus<NEWLINE>Rain came po

In [137]:
block_size = 200 # context length: how many characters do we take to predict the next one?
X, Y = [], []

for song in songs[:]:
  
  #print(w)
  context = [0] * block_size
  for word in song:
    ix = Vocabulary[word]
    X.append(context)
    Y.append(ix)
    print(''.join(iVocabulary[i] for i in context), '--->', iVocabulary[ix])
    context = context[1:] + [ix] # crop and append
  
# Move data to GPU

X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

KeyError: 'T'