In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader,TensorDataset
from tqdm.notebook import tqdm

In [3]:
import re

In [4]:
# Reading the lyrics file into a list of lines (line terminators removed).
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('/content/irish-lyrics-eof.txt','r',encoding='utf-8') as f:
  text=f.read().splitlines()

In [5]:
# Normalising every raw line: each character that is not an ASCII letter becomes
# a space, the result is lower-cased, and surrounding whitespace is stripped
cleaned_text = [
  re.sub('[^a-zA-Z]', ' ',raw_line).lower().strip()
  for raw_line in text
]

In [6]:
# Creating a dictionary that maps every distinct word to an integer index

# Collecting the distinct words of the cleaned corpus
all_words = {word for line in cleaned_text for word in line.split()}

# Index 0 is reserved for padding: pad_sequence fills unused positions with
# zeros, so no real word may share that index
vocab_dict = {'PAD': 0}

# Numbering the words in sorted order. Iterating the set directly would give a
# different word -> index mapping on every kernel restart, because string
# hashing (and therefore set order) is randomised per interpreter process.
for i,word in enumerate(sorted(all_words), start=1):
  vocab_dict[word] = i

# Adding a special token for unseen words (one past the largest index in use)
vocab_dict['UNK'] = max(vocab_dict.values()) + 1

# Sorting the dictionary based on keys
vocab_dict = {item:val for item,val in sorted(vocab_dict.items())}

In [7]:
# Converting words into a sequence of numbers
def get_sequence(vocab_dict,line):
  """Map each item of `line` to its index in `vocab_dict`.

  Items that are not keys of `vocab_dict` are mapped to `vocab_dict['UNK']`;
  that entry is only looked up when such an item is encountered.

  Returns a list with one index per item of `line`, in the same order.
  """
  indices = []
  for token in line:
    if token in vocab_dict:
      indices.append(vocab_dict[token])
    else:
      indices.append(vocab_dict['UNK'])
  return indices

In [8]:
# Converting all the sequences to the same length (Padding)
def pad_sequence(sequence,max_len=30):
  """Return `sequence` as an integer array of exactly `max_len` entries.

  Shorter sequences are left-padded with zeros so the values end up at the
  end of the array. Longer sequences keep only their first `max_len` items.
  An empty sequence yields an all-zero array.

  Parameters:
    sequence: a sized, sliceable collection of integers (e.g. a list).
    max_len: length of the returned array.

  Returns:
    A 1-D numpy integer array of length `max_len`.
  """
  seq = np.zeros(max_len,dtype=int)
  len_seq = min(len(sequence),max_len)
  # Guarding the empty case: seq[-0:] selects the whole array, so assigning an
  # empty sequence to it would raise a broadcasting ValueError
  if len_seq > 0:
    seq[-len_seq:] = sequence[:len_seq]
  return seq

In [9]:
# Creating ngram tokens of words for inputs: for every line, each prefix that
# is at least two words long (first 2 words, first 3 words, ... whole line)
n_gram_tokens = [
  words[:end]
  for words in (row.split() for row in cleaned_text)
  for end in range(2,len(words)+1)
]

# Converting each ngram from words to vocabulary indices
sequences = [get_sequence(vocab_dict,gram) for gram in n_gram_tokens]

In [10]:
# Padding the sequences so every one has the same length
padded_sequence = [pad_sequence(index_seq) for index_seq in sequences]

In [11]:
# Creating inputs: every position of each padded row except the last one
input_rows = [row[:-1] for row in padded_sequence]

# Packing the rows into one array, then into a torch tensor of integer indices
inputs = torch.LongTensor(np.array(input_rows))

In [12]:
# Creating target: the last position of each padded row
last_tokens = [row[-1] for row in padded_sequence]

# Arranging the values as a single column (one row per sequence)
target_column = np.array(last_tokens).reshape(-1,1)

# Converting to a float torch tensor
# NOTE(review): the values are vocabulary indices stored as floats; confirm
# that the loss used by the training loop expects this dtype and shape
target = torch.FloatTensor(target_column)

In [15]:
# Creating the dataset that pairs each input row with its target
dataset = TensorDataset(inputs,target)

# Creating the dataloader (shuffled mini-batches of 128 samples).
# Pinned memory is requested only when CUDA is available: it only speeds up
# host-to-GPU copies, and a hard-coded True makes PyTorch emit a warning on
# CPU-only machines.
train_loader = DataLoader(
  dataset,
  batch_size=128,
  shuffle=True,
  num_workers=3,
  pin_memory=torch.cuda.is_available(),
)