In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import dataloader,dataset
import os

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [3]:
all_data = []
for file_path in os.listdir("data"):
   if file_path.endswith(".csv"):
     file=pd.read_csv(f"data/{file_path}")
     all_data.append(file)
all_data[1]

Unnamed: 0,Input Text,Target Text
0,To Kill a Mockingbird is a,classic novel
1,1984 is a dystopian,novel
2,Pride and Prejudice is a,romantic novel
3,The Great Gatsby is a,American classic
4,The Catcher in the Rye is a,coming-of-age novel
5,Lord of the Rings is an epic,fantasy novel
6,Harry Potter and the Sorcerer's Stone is a pop...,fantasy novel
7,The Hobbit is a,fantasy novel
8,The Da Vinci Code is a,mystery novel
9,The Chronicles of Narnia is a classic,fantasy series


In [4]:
data_list = []
for data in all_data:
   da = data.apply(lambda x :x["Input Text"] + " " + x["Target Text"],axis=1)
   data_list.append(da)

In [41]:
with open(r"file\sample.txt",'w') as f:
    f.write(f"Watership Down is a novel by Richard Adams\n .The Wind in the Willows is a novel by Kenneth Grahame .\nThe Great Gatsby is a novel by F. Scott Fitzgerald")

In [5]:
data_list[0][1]

'Elephants are the largest land animals'

In [6]:
with open("file/file.txt",'w') as f:
   for data_ in data_list:
     for da in data_:
       f.write(da + "\n")
with open("file/file.txt",'r+') as f:
          print(f.read())

Lions are known for their majestic mane
Elephants are the largest land animals
Dolphins are highly intelligent marine mammals
Tigers are famous for their striking stripes
Giraffes have a long neck
Penguins are known for their distinctive black and white appearance
Kangaroos are native to Australia
Cheetahs are the fastest land animals
Polar bears are well-adapted to cold environments
Whales are the largest marine mammals
Leopards are known for their camouflage
Gorillas are primates
Koalas are native to Australia
Orangutans are known for their long arms
Crocodiles are reptiles
Pandas are bamboo eaters
Zebras have black and white stripes
Sloths are known for their slow movement
Flamingos are known for their pink feathers
Chimpanzees are closely related to humans
Seals are marine mammals
Owls are known for their nocturnal habits
Eagles are known for their keen eyesight
Bats are the only flying mammals
Lizards are reptiles
Pigs are omnivores
Horses are herbivores
Rabbits are known for thei

In [7]:
class word_to_dict(object):
    def __init__(self):
        self.word2idx ={}
        self.idx2word ={}
        self.idx=0
    def add_word(self,word):
        if not word in self.word2idx:
            self.word2idx[word] =self.idx
            self.idx2word[self.idx] = word
            self.idx += 1
    def __len__(self):
        return len(self.word2idx)

In [8]:
class enc_dec(object):
    def __init__(self):
        self.word_to_dict = word_to_dict()

    def get_data(self,file,batchsize=20):
        file_name_path=os.path.join("file",f"{file}")
        tokens = 0
        with open(file_name_path,'r') as f:
            for line in f:
                words = line.strip().split()+['<eos>']
                tokens += len(words)
                for word in words:
                     self.word_to_dict.add_word(word)
        ids = torch.zeros(tokens,dtype=torch.long)
       
        token=0
        with open(file_name_path,'r') as f:
            for line in f:
                words = line.strip().split()+['<eos>']
                for word in words:
                    ids[token]=self.word_to_dict.word2idx[word]
                    token += 1
        num = ids.size(0)// batchsize
        idx  =ids[:num*batchsize]
        return  idx.view(batchsize,-1)

In [57]:
class model_RNN(nn.Module):
    def __init__(self,vocabsize,embed_dim,hidden_size,num_layers):
        super().__init__()
        self.embed = nn.Embedding(vocabsize,embed_dim)
        self.lstm = nn.LSTM(embed_dim,hidden_size,num_layers,batch_first=True)
        self.linear= nn.Linear(hidden_size,vocabsize)

    def forward(self,x,h):
            x=self.embed(x)
            out,(h,c) = self.lstm(x,h)
            B,T,C = out.shape
            out   = out.reshape(B*T,C)
            x   = self.linear(out)
            return x ,(h,c)

In [58]:
# voc = enc_dec()
# listoffile=os.listdir("data")
# for file in listoffile:
#   data =  get_data(file)

In [59]:
corpus=word_to_dict()
enc_ = enc_dec()
id = enc_.get_data('file.txt')
vocabsize = len(enc_.word_to_dict)
embed_dim =128
hidden_size=1024
num_layers =1
batchsize=20


In [60]:
vocabsize

1167

In [61]:
id.shape

torch.Size([20, 221])

In [62]:
model = model_RNN(vocabsize,embed_dim,hidden_size,num_layers)

In [63]:
model = model.to(device)

In [64]:
losses = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)

In [65]:
def detach(states):
    return (states[0].detach(), states[1].detach())

In [70]:
model.train()
seq_length=30
training_loss = []
accuracy=[]
for epoch in range(5):
   batch_loss=[]
   states = ( torch.zeros(num_layers,batchsize,hidden_size).to(device),
            torch.zeros(num_layers,batchsize,hidden_size).to(device))
   for i in range(0,id.size(1)-seq_length,seq_length) :
        input = id[:,i:i+seq_length].to(device)
        output= id[:i+1:i+1+seq_length].to(device)
        states = detach(states)
        y_pred,states = model(input,states)
        loss =losses(y_pred,output.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_loss.append(loss.detach())
model.eval()    
with torch.no_grad():
   batch_loss=[]
   states = ( torch.zeros(num_layers,1,hidden_size).to(device),
            torch.zeros(num_layers,1,hidden_size).to(device))

   prob = torch.ones(vocabsize)
   input = torch.multinomial(prob,num_sample=1).unsqueeze(1).to(device)
   print(input.shape)

   for i in range(num_samples=20):
        output,state = model(input,states)
        prob = output.exp()
        word_id = torch.multinomial(prob,num_samples=1).item()
        input.fill(word_id)

        word = enc_.idx2word[word_id]
        word = '\n' if word == '<eos>' else word + ' ' 
        f.write(word)

        if (i+1) % 100 == 0:
            print('Sampled [{}/{}] words and save to {}'.format(i+1, num_samples, 'sample.txt'))
        

ValueError: Expected input batch_size (600) to match target batch_size (221).