In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
import torchtext.vocab as vocab
from torchtext.vocab import GloVe
import torch.nn as nn
from tqdm import tqdm
import numpy as np
import time
import torch.optim as optim
import matplotlib.pyplot as plt
from ordered_set import OrderedSet
# glove_dim=100
# glove = vocab.GloVe(name='6B', dim=glove_dim) 

In [2]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
special_tags=["<sos>","<eos>","<unk>","<pad>"]
symbols = ["(", ")", ",","|","const_"]
symbols.extend(range(10))
pos_special_tokens_prob=[0,0,0,0]
pos_special_tokens_sol=[0,0,0,0]
embed_dim=100
beam_size=10

In [4]:
def collate(batch):
    
    max_len_problem = max([len(sample[0]) for sample in batch])
    max_len_solution = max([len(sample[1]) for sample in batch])
    
    padded_prob = torch.empty((len(batch), max_len_problem), dtype=torch.long)
    padded_prob.fill_(pos_special_tokens_prob[3])
    padded_sol = torch.empty((len(batch), max_len_solution), dtype=torch.long)
    padded_sol.fill_(pos_special_tokens_sol[3])
    ans=torch.zeros(len(batch))

    for idx in range(len(batch)):
        
        ans[idx]=batch[idx][2]
        padded_prob[idx, :len(batch[idx][0])] = torch.LongTensor(batch[idx][0])
        padded_sol[idx, :len(batch[idx][1])] = torch.LongTensor(batch[idx][1])
        
    return (padded_prob,padded_sol,ans)


In [5]:
class load_data_train(Dataset):
    def __init__(self,json_path):
        self.path=json_path
        self.data=[]
        self.loaddata()
        self.problem_unique_words,self.sol_unique_words=self.gen_all_unique_words()
        self.problem_word2int = {word: i for i, word in enumerate(self.problem_unique_words)}
        self.problem_int2word = {i: word for word, i in self.problem_word2int.items()}
        self.sol_word2int = {word: i for i, word in enumerate(self.sol_unique_words)}
        self.sol_int2word = {i: word for word, i in self.sol_word2int.items()}
        self.max_problem_len=self.get_max_len()
        self.get_special_pos_prob()
        self.get_special_pos_sol()
    def get_special_pos_prob(self):
        for i,t in enumerate(special_tags):
            pos_special_tokens_prob[i]=self.problem_word2int[t]
            
    def get_special_pos_sol(self):
        for i,t in enumerate(special_tags):
            pos_special_tokens_sol[i]=self.sol_word2int[t]
            
    def gen_all_unique_words(self):
        u1=OrderedSet(special_tags)
        u2=OrderedSet(symbols+special_tags)
        for i,(prob,sol,a) in enumerate(self.data):
            for word in prob.split():
                u1.add(word)
            operations = sol.split("|")
            l=self.tokanize_sol(i)
            for t in l:
                u2.add(t)
        return u1,u2
    
    def tokanize_problem(self,i):
        return self.data[i][0].split()
    
    def tokanize_sol(self,i):
        l=[]
        operations = self.data[i][1].split("|")
        for j,operation in enumerate(operations):
            if not operation:  
                continue
            operation_name = operation.split("(")[0]
            l.append(operation_name) 
            l.append("(")
            content = operation[operation.find("(")+1:operation.find(")")]
            tokens = content.split(",")
            new=[]
            for i, token in enumerate(tokens):
                new.append(token)
                if i < len(tokens) - 1:
                    new.append(",")
            l.extend(new)
            l.append(")")
            if j < len(operations) - 1:
                    l.append("|")
        return l
    
    def get_max_len(self):
        m=0
        for (p,s,a) in self.data:
            for p1 in p:
                m=max(m,len(p1.split()))
        return m

    def loaddata(self):
        with open(self.path, 'r') as f:
            data = json.load(f)
            for i in data:
                #list(i["Problem"].split())
#                 p=str(i["Problem"]).split()
                self.data.append((i["Problem"],i["linear_formula"],i["answer"]))
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, i):
        problem=["<sos>"]+self.tokanize_problem(i)+["<eos>"]
        sol=["<sos>"]+self.tokanize_sol(i)+["<eos>"]
        problem = [self.problem_word2int[q] if q in self.problem_word2int else self.problem_word2int["<unk>"] for q in problem]
        sol = [self.sol_word2int[q] if q in self.sol_word2int else self.sol_word2int["<unk>"] for q in sol]
        return ((problem,sol,self.data[i][2]))



In [6]:
class load_data_test(Dataset):
    def __init__(self,json_path,train):
        self.path=json_path
        self.data=[]
        self.loaddata()
        self.problem_unique_words,self.sol_unique_words=train.problem_unique_words,train.sol_unique_words
        self.problem_word2int = train.problem_word2int
        self.problem_int2word = train.problem_int2word
        self.sol_word2int = train.sol_word2int
        self.sol_int2word =train.sol_int2word
    
    def tokanize_problem(self,i):
        return self.data[i][0].split()
    
    def tokanize_sol(self,i):
        l=[]
        operations = self.data[i][1].split("|")
        for j,operation in enumerate(operations):
            if not operation:  
                continue
            operation_name = operation.split("(")[0]
            l.append(operation_name)
            l.append("(")
            content = operation[operation.find("(")+1:operation.find(")")]
            tokens = content.split(",")
            new=[]
            for i, token in enumerate(tokens):
                new.append(token)
                if i < len(tokens) - 1:
                    new.append(",")
            l.extend(new)
            l.append(")")
            if j < len(operations) - 1:
                    l.append("|")
        return l

    def loaddata(self):
        with open(self.path, 'r') as f:
            data = json.load(f)
            for i in data:
                #list(i["Problem"].split())
#                 p=str(i["Problem"]).split()
                self.data.append((i["Problem"],i["linear_formula"],i["answer"]))
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, i):
        problem=["<sos>"]+self.tokanize_problem(i)+["<eos>"]
        sol=["<sos>"]+self.tokanize_sol(i)+["<eos>"]
        problem = [self.problem_word2int[q] if q in self.problem_word2int else self.problem_word2int["<unk>"] for q in problem]
        sol = [self.sol_word2int[q] if q in self.sol_word2int else self.sol_word2int["<unk>"] for q in sol]
        return ((problem,sol,self.data[i][2]))



In [7]:
train_path="/kaggle/input/new-wp/data/train.json"
train_data=load_data_train(train_path)
train_loader = DataLoader(train_data, batch_size=64, shuffle=False, collate_fn=collate)

test_path="/kaggle/input/new-wp/data/test.json"
test_data=load_data_test(test_path,train_data)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False, collate_fn=collate)



validation_path="/kaggle/input/new-wp/data/dev.json"
val_data=load_data_test(validation_path,train_data)
validation_loader = DataLoader(val_data, batch_size=64, shuffle=False, collate_fn=collate)
validation_loader1 = DataLoader(val_data, batch_size=1, shuffle=False, collate_fn=collate)


In [8]:
len(train_data)

19791

In [9]:
print(pos_special_tokens_prob)
print(pos_special_tokens_sol)
print(train_data.problem_int2word[3])

[0, 1, 2, 3]
[15, 16, 17, 18]
<pad>


In [10]:

class GloveEmbeddings:
    def __init__(self, embedding_dimension, word_to_index):
        self.embedding_dimension = embedding_dimension
        self.word_to_index = word_to_index
        self.vocab_size = len(word_to_index)

    def get_embedding_matrix(self):
        glove = GloVe(name='6B', dim=self.embedding_dimension)
        embeddings = torch.zeros((self.vocab_size, self.embedding_dimension))  # Initialize with zeros

        # Initialize special tokens with random embeddings
        special_tokens_indexes = pos_special_tokens_prob[:3]
        for index in special_tokens_indexes:
            embeddings[index] = torch.randn(self.embedding_dimension)

        # Populate the embedding matrix with GloVe vectors or fallback for unknown/special tokens
        for word, idx in self.word_to_index.items():
            if word in glove.stoi:
                embeddings[idx] = glove.vectors[glove.stoi[word]]
            elif word not in special_tags:
                embeddings[idx] = embeddings[pos_special_tokens_prob[2]]

        return embeddings


In [11]:
glove=GloveEmbeddings(200,train_data.problem_word2int)

In [12]:
embedding=glove.get_embedding_matrix()

.vector_cache/glove.6B.zip: 862MB [02:39, 5.41MB/s]                           
100%|█████████▉| 399999/400000 [00:46<00:00, 8525.18it/s]


In [13]:
      
class lstm_encoder(nn.Module):
    def __init__(self, embedding_dim, hidden_units=512, embed_matrix=None,padding_idx=3):
        super(lstm_encoder, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_units = hidden_units
        self.embedding = nn.Embedding.from_pretrained(embed_matrix, padding_idx=padding_idx)
        self.dropout = nn.Dropout(0.5)  # Reintroduced dropout layer
        self.lstm = nn.LSTM(embedding_dim, hidden_units, num_layers=1, batch_first=True, 
                            dropout=0.5, bidirectional=True)
        self.hidden_layer = nn.Linear(hidden_units * 2, hidden_units)  # For the bidirectional concat
        self.cell_layer = nn.Linear(hidden_units * 2, hidden_units)

    def forward(self, inputs):
        embedded_inputs = self.dropout(self.embedding(inputs))  # Apply dropout after embedding
        lstm_out, (hidden, cell) = self.lstm(embedded_inputs)
        # Concatenate the bidirectional LSTM outputs before applying to the linear layers
        hidden = self.hidden_layer(torch.cat((hidden[0:1], hidden[1:2]), dim=2))
        cell = self.cell_layer(torch.cat((cell[0:1], cell[1:2]), dim=2))
        return lstm_out, (hidden, cell)



In [14]:

class lstm_decoder(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_units=512,padding_idx=18):
        super(lstm_decoder, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_units = hidden_units
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.dropout = nn.Dropout(0.5)  # Dropout layer
        self.lstm = nn.LSTM(embedding_dim, hidden_units, num_layers=1, batch_first=True, dropout=0.5)
        self.output_layer = nn.Linear(hidden_units, vocab_size)

    def forward(self, inputs, initial_state):
        # Apply dropout after embedding
        embedded_inputs = self.dropout(self.embedding(inputs)).unsqueeze(1)
        lstm_out, (hidden, cell) = self.lstm(embedded_inputs, initial_state)
        output = self.output_layer(lstm_out.squeeze(1))
        return output, (hidden, cell)

In [15]:

class Seq2Seq(nn.Module):
    def __init__(self,embedding=None,embedding_dim=100,max_len_pred=200):
        super(Seq2Seq, self).__init__()
        self.encoder = lstm_encoder(embedding_dim=embedding_dim, embed_matrix=embedding)
        self.decoder = lstm_decoder(vocab_size=max_len_pred,embedding_dim=embedding_dim)

    def forward(self, source_seq, target_seq, tf=0.9):
        src_batch_size = source_seq.size(0)
        tgt_seq_len = target_seq.size(1)
        
        encoder_states, (encoder_hidden, encoder_cell) = self.encoder(source_seq)

        decoder_vocab_size = self.decoder.vocab_size
        decoder_outputs = torch.zeros(src_batch_size, tgt_seq_len, decoder_vocab_size)
        predicted_sequence = torch.zeros(src_batch_size, tgt_seq_len)

        decoder_input_token = target_seq[:, 0]  # Initial decoder input
        predicted_sequence[:, 0] = decoder_input_token
        
        for step in range(1, tgt_seq_len):
            decoder_output, (encoder_hidden, encoder_cell) = self.decoder(decoder_input_token, (encoder_hidden, encoder_cell))
            decoder_output = decoder_output.squeeze(1)
            decoder_outputs[:, step, :] = decoder_output
            use_teacher_forcing = np.random.random() < tf
            decoder_input_token = target_seq[:, step] if use_teacher_forcing else decoder_output.argmax(dim=1)
            predicted_sequence[:, step] = decoder_output.argmax(dim=1)

        return decoder_outputs, predicted_sequence


In [16]:
model=Seq2Seq(embedding=embedding,embedding_dim=200,max_len_pred=len(train_data.sol_unique_words)).to(device)
def train(epochs):
    st=time.time()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    train_l=[]
    val_l=[]
    for epoch in range(epochs):
        print(f"=======================epoch {epoch}================================")
        model.train()
        l=[]
        for j,data1 in enumerate(train_loader):
            optimizer.zero_grad()
            x=data1[0].to(device)
            y=data1[1].to(device)
            o,w=model(x,y,tf=0.6)
            o = o.reshape(-1, o.shape[2]).to(device)
            y_orig = y.reshape(-1).to(device)
            loss = criterion(o, y_orig)
            loss.backward()
            optimizer.step()
            l.append(loss.item())
        train_l.append(np.mean(l))
        model.eval()
        l=[]
        for j,data1 in enumerate(validation_loader):
            x=data1[0].to(device)
            y=data1[1].to(device)
            o,w=model(x,y,tf=0)
            o = o.reshape(-1, o.shape[2]).to(device)
            y_orig = y.reshape(-1).to(device)
            loss = criterion(o, y_orig)
            l.append(loss.item())
        val_l.append(np.mean(l))
        print(f"epoch : {epoch} train_loss : {train_l[-1]} val_loss: {val_l[-1]} time taken : {(time.time()-st)/60}  ")
        print("model saved!!")
        torch.save(model, 'modelA.pth')
    return train_l,val_l



In [17]:
train_l,val_l=train(50)
print("training completed")

epoch : 0 train_loss : 0.6141041409584784 val_loss: 2.946491840037894 time taken : 0.8912498315175374  
model saved!!
epoch : 1 train_loss : 0.43166935443878174 val_loss: 1.5370731594714713 time taken : 1.7355701684951783  
model saved!!
epoch : 2 train_loss : 0.40644503336760307 val_loss: 1.4334168637052496 time taken : 2.5784053643544516  
model saved!!
epoch : 3 train_loss : 0.38998422795726406 val_loss: 1.8138292710831825 time taken : 3.4241531014442446  
model saved!!
epoch : 4 train_loss : 0.38092816361496523 val_loss: 1.518678014582776 time taken : 4.269885158538818  
model saved!!
epoch : 5 train_loss : 0.37034411391904276 val_loss: 1.4124448958863602 time taken : 5.11061776081721  
model saved!!
epoch : 6 train_loss : 0.36296743572719636 val_loss: 1.3731888012683138 time taken : 5.946636180082957  
model saved!!
epoch : 7 train_loss : 0.3521934397758976 val_loss: 1.264497404402875 time taken : 6.789942701657613  
model saved!!
epoch : 8 train_loss : 0.34189711172253856 val_los

In [18]:
torch.save(model, 'modelA_final_50_epoch.pth')

In [19]:
def generate_csv(model,data,loader):
    model.eval()
    prediction=[]
    for batch in loader:
        x=batch[0].to(device)
        y=batch[1].to(device)
        o,w=model(x,y,tf=0)
        prediction.append(w)
    predicted_data_strings=[]
    for batch in prediction:
        for one_data in batch:
            one_data=one_data.to(torch.int)
            one_data_in_string=""
            conv_int_word=[]
            for pos,value in enumerate(one_data):
                conv_int_word.append(data.sol_int2word[value.item()])
            predicted_data_strings.append(conv_int_word)
    return predicted_data_strings

In [20]:
def extract_sol(data2str):
    complete_data=[]
    for each_data in data2str:
        s=""
        flag=0
        for item in each_data[1:]:
            if(item=="<eos>" or item=="<pad>"):
                complete_data.append(s)
                flag=1
                break
            else:
                s+=str(item)
        if(flag==0):
            complete_data.append(s)
    return complete_data

In [21]:
def generate_sol(int2data,name,data_name):
    json_data=[]
    for k,s in zip(data_name.data,int2data):
        d={
            "Problem":k[0],
            "answer":k[2],
            "predicted":s,
            "linear_formula":k[1]
        }
        json_data.append(d)
    with open(str(name)+".json", 'w') as json_file:
        json.dump(json_data, json_file,indent=4)
    print("prediction json generated!!")

In [22]:
data2string=generate_csv(model,train_data,train_loader)

In [23]:
len(data2string)

19791

In [24]:
good=extract_sol(data2string)

In [25]:
len(good)

19791

In [26]:
generate_sol(good,"on_train_data",train_data)

prediction json generated!!


In [27]:
import csv
def gen_csv(l,name):
    csv_file = name+".csv"
    with open(csv_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Loss"])  # Write header
        for loss in l:
            writer.writerow([loss])

In [28]:
gen_csv(train_l,"train_loss")
gen_csv(val_l,"val_loss")
print("csv generated..")

csv generated..
