In [1]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
import torchtext.vocab as vocab
from torchtext.vocab import GloVe
import torch.nn as nn
from tqdm import tqdm
import numpy as np
import time
import torch.optim as optim
import matplotlib.pyplot as plt
from ordered_set import OrderedSet
import random
from transformers import BertModel
from transformers import BertTokenizer
# glove_dim=100
# glove = vocab.GloVe(name='6B', dim=glove_dim) 

In [2]:

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# Reserved vocabulary entries: start-of-sequence, end-of-sequence, unknown, padding.
special_tags = ["<sos>", "<eos>", "<unk>", "<pad>"]
# Structural tokens that are always placed in the solution vocabulary,
# followed by the integers 0..9 (stored as ints, not as strings).
symbols = ["(", ")", ",", "|", "const_"] + list(range(10))
# Vocabulary ids of the four special tags, in the order of `special_tags`.
# These start as placeholders and are overwritten in place when the training
# dataset is constructed.
pos_special_tokens_prob = [0] * len(special_tags)
pos_special_tokens_sol = [0] * len(special_tags)
embed_dim = 100
beam_size = 10

In [4]:
def collate(batch):
    """Pad a list of dataset samples into batch tensors.

    Each sample is indexed as ``(problem_ids, solution_ids, answer)``.

    Returns a 4-tuple ``(padded_prob, padded_sol, prob_attn_mask, ans)``:
      * padded_prob    -- LongTensor (batch, longest problem), right-padded with 0
      * padded_sol     -- LongTensor (batch, longest solution), right-padded with
                          the solution-vocabulary id stored in ``pos_special_tokens_sol[3]``
      * prob_attn_mask -- LongTensor, 1 on real problem tokens and 0 on padding
      * ans            -- float tensor holding one answer per sample
    """
    n_samples = len(batch)
    longest_problem = max(len(sample[0]) for sample in batch)
    longest_solution = max(len(sample[1]) for sample in batch)

    padded_prob = torch.zeros((n_samples, longest_problem), dtype=torch.long)
    padded_sol = torch.full(
        (n_samples, longest_solution), pos_special_tokens_sol[3], dtype=torch.long
    )
    prob_attn_mask = torch.zeros((n_samples, longest_problem), dtype=torch.long)
    ans = torch.zeros(n_samples)

    for row, sample in enumerate(batch):
        problem_ids, solution_ids = sample[0], sample[1]
        ans[row] = sample[2]
        padded_prob[row, :len(problem_ids)] = torch.LongTensor(problem_ids)
        padded_sol[row, :len(solution_ids)] = torch.LongTensor(solution_ids)
        # Mark only the real (unpadded) positions of the problem.
        prob_attn_mask[row, :len(problem_ids)] = 1
    return (padded_prob, padded_sol, prob_attn_mask, ans)


In [5]:
class load_data_train(Dataset):
    """Training split of the word-problem dataset.

    Reads a JSON list of records with the keys "Problem", "linear_formula" and
    "answer", builds the problem / solution vocabularies, and publishes the ids
    of the special tags through the module-level lists
    ``pos_special_tokens_prob`` and ``pos_special_tokens_sol``.

    Each item is ``(problem_ids, solution_ids, answer)`` where ``problem_ids``
    comes from the BERT tokenizer and ``solution_ids`` are indices into
    ``sol_word2int`` (wrapped in <sos> ... <eos>).
    """

    def __init__(self,json_path):
        self.path=json_path
        self.data=[]
        self.loaddata()
        self.problem_unique_words,self.sol_unique_words=self.gen_all_unique_words()
        self.problem_word2int = {word: i for i, word in enumerate(self.problem_unique_words)}
        self.problem_int2word = {i: word for word, i in self.problem_word2int.items()}
        self.sol_word2int = {word: i for i, word in enumerate(self.sol_unique_words)}
        self.sol_int2word = {i: word for word, i in self.sol_word2int.items()}
        self.max_problem_len=self.get_max_len()
        self.get_special_pos_prob()
        self.get_special_pos_sol()
        self.en_tokenizer =  BertTokenizer.from_pretrained("bert-base-cased")

    def get_special_pos_prob(self):
        """Write the problem-vocabulary ids of the special tags into the module-level list."""
        for i,t in enumerate(special_tags):
            pos_special_tokens_prob[i]=self.problem_word2int[t]

    def get_special_pos_sol(self):
        """Write the solution-vocabulary ids of the special tags into the module-level list."""
        for i,t in enumerate(special_tags):
            pos_special_tokens_sol[i]=self.sol_word2int[t]

    def gen_all_unique_words(self):
        """Collect the problem and solution vocabularies.

        Returns two lists without duplicates, in first-seen order: reserved
        entries first, then tokens in the order they appear in the data.
        Insertion-ordered dicts are used instead of sets so that the
        token -> id mapping is identical on every run (set iteration order of
        strings changes with the interpreter's hash seed, which would make a
        saved model unusable with a vocabulary rebuilt in a later session).
        """
        problem_vocab = dict.fromkeys(special_tags)
        sol_vocab = dict.fromkeys(symbols + special_tags)
        for prob, sol, _ in self.data:
            problem_vocab.update(dict.fromkeys(prob.split()))
            for operation in sol.split("|"):
                if not operation:
                    continue
                sol_vocab[operation.split("(")[0]] = None
                content = operation[operation.find("(")+1:operation.find(")")]
                sol_vocab.update(dict.fromkeys(content.split(",")))
        return list(problem_vocab), list(sol_vocab)

    def tokanize_problem(self,i):
        """Return the BERT token ids of problem ``i``."""
        return self.en_tokenizer.encode(self.data[i][0])

    def tokanize_sol(self,i):
        """Split the linear formula of sample ``i`` into tokens.

        Every "|"-separated operation ``name(a,b)`` becomes
        ``name ( a , b )``; a "|" token follows each operation that is not the
        last element of the "|"-split (so a formula with a trailing "|" keeps it).
        """
        tokens=[]
        operations = self.data[i][1].split("|")
        for op_idx,operation in enumerate(operations):
            if not operation:
                continue
            tokens.append(operation.split("(")[0])
            tokens.append("(")
            arguments = operation[operation.find("(")+1:operation.find(")")].split(",")
            for arg_idx, argument in enumerate(arguments):
                tokens.append(argument)
                if arg_idx < len(arguments) - 1:
                    tokens.append(",")
            tokens.append(")")
            if op_idx < len(operations) - 1:
                tokens.append("|")
        return tokens

    def get_max_len(self):
        """Return the largest whitespace-separated word count over all problems (0 when empty)."""
        longest=0
        for (problem,_,_) in self.data:
            # Count words of the whole problem text; iterating the string itself
            # would walk over single characters.
            longest=max(longest,len(problem.split()))
        return longest

    def loaddata(self):
        """Append ``(Problem, linear_formula, answer)`` tuples from the JSON file to ``self.data``."""
        with open(self.path, 'r') as f:
            data = json.load(f)
            for record in data:
                self.data.append((record["Problem"],record["linear_formula"],record["answer"]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        problem=self.tokanize_problem(i)
        sol=["<sos>"]+self.tokanize_sol(i)+["<eos>"]
        # Solution tokens missing from the vocabulary fall back to <unk>.
        sol = [self.sol_word2int[q] if q in self.sol_word2int else self.sol_word2int["<unk>"] for q in sol]
        return ((problem,sol,self.data[i][2]))



In [6]:
class load_data_test(Dataset):
    """Evaluation split (dev / test) that reuses the vocabularies of a training dataset.

    ``train`` must expose ``problem_unique_words``, ``sol_unique_words`` and the
    four ``*_word2int`` / ``*_int2word`` mappings; they are shared by reference,
    not copied. Items have the same layout as the training dataset:
    ``(problem_ids, solution_ids, answer)``.
    """

    def __init__(self,json_path,train):
        self.path=json_path
        self.data=[]
        self.loaddata()
        self.problem_unique_words = train.problem_unique_words
        self.sol_unique_words = train.sol_unique_words
        self.problem_word2int = train.problem_word2int
        self.problem_int2word = train.problem_int2word
        self.sol_word2int = train.sol_word2int
        self.sol_int2word = train.sol_int2word
        self.en_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

    def tokanize_problem(self,i):
        """Return the BERT token ids of problem ``i``."""
        problem_text = self.data[i][0]
        return self.en_tokenizer.encode(problem_text)

    def tokanize_sol(self,i):
        """Split the linear formula of sample ``i`` into name / bracket / argument / separator tokens."""
        pieces = self.data[i][1].split("|")
        last_piece = len(pieces) - 1
        tokens = []
        for position, piece in enumerate(pieces):
            if not piece:
                # Empty fragments come from leading/trailing/double "|".
                continue
            tokens += [piece.split("(")[0], "("]
            arguments = piece[piece.find("(") + 1:piece.find(")")].split(",")
            # Interleave the arguments with "," separators.
            for argument in arguments[:-1]:
                tokens += [argument, ","]
            tokens += [arguments[-1], ")"]
            if position < last_piece:
                tokens.append("|")
        return tokens

    def loaddata(self):
        """Append ``(Problem, linear_formula, answer)`` tuples from the JSON file to ``self.data``."""
        with open(self.path, 'r') as handle:
            for record in json.load(handle):
                sample = (record["Problem"], record["linear_formula"], record["answer"])
                self.data.append(sample)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        problem_ids = self.tokanize_problem(i)
        solution_ids = []
        for token in ["<sos>"] + self.tokanize_sol(i) + ["<eos>"]:
            if token in self.sol_word2int:
                solution_ids.append(self.sol_word2int[token])
            else:
                # Tokens never seen in the training split map to <unk>.
                solution_ids.append(self.sol_word2int["<unk>"])
        return (problem_ids, solution_ids, self.data[i][2])



In [7]:
# Training split: builds the vocabularies that the other splits reuse.
train_path="/kaggle/input/new-wp/data/train.json"
train_data=load_data_train(train_path)
# Reshuffle the training samples every epoch; with shuffle=False each epoch
# would replay exactly the same batches in exactly the same order.
train_loader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=collate)

# Test split, served one sample at a time (used for beam-search decoding).
test_path="/kaggle/input/new-wp/data/test.json"
test_data=load_data_test(test_path,train_data)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False, collate_fn=collate)



# Validation split: batched loader for the loss, single-sample loader for decoding.
validation_path="/kaggle/input/new-wp/data/dev.json"
val_data=load_data_test(validation_path,train_data)
validation_loader = DataLoader(val_data, batch_size=64, shuffle=False, collate_fn=collate)
validation_loader1 = DataLoader(val_data, batch_size=1, shuffle=False, collate_fn=collate)


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:


class BERT_Encoder(nn.Module):
    """Frozen ``bert-base-cased`` encoder that returns per-token hidden states.

    NOTE(review): only gradients are disabled here; calling ``.train()`` on a
    parent module still switches the wrapped BERT into training mode.
    """

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        # Freeze every BERT weight so the optimizer never updates them.
        self.bert.requires_grad_(False)

    def forward(self, input_ids, attention_mask):
        """Run BERT and return ``last_hidden_state`` for the given ids and mask."""
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return bert_out.last_hidden_state

In [9]:
class Attention(nn.Module):
    """Additive attention over encoder states, conditioned on one decoder state.

    ``forward(encoder_outputs, hidden)`` takes ``encoder_outputs`` of shape
    (batch, src_len, enc_hid_dim) and ``hidden`` of shape (batch, dec_hid_dim)
    and returns weights of shape (batch, src_len) that sum to 1 along src_len.
    """

    def __init__(self, enc_hid_dim=768, dec_hid_dim=128):
        super().__init__()
        self.attn = nn.Linear(enc_hid_dim + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias=False)

    def forward(self, encoder_outputs, hidden):
        src_len = encoder_outputs.size(1)
        # Pair the single decoder state with every source position.
        tiled_hidden = hidden.unsqueeze(1).expand(-1, src_len, -1)
        # Decoder state comes first in the concatenation, encoder state second.
        features = torch.cat((tiled_hidden, encoder_outputs), dim=2)
        scores = self.v(torch.tanh(self.attn(features))).squeeze(2)
        return scores.softmax(dim=1)


In [10]:
class LSTM_Decoder(nn.Module):
    """Single-step LSTM decoder with attention over the encoder states.

    Args:
        output_dim: size of the output vocabulary (also the embedding table size).
        emb_dim: embedding width of the previous output token.
        enc_hid_dim: width of each encoder state.
        dec_hid_dim: LSTM hidden size.
        dropout: dropout probability applied to the token embedding.
    """

    def __init__(self, output_dim, emb_dim=200, enc_hid_dim=768, dec_hid_dim=128, dropout=0.5):
        super().__init__()
        # Forward the configured widths; building Attention with its own
        # defaults would break as soon as non-default dims are requested.
        self.attention = Attention(enc_hid_dim=enc_hid_dim, dec_hid_dim=dec_hid_dim)
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim + enc_hid_dim, dec_hid_dim, batch_first=True)
        self.fc_out = nn.Linear(enc_hid_dim + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell, encoder_outputs):
        """Decode one step.

        Args:
            input: (batch,) ids of the previous output tokens.
            hidden, cell: LSTM state, each (1, batch, dec_hid_dim).
            encoder_outputs: (batch, src_len, enc_hid_dim).

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` holds the raw
            (un-normalised) scores of shape (batch, output_dim).
        """
        embedded = self.dropout(self.embedding(input.unsqueeze(1)))      # (batch, 1, emb_dim)
        # hidden[0] drops the leading layer axis -> (batch, dec_hid_dim).
        attn_weights = self.attention(encoder_outputs, hidden[0]).unsqueeze(1)  # (batch, 1, src_len)
        weighted = torch.bmm(attn_weights, encoder_outputs)               # (batch, 1, enc_hid_dim)
        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=2).squeeze(1))
        return prediction, hidden, cell


In [11]:
class Seq2Seq(nn.Module):
    """BERT encoder + attention LSTM decoder.

    Args:
        max_len_pred: passed to the decoder as ``output_dim``, i.e. the size of
            the output vocabulary.
        device: stored on the instance as ``self.device``.
    """

    def __init__(self,max_len_pred=None,device=device):
        super().__init__()
        self.encoder = BERT_Encoder()
        self.decoder = LSTM_Decoder(output_dim=max_len_pred)
        self.device = device

    def forward(self, src,trg, src_mask,  teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[1] - 1`` steps.

        Args:
            src: (batch, src_len) encoder input ids.
            trg: (batch, trg_len) target ids; column 0 is fed as the first decoder input.
            src_mask: attention mask handed to the encoder.
            teacher_forcing_ratio: per-step probability of feeding the ground-truth
                token instead of the model's own argmax.

        Returns:
            ``(outputs, predicted_tokens)``: scores of shape
            (batch, trg_len, vocab) and their argmax over the vocabulary axis.
            Position 0 of ``outputs`` is never written and stays all-zero.
        """
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.embedding.num_embeddings
        hidden_size = self.decoder.rnn.hidden_size
        # Allocate directly on the device the inputs live on: this avoids a
        # CPU allocation + copy per call and keeps the buffers next to the data
        # even if the module was moved after construction.
        work_device = src.device
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size, device=work_device)
        encoder_outputs = self.encoder(src, src_mask)
        hidden = torch.zeros(1, batch_size, hidden_size, device=work_device)
        cell = torch.zeros(1, batch_size, hidden_size, device=work_device)
        input = trg[:, 0]

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[:, t, :] = output
            # One coin flip per time step, shared by the whole batch.
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = trg[:, t] if teacher_force else top1
        predicted_tokens = outputs.argmax(2)
        return outputs,predicted_tokens


In [12]:
# The decoder's output layer and embedding table are sized by the solution
# vocabulary built from the training split.
model=Seq2Seq(max_len_pred=len(train_data.sol_unique_words),device=device)
# The DataParallel wrapper is only applied when more than one GPU is visible;
# with zero or one GPU `model` stays a plain Seq2Seq (no `.module` attribute).
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
model=model.to(device)
def train(epochs):
    """Train the module-level ``model`` and report the loss after every epoch.

    Uses the module-level ``train_loader``, ``validation_loader`` and ``device``.
    Training batches use a teacher-forcing ratio of 0.6, validation batches 0.

    Args:
        epochs: number of passes over ``train_loader``.

    Returns:
        ``(train_l, val_l)``: per-epoch mean batch losses for the training and
        validation loaders.
    """
    st=time.time()
    # NOTE(review): the loss is taken over every target position, including
    # position 0 (whose scores are never written by the model) and padding.
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    train_l=[]
    val_l=[]
    for epoch in range(epochs):
        print(f"=======================epoch {epoch}================================")
        model.train()
        l=[]
        for j,data1 in enumerate(train_loader):
            optimizer.zero_grad()
            x=data1[0].to(device)
            y=data1[1].to(device)
            att=data1[2].to(device)
            o,w=model(x,y,att,teacher_forcing_ratio=0.6)
            # Flatten (batch, len, vocab) scores against (batch, len) targets.
            loss = criterion(o.reshape(-1, o.shape[2]), y.reshape(-1))
            loss.backward()
            optimizer.step()
            l.append(loss.item())
        train_l.append(np.mean(l))
        model.eval()
        l=[]
        # No gradients are needed for validation; skipping autograd avoids
        # building (and holding in memory) a graph for every decoding step.
        with torch.no_grad():
            for j,data1 in enumerate(validation_loader):
                x=data1[0].to(device)
                y=data1[1].to(device)
                att=data1[2].to(device)
                o,w=model(x,y,att,teacher_forcing_ratio=0)
                loss = criterion(o.reshape(-1, o.shape[2]), y.reshape(-1))
                l.append(loss.item())
        val_l.append(np.mean(l))
        print(f"epoch : {epoch} train_loss : {train_l[-1]} val_loss: {val_l[-1]} time taken : {(time.time()-st)/60}  ")
#         print("model saved!!")
#         torch.save(model, 'modelc_inloop.pth')
    return train_l,val_l

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [13]:
# import gc
# gc.collect()

# # Clear memory allocated by PyTorch
# torch.cuda.empty_cache()

In [14]:
# Train for 40 epochs; train() returns the per-epoch mean training and
# validation losses.
train_l,val_l=train(40)
print("training completed")

epoch : 0 train_loss : 0.6138025201616749 val_loss: 1.3670819262240796 time taken : 3.250207789738973  
epoch : 1 train_loss : 0.4132270635135712 val_loss: 1.504035510915391 time taken : 6.492456551392873  
epoch : 2 train_loss : 0.3896816933587674 val_loss: 1.517240673937696 time taken : 9.730725026130676  
epoch : 3 train_loss : 0.3766568413184535 val_loss: 1.4907272876577173 time taken : 12.968724763393402  
epoch : 4 train_loss : 0.36404849632132436 val_loss: 1.438018316918231 time taken : 16.204140325387318  
epoch : 5 train_loss : 0.3553561058255934 val_loss: 1.4427542597689527 time taken : 19.43970971107483  
epoch : 6 train_loss : 0.3460872680189148 val_loss: 1.4359818265793172 time taken : 22.66841716368993  
epoch : 7 train_loss : 0.33935243037919843 val_loss: 1.4360903653692692 time taken : 25.903385174274444  
epoch : 8 train_loss : 0.33275113307660625 val_loss: 1.4062115296404412 time taken : 29.136480375130972  
epoch : 9 train_loss : 0.32802319175774053 val_loss: 1.22748

In [15]:
# Saves the whole module object (including the nn.DataParallel wrapper when one
# was applied above), not just a state_dict.
# NOTE(review): loading such a file presumably needs the same class definitions
# to be importable -- confirm before relying on it outside this notebook.
torch.save(model, 'modelc_final_40_epoch.pth')

In [16]:
import csv
def gen_csv(l,name):
    """Write the values of ``l`` to ``<name>.csv`` as a single "Loss" column.

    The file is overwritten if it exists; the first row is the header "Loss",
    followed by one row per value.
    """
    with open(name + ".csv", mode='w', newline='') as file:
        out = csv.writer(file)
        out.writerow(["Loss"])  # header row
        out.writerows([loss] for loss in l)
# Persist the loss curves returned by train() as one-column CSV files
# (train_loss_model3.csv and val_loss_model3.csv in the working directory).
gen_csv(train_l,"train_loss_model3")
gen_csv(val_l,"val_loss_model3")
print("csv generated..")

csv generated..


# Beam search decoding

In [17]:
class BeamSearch3():
    """Beam-search decoder for a single sample.

    Args:
        model: the trained model. For ``model_type`` 3/4 it may be either the
            bare model or an ``nn.DataParallel`` wrapper around it.
        single_data: one collated batch of size 1; element 0 is the encoder
            input and, for model types 3/4, element 2 is the attention mask.
        model_type: 1/2 select the ``lstm_encoder``-based models, 3/4 the
            ``encoder``/``decoder`` (BERT) model.
        device: device the inputs and decoder state are placed on.
        max_target_len: length of the returned sequence (start token included).
        beam_size: number of hypotheses kept after every step.

    Hypotheses are stored as ``(token_ids, (h, c), cost)`` where ``cost`` is the
    accumulated negative log-probability (lower is better).
    """

    def __init__(self, model,single_data,model_type, device,  max_target_len=80, beam_size=beam_size):
        self.model = model
        self.device = device
        self.start_token = pos_special_tokens_sol[0]
        # Index 1 of the special-tag list is <eos>; used to stop expanding
        # hypotheses that are already complete.
        self.end_token = pos_special_tokens_sol[1]
        self.en_ht = None
        self.en_ct = None
        self.encoder_out=None
        self.decoder_input_token=None
        self.max_target_len = max_target_len
        self.beam_size = beam_size
        self.single_data=single_data
        self.model_type=model_type
        self.get_encoder_outputs(model_type)
        self.beam = [([self.start_token], (self.en_ht, self.en_ct), 0.0)]

    def _core_model(self):
        """Return the underlying model, unwrapping ``nn.DataParallel`` if present."""
        return getattr(self.model, "module", self.model)

    def get_encoder_outputs(self,model_type):
        """Encode the sample once and initialise the decoder state."""
        with torch.no_grad():
            if(model_type==1 or model_type==2):
                x=self.single_data[0].to(self.device)
                self.encoder_out, (self.en_ht, self.en_ct) = self.model.lstm_encoder(x)

            elif(model_type==3 or model_type==4):
                core = self._core_model()
                src=self.single_data[0].to(self.device)
                src_mask=self.single_data[2].to(self.device)
                batch_size = src.shape[0]
                hidden_size = core.decoder.rnn.hidden_size
                self.encoder_out = core.encoder(src, src_mask)
                # Same zero initial state that the model's own forward pass uses.
                self.en_ht = torch.zeros(1, batch_size, hidden_size, device=self.device)
                self.en_ct = torch.zeros(1, batch_size, hidden_size, device=self.device)

    def search(self):
        """Run the search and return a (1, max_target_len) tensor of token ids.

        Positions after the end of the best hypothesis are left at 0.
        """
        # Inference only: do not record an autograd graph for every step.
        with torch.no_grad():
            for _ in range(self.max_target_len - 1):
                if all(seq[-1] == self.end_token for seq, _, _ in self.beam):
                    break  # every surviving hypothesis already ended with <eos>
                self._expand_beam(self.model_type)
                self.beam.sort(key=lambda x: x[2])
                self.beam = self.beam[:self.beam_size]

        best_candidate = self.beam[0][0]
        decoded_words = self._construct_output(best_candidate)
        return decoded_words

    def _expand_beam(self,model_type):
        """Replace the beam by all one-token extensions of its open hypotheses."""
        new_beam = []
        for sequence, (ht, ct), score in self.beam:
            if sequence[-1] == self.end_token:
                # Finished hypothesis: carry it over unchanged so it keeps
                # competing with its final cost.
                new_beam.append((sequence, (ht, ct), score))
                continue

            prev_token = torch.LongTensor([sequence[-1]]).to(self.device)

            if(model_type==1):
                decoder_out, (ht, ct) = self.model.decoder(prev_token, (ht, ct))
            elif(model_type==2):
                decoder_out, (ht, ct) = self.model.lstm_decoder(prev_token, (ht, ct),self.encoder_out)
            elif(model_type==3 or model_type==4):
                decoder_out, ht, ct= self._core_model().decoder(prev_token, ht, ct,self.encoder_out)
            else:
                raise ValueError(f"unsupported model_type: {model_type!r}")

            decoder_out = decoder_out.squeeze(1)
            if(model_type==3 or model_type==4):
                # This decoder returns raw scores; normalise them so that the
                # accumulated cost is a real negative log-probability (a plain
                # log of raw scores is NaN for negative values).
                log_probs = torch.log_softmax(decoder_out, dim=1)
            else:
                # NOTE(review): kept as in the original; presumably these
                # decoders already return probabilities -- confirm.
                log_probs = torch.log(decoder_out)
            k = min(self.beam_size, log_probs.shape[1])
            top_vals, top_inds = log_probs.topk(k, dim=1)

            self._add_candidates(new_beam, sequence, ht, ct, score, top_vals, top_inds)

        self.beam = new_beam

    def _add_candidates(self, new_beam, sequence, ht, ct, score, top_vals, top_inds):
        """Append one extended hypothesis per entry of ``top_inds``.

        ``top_vals`` holds the log-probabilities matching ``top_inds``.
        """
        for j in range(top_inds.shape[1]):
            new_seq = sequence + [top_inds[0][j].item()]
            # Costs are plain floats so that sorting never compares tensors.
            updated_score = score - top_vals[0][j].item()
            new_beam.append((new_seq, (ht, ct), updated_score))

    def _construct_output(self, best_candidate):
        """Copy the token ids into a zero-initialised (1, max_target_len) tensor."""
        decoded_words = torch.zeros(1, self.max_target_len)
        for t, idx in enumerate(best_candidate):
            decoded_words[:, t] = torch.LongTensor([idx])
        return decoded_words

In [18]:
def generate_csv(model,data,loader):
    """Greedy-decode every batch of ``loader`` and map the ids back to tokens.

    Args:
        model: called as ``model(src, trg, src_mask, teacher_forcing_ratio=0)``
            and expected to return ``(scores, predicted_token_ids)``.
        data: object exposing ``sol_int2word`` (id -> token).
        loader: iterable of batches whose elements 0, 1 and 2 are the problem
            ids, the solution ids and the problem attention mask.

    Returns:
        A list with one list of token strings per sample.
    """
    model.eval()
    prediction=[]
    with torch.no_grad():  # inference only
        for batch in loader:
            x=batch[0].to(device)
            y=batch[1].to(device)
            src_mask=batch[2].to(device)
            # The model takes the attention mask as third argument and names
            # the teacher-forcing switch `teacher_forcing_ratio` (not `tf`).
            _,predicted_tokens=model(x,y,src_mask,teacher_forcing_ratio=0)
            prediction.append(predicted_tokens)
    predicted_data_strings=[]
    for batch_tokens in prediction:
        for one_data in batch_tokens:
            token_ids=one_data.to(torch.int).tolist()
            predicted_data_strings.append([data.sol_int2word[token_id] for token_id in token_ids])
    return predicted_data_strings

In [19]:
def extract_sol(data2str):
    """Join each token sequence into one formula string.

    The first token of every sequence is skipped; the remaining tokens are
    concatenated (via ``str``) up to, but not including, the first "<eos>" or
    "<pad>". Returns one string per input sequence.
    """
    formulas = []
    for tokens in data2str:
        pieces = []
        for token in tokens[1:]:
            if token == "<eos>" or token == "<pad>":
                break
            pieces.append(str(token))
        formulas.append("".join(pieces))
    return formulas

In [20]:
def generate_sol(int2data,name,data_name):
    """Write the predictions next to their source records as ``<name>.json``.

    ``data_name.data`` supplies ``(problem, linear_formula, answer)`` records
    and ``int2data`` the predicted strings; the two are paired positionally and
    the shorter one decides how many entries are written.
    """
    json_data = [
        {
            "Problem": record[0],
            "answer": record[2],
            "predicted": predicted,
            "linear_formula": record[1],
        }
        for record, predicted in zip(data_name.data, int2data)
    ]
    with open(str(name)+".json", 'w') as json_file:
        json.dump(json_data, json_file, indent=4)
    print("prediction json generated!!")


In [21]:
def convtostr(prediction,data):
    """Translate predicted id tensors into lists of solution tokens.

    ``prediction`` is an iterable of batches, each an iterable of 1-D tensors;
    every value is cast to int and looked up in ``data.sol_int2word``.
    Returns one token list per sequence, batches flattened in order.
    """
    decoded = []
    for batch in prediction:
        for sequence in batch:
            token_ids = sequence.to(torch.int)
            decoded.append([data.sol_int2word[token_id.item()] for token_id in token_ids])
    return decoded

In [22]:
def get_me_final_file(op_test,data,file_name):
    """Turn decoded id tensors into formula strings and write them to ``<file_name>.json``.

    Chains ``convtostr`` -> ``extract_sol`` -> ``generate_sol``.
    """
    token_lists = convtostr(op_test, data)
    formulas = extract_sol(token_lists)
    generate_sol(formulas, file_name, data)

In [23]:
# Beam-search decode the validation split and then the test split, one sample
# at a time, and write each split's predictions to its own JSON file.
for split_loader, split_data, split_name in (
    (validation_loader1, val_data, "valid_beam_modelC"),
    (test_loader, test_data, "test_beam_modelC"),
):
    op_test=[]
    for i,one_data in enumerate(split_loader):
        beam=BeamSearch3(model=model,single_data=one_data,model_type=3, device=device,  max_target_len=300, beam_size=10)
        op_test.append(beam.search())
        print(f"{i}th data completed")
    get_me_final_file(op_test,split_data,split_name)

0th data completed
1th data completed
2th data completed
3th data completed
4th data completed
5th data completed
6th data completed
7th data completed
8th data completed
9th data completed
10th data completed
11th data completed
12th data completed
13th data completed
14th data completed
15th data completed
16th data completed
17th data completed
18th data completed
19th data completed
20th data completed
21th data completed
22th data completed
23th data completed
24th data completed
25th data completed
26th data completed
27th data completed
28th data completed
29th data completed
30th data completed
31th data completed
32th data completed
33th data completed
34th data completed
35th data completed
36th data completed
37th data completed
38th data completed
39th data completed
40th data completed
41th data completed
42th data completed
43th data completed
44th data completed
45th data completed
46th data completed
47th data completed
48th data completed
49th data completed
50th data 