In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
def clean_file(input_file_path, output_file_path, has_target=True):
    """
    Cleans the input file by removing start and end ' tokens and ensuring no space before the target text.
    Writes the cleaned lines to the output file.

    Parameters:
    - input_file_path: str, path to the input file
    - output_file_path: str, path to the output file
    - has_target: bool, indicates if the file contains target text (default is True)
    """
    # Read the file and process each line
    with open(input_file_path, 'r') as file:
        lines = file.readlines()

    # Remove the start and end ' tokens and ensure no space before the target text
    cleaned_lines = []
    for line in lines:
        # Remove start and end ' tokens
        line = line.replace("'", "").strip()
        # Split the line into input and target parts if target exists
        if has_target:
            parts = line.split(',')
            if len(parts) == 2:
                input_part = parts[0].strip()
                target_part = parts[1].strip()
                cleaned_line = f"{input_part},{target_part}"
                cleaned_lines.append(cleaned_line)
        else:
            cleaned_lines.append(line)

    # Write the cleaned lines to a new file
    with open(output_file_path, 'w') as file:
        for line in cleaned_lines:
            file.write(line + '\n')

    print(f"Cleaned lines have been written to {output_file_path}")

# Define the input and output file paths for training, validation, and test datasets
train_input_file_path = 'Assignment2_train.txt'
train_output_file_path = 'Assignment2_train_cleaned.txt'

validation_input_file_path = 'Assignment2_validation.txt'
validation_output_file_path = 'Assignment2_validation_cleaned.txt'

test_input_file_path = 'Assignment2_Test.txt'
test_output_file_path = 'Assignment2_est_cleaned.txt'

# Clean the training, validation, and test files
clean_file(train_input_file_path, train_output_file_path, has_target=True)
clean_file(validation_input_file_path, validation_output_file_path, has_target=True)
clean_file(test_input_file_path, test_output_file_path, has_target=False)

Cleaned lines have been written to Assignment2_train_cleaned.txt
Cleaned lines have been written to Assignment2_validation_cleaned.txt
Cleaned lines have been written to Assignment2_est_cleaned.txt


In [3]:
#reading data
import pandas as pd
df = pd.read_csv('/data4/home/prabhasreddy/DL_NLP/Assignment-2/Assignment2_train_cleaned.txt', sep=",", header=None)
print(df)

                           0           1
0               march 8 1758  1758-03-08
1           17 february 1709  1709-02-17
2                13 may 1786  1786-05-13
3               17 june 1626  1626-06-17
4               july 25 1851  1851-07-25
...                      ...         ...
35995            jun 16 2050  2050-06-16
35996   tuesday july 22 1524  1524-07-22
35997       28 december 1870  1870-12-28
35998  sun 1655 19 september  1655-09-19
35999       1581 12 november  1581-11-12

[36000 rows x 2 columns]


In [4]:
df.columns = ['dates','translated_dates']

# For validation data
df_valid = pd.read_csv('/data4/home/prabhasreddy/DL_NLP/Assignment-2/Assignment2_Test_cleaned.txt', sep=",", header=None)
df_valid.columns = ['dates']

In [5]:
df.head()

Unnamed: 0,dates,translated_dates
0,march 8 1758,1758-03-08
1,17 february 1709,1709-02-17
2,13 may 1786,1786-05-13
3,17 june 1626,1626-06-17
4,july 25 1851,1851-07-25


In [6]:
#built vocab
src_vocab={}
src_vocab['<st>']=0
src_vocab['<end>']=1
src_vocab['<pad>']=2
k=3
for i in df['dates']:
  for j in i:
    if j not in src_vocab:
      src_vocab[j]=k
      k+=1

tgt_vocab = {}
tgt_vocab['<st>']=0
tgt_vocab['<end>']=1
tgt_vocab['<pad>']=2
k=3
for i in df['translated_dates']:
  for j in i:
    if j not in tgt_vocab:
      tgt_vocab[j]=k
      k+=1

In [7]:
tgt_vocab

{'<st>': 0,
 '<end>': 1,
 '<pad>': 2,
 '1': 3,
 '7': 4,
 '5': 5,
 '8': 6,
 '-': 7,
 '0': 8,
 '3': 9,
 '9': 10,
 '2': 11,
 '6': 12,
 '4': 13}

In [9]:
inverse_src_vocab = {v: k for k, v in src_vocab.items()}
inverse_tgt_vocab = {v: k for k, v in tgt_vocab.items()}

In [70]:
from torch.utils.data import Dataset, DataLoader
import torch

class CustomDatesDataset(Dataset):
    def __init__(self, df, src_vocab, tgt_vocab):
        self.df = df
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        src_text = self.df.iloc[idx, 0]
        tgt_text = self.df.iloc[idx, 1]

        src_tokens = [self.src_vocab[char] for char in src_text]
        tgt_tokens = [self.tgt_vocab[char] for char in tgt_text]

        return src_tokens, tgt_tokens

def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)

    src_max_len = max([len(src) for src in src_batch])
    tgt_max_len = max([len(tgt) for tgt in tgt_batch])

    src_padded = []
    tgt_padded = []

    for src, tgt in zip(src_batch, tgt_batch):
        src_padded.append([src_vocab['<st>']] + src + [src_vocab['<end>']] + [src_vocab['<pad>']] * (src_max_len - len(src)))
        tgt_padded.append([tgt_vocab['<st>']] + tgt + [tgt_vocab['<end>']] + [tgt_vocab['<pad>']] * (tgt_max_len - len(tgt)))

    src_padded = torch.tensor(src_padded, dtype=torch.long)
    tgt_padded = torch.tensor(tgt_padded, dtype=torch.long)

    return src_padded, tgt_padded

# Build dataloaders
train_dataset = CustomDatesDataset(df, src_vocab, tgt_vocab)
train_dataloader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_fn)


valid_dataset = CustomDatesDataset(df_valid, src_vocab, tgt_vocab)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, collate_fn=collate_fn)

In [10]:
for input,output in train_dataloader:
  print(input.shape,output.shape)
  break

# for input,output in valid_dataloader:
#   print(input.shape,output.shape)
#   break

torch.Size([32, 26]) torch.Size([32, 12])


In [13]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, embed_dim, hidden_dim, num_layers, dropout=0.2):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(input_dim, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.dropout_layer = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout_layer(self.embedding(src))  # [batch_size, src_len, embed_dim]
        outputs, hidden = self.gru(embedded)  # outputs: [batch_size, src_len, hidden_dim], hidden: [num_layers, batch_size, hidden_dim]
        return outputs, hidden

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.hidden_dim = hidden_dim
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden: [batch_size, hidden_dim]
        # encoder_outputs: [batch_size, src_len, hidden_dim]
        
        src_len = encoder_outputs.shape[1]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)  # [batch_size, src_len, hidden_dim]
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))  # [batch_size, src_len, hidden_dim]
        attention = self.v(energy).squeeze(2)  # [batch_size, src_len]
        return F.softmax(attention, dim=1)  # [batch_size, src_len]

In [15]:
import torch
import torch.nn as nn

class Decoder(nn.Module):
    def __init__(self, output_dim, embed_dim, hidden_dim, num_layers, attention, dropout=0.2):
        super(Decoder, self).__init__()
        self.output_dim = output_dim
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.attention = attention

        self.embedding = nn.Embedding(output_dim, embed_dim)
        self.gru = nn.GRU(embed_dim + hidden_dim, hidden_dim, num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout_layer = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        # input: [batch_size]
        # hidden and encoder_outputs are from the encoder initially
        # hidden: [num_layers, batch_size, hidden_dim]
        # encoder_outputs: [batch_size, src_len, hidden_dim]
        
        input = input.unsqueeze(1)  # [batch_size, 1]
        embedded = self.dropout_layer(self.embedding(input))  # [batch_size, 1, embed_dim]
        
        attn_weights = self.attention(hidden[-1], encoder_outputs)  # [batch_size, src_len]
        attn_weights = attn_weights.unsqueeze(1)  # [batch_size, 1, src_len]
        
        context = torch.bmm(attn_weights, encoder_outputs)  # [batch_size, 1, hidden_dim]
        
        rnn_input = torch.cat((embedded, context), dim=2)  # [batch_size, 1, embed_dim + hidden_dim]
        
        output, hidden = self.gru(rnn_input, hidden)  # output: [batch_size, 1, hidden_dim], hidden: [num_layers, batch_size, hidden_dim]
        
        output = output.squeeze(1)  # [batch_size, hidden_dim]
        context = context.squeeze(1)  # [batch_size, hidden_dim]
        #not needed , just outputs can also be passed
        prediction = self.fc(torch.cat((output, context), dim=1))  # [batch_size, output_dim]
        
        return prediction, hidden

In [16]:
# encoder = Encoder(len(src_vocab),128,256,1)

# sample_input = torch.randint(0, len(src_vocab), (32, 10))  # (batch_size, sequence_length)
# print("Input shape:", sample_input.shape)

# # Pass the sample input through the encoder
# output, hidden = encoder(sample_input)

# # Print the output and hidden shapes
# print("Output shape:", output.shape)
# print("Hidden shape:", hidden.shape)

In [17]:
# #sample input for attention
# attention = Attention(256)


# encoder_output = torch.randn(32, 12, 256)
# dec_hidden = torch.randn(32, 256)
# outputs = attention(dec_hidden, encoder_output)
# #outputs will be of shape (batch_dim* input_seq_len) whree each row sums to 1
# print(outputs.shape)

In [18]:
# decoder = Decoder(len(tgt_vocab),128,256,1)

# sample_input = torch.randint(0, len(tgt_vocab), (32, 2))  # (batch_size, sequence_length)
# print("Input shape:", sample_input.shape)
# print('encoder hidden shapes:', hidden.shape)
# # Pass the sample input through the encoder
# predictions,_ = decoder(sample_input,hidden)

# # Print the output and hidden shapes
# print("Output shape:", predictions.shape)

In [19]:
import torch
import torch.nn as nn

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        tgt_vocab_size = self.decoder.output_dim

        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(src)

        input = tgt[:, 0]  # <sos> token

        for t in range(1, tgt_len):
            output, hidden = self.decoder(input, hidden, encoder_outputs)
            outputs[:, t, :] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = tgt[:, t] if teacher_force else top1

        return outputs

    def inference(self, src, tgt_len):
        self.eval()
        with torch.no_grad():
            encoder_outputs, hidden = self.encoder(src)
            input = torch.tensor([0] * src.size(0)).to(self.device)  # <sos> token
            generated_sequences = [[] for _ in range(src.size(0))]

            for _ in range(tgt_len):
                output, hidden = self.decoder(input, hidden, encoder_outputs)
                top1 = output.argmax(1)
                input = top1
                for i in range(src.size(0)):
                    generated_sequences[i].append(top1[i].item())

            return generated_sequences

In [20]:
# encodersample = Encoder(len(src_vocab),128,256,1)
# decodersample = Decoder(len(tgt_vocab),128,256,1)
# seq2seq = Seq2Seq(encodersample,decodersample)

# sample_encoder_input = torch.randint(0, len(src_vocab), (32, 10))  # (batch_size, sequence_length)
# sample_decoder_input = torch.randint(0, len(tgt_vocab), (32, 5))  # (batch_size, sequence_length)

# output = seq2seq(sample_encoder_input,sample_decoder_input)
# print(output.shape)

In [21]:
import torch.optim as optim

# Hyperparameters
input_dim = len(src_vocab)  # Example input dimension
output_dim = len(tgt_vocab)  # Example output dimension
embed_dim = 128
hidden_dim = 256
num_layers = 1
dropout = 0.2
num_epochs = 12
learning_rate = 0.001

# Initialize models
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
attention = Attention(hidden_dim)
encoder = Encoder(input_dim, embed_dim, hidden_dim, num_layers, dropout)
decoder = Decoder(output_dim, embed_dim, hidden_dim, num_layers, attention, dropout)
seq2seq = Seq2Seq(encoder, decoder, device).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Assuming 0 is the padding index
optimizer = optim.Adam(seq2seq.parameters(), lr=learning_rate)



In [22]:
# Training loop
for epoch in range(num_epochs):
    seq2seq.train()
    epoch_loss = 0

    for src, tgt in train_dataloader:
        src = src.to(device)
        tgt = tgt.to(device)

        optimizer.zero_grad()
        output = seq2seq(src, tgt)

        # Output shape: [batch_size, tgt_len, output_dim]
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        tgt = tgt[:, 1:].reshape(-1)

        loss = criterion(output, tgt)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_dataloader)}')
    if epoch % 3 == 0:
        torch.save(seq2seq.state_dict(), f'seq2seq_{epoch}.pt')
        
print('Training Finished')

Epoch 1/12, Loss: 0.12659820778564446
Epoch 2/12, Loss: 0.01156341396137658
Epoch 3/12, Loss: 0.011178980942634451
Epoch 4/12, Loss: 0.010212213340525826
Epoch 5/12, Loss: 0.010279141996573243
Epoch 6/12, Loss: 0.009868463377526495
Epoch 7/12, Loss: 0.00936404489133727


KeyboardInterrupt: 

In [62]:
#save the model and load
torch.save(seq2seq.state_dict(), 'seq2seq.pth')

In [22]:
seq2seq = Seq2Seq(encoder, decoder, device).to(device)
seq2seq.load_state_dict(torch.load('seq2seq.pt'))

  seq2seq.load_state_dict(torch.load('seq2seq_6.pt'))


<All keys matched successfully>

In [24]:
# Inference
max_tgt_len = 12
predictions = []
seq2seq.eval()
seq2seq = seq2seq.to(device)  # Ensure the model is on the correct device

for src in valid_dataloader:
    src = src.to(device)
    
    outputs = seq2seq.inference(src, max_tgt_len)
    
    # Apply inverse vocab
    for output in outputs:
        pred = ''
        for token in output:
            if inverse_tgt_vocab[token] == '<st>':
                pred += '0'
            elif inverse_tgt_vocab[token] == '<end>':
                break
            elif inverse_tgt_vocab[token] == '<pad>':
                continue
            else:
                pred += inverse_tgt_vocab[token]
               
        predictions.append(pred)


In [25]:
predictions[:3]

['1733-08-14', '1625-11-24', '1723-01-24']

In [None]:
references = df_valid['dates'].tolist()

In [33]:
#Evaluation metrics
correct =0
for i in range(len(df_valid)):
    reference = references[i]
    candidate = predictions[i]
    
    if reference == candidate:
        correct+=1


accuracy = correct/len(df_valid)
print('Accuracy on Validation data:',accuracy)

In [77]:
"""

calculate Average Validation Set Error in % (using "Exact Match over all 10 outputs" as a metric),
Average Validation Set Error in % (number of mismatches averaged over all 10 outputs)(since the ouput is always 10 characters long),
Numbering the outputs from 1 to 10 (1 for the most significant digit of the year and 10 for the least significant digit of the date), the validation set error (average number of mismatches) for which output was the highest?
Numbering the outputs from 1 to 10 (1 for the most significant digit of the year and 10 for the least significant digit of the date), the validation set error (average number of mismatches) for which output was the lowest?
"""

def calculate_all_errors(actual_outputs, predicted_outputs):
    
    exact_match_error = 0
    mismatch_error = 0
    position_errors = [0]*10
    
    less_than_10 = 0
    
    for actual, predicted in zip(actual_outputs, predicted_outputs):
        
        if len(actual) != 10 or len(predicted) != 10:
            less_than_10 += 1
            continue
        
        exact_match_error += 1 if actual == predicted else 0
        for i in range(10):
            mismatch_error += 1 if actual[i] != predicted[i] else 0
            position_errors[i] += 1 if actual[i] != predicted[i] else 0
            
    highest_error = position_errors.index(max(position_errors)) + 1
    lowest_error = position_errors.index(min(position_errors)) + 1
    
    print("Excat matches : ", exact_match_error)
    print("Less than 10 : ", less_than_10)
        
    exact_match_error = (exact_match_error/len(actual_outputs))*100
    mismatch_error = (mismatch_error/(len(actual_outputs)*10))*100
    
    return exact_match_error, mismatch_error, highest_error, lowest_error

exact_match_error, mismatch_error, highest_error, lowest_error = calculate_all_errors(actual_targets, predictions)


print('Exact Match Error: ', 100-exact_match_error)
print('Mismatch Error: ', mismatch_error)
print('Highest Error: ', highest_error)
print('Lowest Error: ', lowest_error)

Excat matches :  3861
Less than 10 :  0
Exact Match Error:  3.4749999999999943
Mismatch Error:  0.445
Highest Error:  2
Lowest Error:  3


In [None]:
#end of code