In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
df = pd.read_csv('./Assignment2aDataset.txt',header=None).dropna()
df.columns = ['description','target']
print(df.describe())

          description         target
count           40000          40000
unique          39429          36260
top     'aug 14 1641'   '2042-04-16'
freq                3              4


In [3]:
# Define preprocessing functions to convert text to numerical values
def text_to_tensor(text, vocab):
    return [vocab[char] for char in text]

In [4]:
def pad_sequence(sequence, max_length):
    return sequence + [0] * (max_length - len(sequence))

In [5]:
# Create vocabulary mappings for characters
vocab = {char: idx+1 for idx, char in enumerate("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'/ ")}
vocab['<pad>'] = 0

In [12]:
# Convert dataset to numerical format
max_seq_length = max(len(date) for date in df['description'])
print(max_seq_length)
numerical_data = [(text_to_tensor(row['description'], vocab), text_to_tensor(row['target'], vocab)) for index, row in df.iterrows()]
print(type(numerical_data))
print(len(numerical_data))


29
<class 'list'>
40000


In [13]:
# Define the Encoder-Decoder model with attention
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input_seq):
        embedded = self.embedding(input_seq)
        output, hidden = self.gru(embedded)
        return output, hidden

In [14]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, hidden, encoder_outputs):
        seq_len = encoder_outputs.size(0)
        hidden = hidden.repeat(seq_len, 1, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = torch.softmax(torch.sum(self.v * energy, dim=2), dim=0)
        return attention


In [23]:
class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_step, last_hidden, encoder_outputs):
        input_step = torch.tensor(input_step,dtype=torch.float32)
        embedded = self.embedding(input_step)
        output, hidden = self.gru(embedded, last_hidden)
        attn_weights = attention(last_hidden[-1], encoder_outputs)
        context = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
        output = output + context
        output = output.squeeze(0)
        output = self.out(output)
        output = self.softmax(output)
        return output, hidden


In [24]:
hidden_size = 100
learning_rate = 0.001
num_epochs = 10

# Initialize model and optimizer
encoder = Encoder(len(vocab), hidden_size)
attention = Attention(hidden_size)
decoder = Decoder(len(vocab), hidden_size)

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

criterion = nn.NLLLoss()

In [25]:
# Training loop
for epoch in range(num_epochs):
    for input_seq, target_seq in numerical_data:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        input_length = len(input_seq)
        target_length = len(target_seq)

        encoder_outputs, encoder_hidden = encoder(input_seq)
        decoder_input = torch.tensor([vocab['<pad>']])

        loss = 0

        for di in range(target_length):
            decoder_output, decoder_hidden = decoder(decoder_input, encoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()
            loss += criterion(decoder_output, target_seq[di])

        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')


TypeError: embedding(): argument 'indices' (position 2) must be Tensor, not list

In [26]:
# Inference and visualization of attention
def translate_date(input_date):
    input_tensor = torch.tensor(text_to_tensor(input_date, vocab))
    encoder_outputs, encoder_hidden = encoder(input_tensor)
    decoder_input = torch.tensor([vocab['<pad>']])
    decoded_words = []

    for di in range(max_seq_length):
        decoder_output, decoder_hidden = decoder(decoder_input, encoder_hidden, encoder_outputs)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()
        if decoder_input.item() == vocab['<pad>']:
            break
        else:
            decoded_words.append([key for key, value in vocab.items() if value == decoder_input.item()][0])

    return ''.join(decoded_words)

In [22]:
input_date = "Saturday 29 February 2021"
output_date = translate_date(input_date)
print(f"Input Date: {input_date}")
print(f"Translated Date: {output_date}")

RuntimeError: Tensors must have same number of dimensions: got 3 and 2

In [38]:
import phonenumbers
from phonenumbers import geocoder
phonenumber_1 = phonenumbers.parse("+917984209358")
phonenumber_2 = phonenumbers.parse("+919898578717")
# print(geocoder.description_for_number(phonenumber_1,'en'))
print(geocoder.description_for_number(phonenumber_2,'en'))
print(geocoder.description_for_valid_number(phonenumber_1,'en'))

India
Ahmedabad Local, Gujarat
