In [1]:
import numpy as np
import pandas as pd

import wandb
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from tqdm.auto import tqdm

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Dataset location. Each split is stored at
# <DATA_DIR>/<LANG_CODE>/<LANG_CODE>_<split>.csv, so switching to another
# checkout or language only requires editing these two values.
DATA_DIR = '/Users/pratikkadlak/Pratik/IITM/SEM_2/Deep_Learning/Assignment_3/aksharantar_sampled'
LANG_CODE = 'mar'

# File paths
train_file = f'{DATA_DIR}/{LANG_CODE}/{LANG_CODE}_train.csv'
test_file = f'{DATA_DIR}/{LANG_CODE}/{LANG_CODE}_test.csv'
val_file = f'{DATA_DIR}/{LANG_CODE}/{LANG_CODE}_valid.csv'


def read_word_pairs(csv_path):
    """Read a header-less CSV of word pairs, keeping every cell as a string.

    With pandas' default NA handling, cells such as "null", "nan" or "NA" are
    parsed as missing values (float NaN), which would later break the
    per-character processing of the words. `dtype=str` together with
    `keep_default_na=False` keeps such words as ordinary strings.
    """
    return pd.read_csv(csv_path, header=None, dtype=str, keep_default_na=False)


# Read data
train_data = read_word_pairs(train_file)
test_data = read_word_pairs(test_file)
val_data = read_word_pairs(val_file)

# Split into English and Marathi words (column 0 and column 1 of each file)
english_train = train_data.iloc[:, 0]
marathi_train = train_data.iloc[:, 1]

english_test = test_data.iloc[:, 0]
marathi_test = test_data.iloc[:, 1]

english_val = val_data.iloc[:, 0]
marathi_val = val_data.iloc[:, 1]

In [3]:
def create_char_list(words):
    """Collect the alphabet used by `words` and the length of the longest word.

    Args:
        words: iterable of words (each word is itself iterated character by
            character and must support `len`).

    Returns:
        A tuple `(chars, longest)` where `chars` is the sorted list of distinct
        characters and `longest` is the largest word length, or -1 when
        `words` is empty.
    """
    seen = set()
    longest = -1
    for token in words:
        token_len = len(token)
        if token_len > longest:
            longest = token_len
        seen.update(token)
    return sorted(seen), longest

def find_max_length(word_list):
    """Return the length of the longest entry in `word_list`, or -1 if it is empty."""
    return max((len(entry) for entry in word_list), default=-1)

# Build each alphabet from the training split only; the longest training word
# length is returned alongside it.
english_chars, english_max_len = create_char_list(english_train)
marathi_chars, marathi_max_len = create_char_list(marathi_train)

# Widen both maxima so they also cover the longest validation and test words.
english_max_len = max(
    english_max_len,
    *(find_max_length(split) for split in (english_val, english_test))
)
marathi_max_len = max(
    marathi_max_len,
    *(find_max_length(split) for split in (marathi_val, marathi_test))
)

In [4]:
def word_to_vector(word, lang):
    """Encode `word` as a fixed-length list of integer character indices.

    Layout of the returned list:
      * position 0 holds the start marker, `len(chars) + 1`;
      * then one entry per character of `word`: its 1-based position in the
        language's character list;
      * then zeros, so that the final list has `max_len + 2` entries. Index 0
        therefore serves as both padding and the trailing end marker.

    Characters that are not in the language's character list are skipped
    silently. A word longer than `max_len` is not truncated, so its vector is
    longer than `max_len + 2`.

    Args:
        word: the word to encode.
        lang: "english" selects `english_chars` / `english_max_len`; any other
            value selects `marathi_chars` / `marathi_max_len`.

    Returns:
        list[int]: the encoded word.
    """
    if lang == "english":
        chars, max_len = english_chars, english_max_len
    else:
        chars, max_len = marathi_chars, marathi_max_len

    # One dictionary build per call replaces a scan of the whole alphabet for
    # every character. Assumes the character list holds distinct entries, as
    # the lists produced by create_char_list do.
    index_of = {char: position for position, char in enumerate(chars, start=1)}

    vector = [len(chars) + 1]  # start-of-word marker
    vector.extend(index_of[char] for char in word if char in index_of)

    # Pad with zeros up to max_len + 1 entries (a non-positive count adds nothing) ...
    vector.extend([0] * (max_len + 1 - len(vector)))
    # ... then always close with one more zero as the end marker.
    vector.append(0)
    return vector

In [5]:
# Encode a whole collection of words (English or Marathi) as a list of index vectors.
def word_matrix(words, language):
    """Return `word_to_vector(word, language)` for every word in `words`, in order."""
    return [word_to_vector(entry, language) for entry in words]

In [6]:
# Training split: encode every word as a row of indices, then wrap the rows in tensors.
english_word_representations = word_matrix(english_train, "english")
marathi_word_representations = word_matrix(marathi_train, "marathi")
english_matrix, marathi_matrix = map(
    torch.tensor, (english_word_representations, marathi_word_representations)
)

# Validation split, processed the same way.
english_word_representations_val = word_matrix(english_val, "english")
marathi_word_representations_val = word_matrix(marathi_val, "marathi")
english_matrix_val, marathi_matrix_val = map(
    torch.tensor, (english_word_representations_val, marathi_word_representations_val)
)

# Test split, processed the same way.
english_word_representations_test = word_matrix(english_test, "english")
marathi_word_representations_test = word_matrix(marathi_test, "marathi")
english_matrix_test, marathi_matrix_test = map(
    torch.tensor, (english_word_representations_test, marathi_word_representations_test)
)

In [7]:
class Encoder(nn.Module):
    """Embeds index sequences and runs them through an RNN, LSTM or GRU.

    Args:
        input_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_size: hidden state size of the recurrent layer.
        num_layers: number of stacked recurrent layers.
        batch_size: stored on the instance; not used by the code in this class.
        dropout_prob: dropout probability applied to the embeddings and, when
            `num_layers > 1`, between the recurrent layers.
        bidirectional: whether the recurrent layer is bidirectional.
        cell_type: "RNN" or "LSTM"; any other value selects a GRU.
    """

    def __init__(self, input_size, embedding_dim, hidden_size, num_layers, batch_size, dropout_prob, bidirectional, cell_type):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.dropout = nn.Dropout(dropout_prob)
        self.bidirectional = bidirectional
        # Misspelled name kept as an alias so existing code reading it still works.
        self.biderectional = bidirectional
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.cell_type = cell_type

        if cell_type == "RNN":
            rnn_class = nn.RNN
        elif cell_type == "LSTM":
            rnn_class = nn.LSTM
        else:
            # Any unrecognised cell_type falls back to a GRU.
            rnn_class = nn.GRU

        # PyTorch applies recurrent dropout only between stacked layers; with a
        # single layer a non-zero value has no effect and triggers a
        # UserWarning, so pass 0 in that case.
        rnn_dropout = dropout_prob if num_layers > 1 else 0.0
        self.rnn = rnn_class(embedding_dim, hidden_size, num_layers, dropout=rnn_dropout, bidirectional=bidirectional)

    def forward(self, input):
        """Embed `input`, apply dropout, and run the recurrent layer.

        The recurrent layer is built with PyTorch's default
        `batch_first=False`, so it treats dimension 0 of the embedded input as
        the sequence dimension.

        Returns:
            `(output, hidden, cell)` for an LSTM, `(output, hidden)` otherwise.
        """
        embedded = self.dropout(self.embedding(input))
        output, state = self.rnn(embedded)
        if self.cell_type == "LSTM":
            hidden, cell = state
            return output, hidden, cell
        return output, state

    def init_hidden(self):
        """Return a zero tensor of shape (1, 1, hidden_size) on `device`."""
        # NOTE(review): the shape ignores num_layers, bidirectional and
        # batch_size, so it only matches a 1-layer, unidirectional encoder fed
        # one sequence at a time. forward() does not use it; confirm with
        # callers before changing the shape.
        return torch.zeros(1, 1, self.hidden_size, device=device)