In [None]:
import numpy as np
import pandas as pd

import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# File paths
train_file = '/kaggle/input/aksharantar/aksharantar_sampled/mar/mar_train.csv'
test_file = '/kaggle/input/aksharantar/aksharantar_sampled/mar/mar_test.csv'
val_file = '/kaggle/input/aksharantar/aksharantar_sampled/mar/mar_valid.csv'


def _read_word_pairs(path):
    """Load a header-less CSV of word pairs with every cell kept as a string.

    ``dtype=str`` combined with ``keep_default_na=False`` stops pandas from
    converting tokens such as "nan", "null" or "NA" into float NaN; a NaN
    entry would break the later ``len(word)`` and per-character processing.

    Args:
        path: location of the CSV file to read.

    Returns:
        pandas.DataFrame with integer column labels and string cells.
    """
    return pd.read_csv(path, header=None, dtype=str, keep_default_na=False)


# Read data
train_data = _read_word_pairs(train_file)
test_data = _read_word_pairs(test_file)
val_data = _read_word_pairs(val_file)

# Split into English and Marathi words (column 0 and column 1 respectively)
english_train = train_data.iloc[:, 0]
marathi_train = train_data.iloc[:, 1]

english_test = test_data.iloc[:, 0]
marathi_test = test_data.iloc[:, 1]

english_val = val_data.iloc[:, 0]
marathi_val = val_data.iloc[:, 1]

In [None]:
def create_char_list(words):
    """Collect the sorted set of characters used in ``words``.

    Args:
        words: iterable of words (each word is itself iterable and sized).

    Returns:
        tuple ``(chars, longest)`` where ``chars`` is a sorted list of the
        distinct characters seen and ``longest`` is the length of the longest
        word, or -1 when ``words`` is empty.
    """
    seen = set()
    longest = -1
    for token in words:
        longest = max(longest, len(token))
        # A set keeps each character once; ordering is applied at the end.
        seen.update(token)
    return sorted(seen), longest

def find_max_length(words_list):
    """Return the length of the longest word in ``words_list``.

    ``words_list`` may be a flat collection of words (list, pandas Series, ...)
    or a collection of such collections. A string item is measured directly;
    iterating into it would only ever see single characters and report 1.

    Args:
        words_list: iterable of words, or iterable of iterables of words.

    Returns:
        int: the maximum word length found, or -1 if there are no words.
    """
    max_length = -1
    for item in words_list:
        if isinstance(item, str):
            # Flat input: the item is itself a word.
            max_length = max(max_length, len(item))
        else:
            # Nested input: the item is a collection of words.
            for word in item:
                max_length = max(max_length, len(word))
    return max_length

# Build each alphabet from the training split; this also yields the longest training word length
english_chars, english_max_len = create_char_list(english_train)
marathi_chars, marathi_max_len = create_char_list(marathi_train)

# Widen the limits with whatever find_max_length reports for the validation and test splits
english_max_len = max(
    [english_max_len] + [find_max_length(split) for split in (english_val, english_test)]
)
marathi_max_len = max(
    [marathi_max_len] + [find_max_length(split) for split in (marathi_val, marathi_test)]
)

In [None]:
def word_to_vector(word, language):
    """Encode ``word`` as a list of integer character codes.

    Layout of the result: a start token (``len(alphabet) + 1``), then one code
    per character (1-based position in the alphabet, or 0 when the character
    is not in the alphabet), then zeros up to ``max_length + 1`` entries, then
    one more trailing 0.

    Args:
        word: iterable of characters to encode.
        language: "english" selects ``english_chars`` / ``english_max_len``;
            any other value selects ``marathi_chars`` / ``marathi_max_len``.

    Returns:
        list of ints. Its length is ``max_length + 2`` when the word has at
        most ``max_length`` characters, and ``len(word) + 2`` otherwise.
    """
    if language == "english":
        alphabet, limit = english_chars, english_max_len
    else:
        alphabet, limit = marathi_chars, marathi_max_len

    # The start token uses the first code above every alphabet position.
    codes = [len(alphabet) + 1]
    for symbol in word:
        try:
            codes.append(alphabet.index(symbol) + 1)
        except ValueError:
            # Characters outside the alphabet share code 0 with the padding.
            codes.append(0)

    # Zero-fill up to limit + 1 entries (no-op for words that are already long enough).
    shortfall = limit + 1 - len(codes)
    if shortfall > 0:
        codes.extend([0] * shortfall)

    # One trailing zero is always appended.
    codes.append(0)
    return codes

# Example usage: encode one sample word per language
english_word, marathi_word = "example", "उदाहरण"
english_vector = word_to_vector(english_word, "english")
marathi_vector = word_to_vector(marathi_word, "marathi")

In [None]:
def word_matrix(words, language):
    """Encode every word in ``words`` with ``word_to_vector``.

    Args:
        words: iterable of words to encode.
        language: language selector forwarded unchanged to ``word_to_vector``.

    Returns:
        list with one encoded vector per input word, in input order.
    """
    # Each word is encoded individually. Handing the whole collection to
    # word_to_vector would treat every word as a single "character".
    return [word_to_vector(word, language) for word in words]

In [None]:
# Training split: encode the word lists, then wrap each encoding in a PyTorch tensor
english_word_representations = word_matrix(english_train, "english")
english_matrix = torch.tensor(english_word_representations)
marathi_word_representations = word_matrix(marathi_train, "marathi")
marathi_matrix = torch.tensor(marathi_word_representations)

# Validation split: same two steps
english_word_representations_val = word_matrix(english_val, "english")
english_matrix_val = torch.tensor(english_word_representations_val)
marathi_word_representations_val = word_matrix(marathi_val, "marathi")
marathi_matrix_val = torch.tensor(marathi_word_representations_val)

# Test split: same two steps
english_word_representations_test = word_matrix(english_test, "english")
english_matrix_test = torch.tensor(english_word_representations_test)
marathi_word_representations_test = word_matrix(marathi_test, "marathi")
marathi_matrix_test = torch.tensor(marathi_word_representations_test)