# Imports

In [None]:
import csv
import numpy as np
import os
import torch

# Load Data

In [None]:
def _read_pairs(path):
    """Read (word, translation) pairs from the first two columns of a CSV file.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as reading finishes. Completely blank rows are skipped.

    Returns:
        A tuple ``(words, translations)`` of two numpy arrays.
    """
    words, translations = [], []
    # newline='' lets the csv module do its own newline handling, as the
    # csv documentation recommends for files passed to csv.reader.
    with open(path, encoding='utf8', newline='') as handle:
        for pair in csv.reader(handle):
            if not pair:
                # csv.reader yields [] for a blank line; nothing to record.
                continue
            words.append(pair[0])
            translations.append(pair[1])
    return np.array(words), np.array(translations)


def _charset(words):
    """Return the set of distinct characters occurring in ``words``."""
    return {character for word in words for character in word}


def loadData(train_path, val_path, test_path):
    """Load the train/validation/test CSV files and collect character sets.

    Each CSV row is read as ``word, translation`` (columns 0 and 1).

    Args:
        train_path: Path of the training CSV file.
        val_path: Path of the validation CSV file.
        test_path: Path of the test CSV file.

    Returns:
        A dict with these keys:

        * ``train_words`` / ``train_translations``,
          ``val_words`` / ``val_translations``,
          ``test_words`` / ``test_translations``: numpy arrays of strings.
        * ``input_tokens`` / ``output_tokens``: sorted lists of the characters
          seen in the training words / translations, always including ``' '``.
        * ``val_input_tokens`` / ``val_output_tokens``: sets of the characters
          seen in the validation words / translations (no ``' '`` is added).

    Raises:
        IndexError: If a non-blank row has fewer than two columns.
    """
    train_words, train_translations = _read_pairs(train_path)
    val_words, val_translations = _read_pairs(val_path)
    test_words, test_translations = _read_pairs(test_path)

    # The training vocabularies always contain the space character in
    # addition to whatever characters occur in the data.
    input_tokens = sorted(_charset(train_words) | {' '})
    output_tokens = sorted(_charset(train_translations) | {' '})

    return {
        'train_words': train_words,
        'train_translations': train_translations,
        'val_words': val_words,
        'val_translations': val_translations,
        'test_words': test_words,
        'test_translations': test_translations,
        'input_tokens': input_tokens,
        'output_tokens': output_tokens,
        'val_input_tokens': _charset(val_words),
        'val_output_tokens': _charset(val_translations),
    }

In [None]:

def _encode_column(data, col, word, char_to_idx):
    """Write ``word`` into column ``col`` of ``data`` and pad the remainder.

    Row ``r`` receives the index of the ``r``-th character of ``word``; all
    rows from ``len(word)`` onwards receive the index of ``' '``.
    """
    for row, char in enumerate(word):
        data[row, col] = char_to_idx[char]
    # Pad from len(word), not from a leaked loop variable: for an empty word
    # the loop above never runs, so a loop index would be stale or undefined.
    data[len(word):, col] = char_to_idx[" "]


def create_tensor(result):
    """Encode the train and validation words as integer index tensors.

    Args:
        result: Dict providing ``train_words``, ``train_translations``,
            ``val_words``, ``val_translations`` (sequences of strings) and
            ``input_tokens`` / ``output_tokens`` (ordered character
            vocabularies that contain ``' '``).

    Returns:
        ``(input_data, output_data, val_input_data, val_output_data,
        index_dict)`` where

        * each ``*_data`` value is a ``torch.int64`` tensor with one column
          per sample and one row per character position, padded with the
          index of ``' '``. Input tensors have ``longest word + 2`` rows and
          output tensors ``longest translation`` rows, with lengths measured
          separately for the train and validation splits.
        * ``index_dict`` holds the ``input_index`` / ``output_index``
          char-to-index maps and their ``*_rev`` index-to-char inverses.

    Raises:
        KeyError: If a word contains a character missing from the matching
            vocabulary (validation data is encoded with the training
            vocabularies), or if ``' '`` is missing from a vocabulary.
    """
    def longest(words):
        # default=0 keeps an empty split from raising ValueError in max().
        return max((len(word) for word in words), default=0)

    # NOTE(review): the input length gets 2 extra rows while the output
    # length does not; the reason is not visible in this file.
    max_input_length = longest(result['train_words']) + 2
    max_output_length = longest(result['train_translations'])
    max_input_length_val = longest(result['val_words']) + 2
    max_output_length_val = longest(result['val_translations'])

    input_index = {char: idx for idx, char in enumerate(result['input_tokens'])}
    output_index = {char: idx for idx, char in enumerate(result['output_tokens'])}
    index_dict = {
        'input_index': input_index,
        'output_index': output_index,
        'input_index_rev': {idx: char for char, idx in input_index.items()},
        'output_index_rev': {idx: char for char, idx in output_index.items()},
    }

    n_train = len(result['train_words'])
    n_val = len(result['val_words'])
    input_data = np.zeros((max_input_length, n_train), dtype='int64')
    output_data = np.zeros((max_output_length, n_train), dtype='int64')
    val_input_data = np.zeros((max_input_length_val, n_val), dtype='int64')
    val_output_data = np.zeros((max_output_length_val, n_val), dtype='int64')

    pairs = zip(result['train_words'], result['train_translations'])
    for idx, (w, t) in enumerate(pairs):
        _encode_column(input_data, idx, w, input_index)
        _encode_column(output_data, idx, t, output_index)

    val_pairs = zip(result['val_words'], result['val_translations'])
    for idx, (w, t) in enumerate(val_pairs):
        _encode_column(val_input_data, idx, w, input_index)
        _encode_column(val_output_data, idx, t, output_index)

    input_data = torch.tensor(input_data, dtype=torch.int64)
    output_data = torch.tensor(output_data, dtype=torch.int64)
    val_input_data = torch.tensor(val_input_data, dtype=torch.int64)
    val_output_data = torch.tensor(val_output_data, dtype=torch.int64)

    return input_data, output_data, val_input_data, val_output_data, index_dict

In [None]:
# Language code; it names both the dataset sub-directory and the CSV prefix.
language = 'hin'
# Root folder that holds one sub-directory per language.
dataset_path = r'C:\Users\gragh\OneDrive\Desktop\Codes\CS6910 DL\Assignment 3\DataSet\aksharantar_sampled'

# Each split lives at <dataset_path>/<language>/<language>_<split>.csv.
train_path, val_path, test_path = (
    os.path.join(dataset_path, language, f'{language}_{split}.csv')
    for split in ('train', 'valid', 'test')
)

# Read the three splits, then encode the train/validation pairs as tensors.
result = loadData(train_path, val_path, test_path)
(
    input_data,
    output_data,
    val_input_data,
    val_output_data,
    index_dict,
) = create_tensor(result)

# num_samples = len(result['train_words'])
# num_input_tokens = len(result['input_tokens'])
# num_output_tokens = len(result['output_tokens'])
# num_val_input_tokens = len(result['val_input_tokens'])
# num_val_output_tokens = len(result['val_output_tokens'])
# max_input_length_train = max([len(word) for word in result['train_words']]) + 2
# max_output_length_train = max([len(word) for word in result['train_translations']])
# max_input_length_val = max([len(word) for word in result['val_words']]) + 2
# max_output_length_val = max([len(word) for word in result['val_translations']])

In [None]:
# print('Number of samples : ', num_samples)
# print('Number of unique train input tokens : ', num_input_tokens)
# print('Number of unique train output tokens : ', num_output_tokens)
# print('Number of unique val input tokens : ', num_val_input_tokens)
# print('Number of unique val output tokens : ', num_val_output_tokens)
# print('Max sequence length for inputs : ', max_input_length_train)
# print('Max sequence length for outputs : ', max_output_length_train)