## Team members 
1. Sai Hruthik Gangapuram 
2. Sujith Kumar Gajarla

# Task 1: Importing packages

In [1]:
import re
import torch
import random
import pandas as pd
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
device = 'cpu'

# Task 2: Data Loading

In [2]:
data_frame = pd.read_excel('dataset.xlsx')
data_frame.head(6)

Unnamed: 0,English,Hindi
0,Yale offers advanced degrees through its Gradu...,येल अपने ग्रेजुएट स्कूल ऑफ आर्ट्स एंड साइंसेज ...
1,Browse the organizations below for information...,"अध्ययन के कार्यक्रमों, शैक्षणिक आवश्यकताओं और ..."
2,Graduate School of Arts & Sciences.,ग्रेजुएट स्कूल ऑफ आर्ट्स एंड साइंसेज।
3,Yale’s Graduate School of Arts & Sciences offe...,येल के ग्रेजुएट स्कूल ऑफ आर्ट्स एंड साइंसेज एम...
4,School of Architecture.,स्कूल ऑफ आर्किटेक्चर।
5,The Yale School of Architecture’s mandate is f...,येल स्कूल ऑफ आर्किटेक्चर का जनादेश प्रत्येक छा...


# Task 3: Data preprocessing
1. Word to Index
2. Index to word
3. Word counts
4. Normailizing the sentence

In [3]:
START_TOKEN = 0
END_TOKEN = 1

class Language:
    def __init__(self, name):
        self.language_name = name
        self.word_to_index = { "START": START_TOKEN, "END": END_TOKEN }
        self.word_to_count = {}
        self.index_to_word = { START_TOKEN: "START", END_TOKEN: "END" }
        self.num_words = 2  # Count START and END tokens

    def add_sentence(self, sentence):
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        if word not in self.word_to_index:
            self.word_to_index[word] = self.num_words
            self.word_to_count[word] = 1
            self.index_to_word[self.num_words] = word
            self.num_words += 1
        else:
            self.word_to_count[word] += 1

In [4]:
def normalizeString(sentence):
    sentence = sentence.lower().strip()
    sentence = sentence.replace('\xa0', ' ')
    sentence = re.sub(r"([,.!?])", r" \1", sentence)
    sentence = re.sub(r"[.!?]+", r"", sentence)
    return sentence
data_frame['English'] = data_frame['English'].apply(lambda sentence: normalizeString(sentence))
data_frame['Hindi'] = data_frame['Hindi'].apply(lambda sentence: normalizeString(sentence))
data_frame.head(5)

Unnamed: 0,English,Hindi
0,yale offers advanced degrees through its gradu...,येल अपने ग्रेजुएट स्कूल ऑफ आर्ट्स एंड साइंसेज ...
1,browse the organizations below for information...,"अध्ययन के कार्यक्रमों , शैक्षणिक आवश्यकताओं और..."
2,graduate school of arts & sciences,ग्रेजुएट स्कूल ऑफ आर्ट्स एंड साइंसेज।
3,yale’s graduate school of arts & sciences offe...,येल के ग्रेजुएट स्कूल ऑफ आर्ट्स एंड साइंसेज एम...
4,school of architecture,स्कूल ऑफ आर्किटेक्चर।


In [5]:
def read_languages(data_frame):
    pairs = [list(lang_pair) for index, lang_pair in data_frame.iterrows()]
    input_lang = Language('English')
    output_lang = Language('Hindi')
    return input_lang, output_lang, pairs

In [6]:
def preprocess(data):
    source_lang, target_lang, sentence_pairs = read_languages(data)
    print("Read %s sentence pairs" % len(sentence_pairs))
    print("Counting words...")
    for pair in sentence_pairs:
        source_lang.add_sentence(pair[0])
        target_lang.add_sentence(pair[1])
    print("Counted words:")
    print(source_lang.language_name, source_lang.num_words)
    print(target_lang.language_name, target_lang.num_words)
    return source_lang, target_lang, sentence_pairs

source_lang, target_lang, sentence_pairs = preprocess(data_frame)
print(random.choice(sentence_pairs))

Read 129 sentence pairs
Counting words...
Counted words:
English 533
Hindi 598
['yale school of medicine graduates go on to become leaders in academic medicine  ', 'येल स्कूल ऑफ मेडिसिन के स्नातक शैक्षणिक चिकित्सा में अग्रणी बन जाते हैं।']


In [7]:
source_lang.index_to_word[21]# index to word example in input language

'organizations'

In [8]:
target_lang.index_to_word[89]# index to word example in output language

'ने'

# Task 4 : Creating Custom Dataset 

In [9]:
class CustomDataset(Dataset):
    
    def __init__(self, df):
        self.df=df
        
    def __len__(self):
        return len(self.df)
    
    def indexesFromSentence(self, lang, sentence):
        return [lang.word_to_index[word] for word in sentence.split(' ')]
    
    def tensorFromSentence(self, lang, sentence):
        indexes = self.indexesFromSentence(lang, sentence)
        indexes.append(END_TOKEN)
        return torch.tensor(indexes, dtype=torch.long, device=device)
    
    def __getitem__(self ,idx):
        languages = self.df.iloc[idx]
        input_tensor = self.tensorFromSentence(source_lang, languages['English'])
        target_tensor = self.tensorFromSentence(target_lang, languages['Hindi'])
        return input_tensor, target_tensor, languages['English'], languages['Hindi']

# Task 5: Spliting the dataset into training | testing| validation

In [10]:
training_data, testing_data = train_test_split(data_frame, test_size=0.2, random_state=42)

validation_data, testing_data = train_test_split(testing_data, test_size=0.5, random_state=42)

In [11]:
train_data_set = CustomDataset(training_data)
valid_data_set = CustomDataset(validation_data)
test_data_set = CustomDataset(testing_data)

In [12]:
print('Size of Training dataset: {}'.format(train_data_set.__len__()))
print('Size of Testing dataset: {}'.format(test_data_set.__len__()))
print('Size of Validation dataset: {}'.format(valid_data_set.__len__()))

Size of Training dataset: 103
Size of Testing dataset: 13
Size of Validation dataset: 13


In [13]:
train_data_set[50]# sample

(tensor([368,  78, 344, 369, 164, 366,  42, 370, 371,  18,   1]),
 tensor([420, 258, 217,  30, 195, 413,  53, 421, 251, 358, 171,   1]),
 'we have been expanding international collaborations in many areas ',
 'हम कई क्षेत्रों में अंतरराष्ट्रीय सहयोग का विस्तार कर रहे हैं।')

# Task 6: Loading dataset into Batches

In [14]:
def collate_fn(batch):
    batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
    input_seqs, target_seqs, input_language, out_language = zip(*batch)
    # Pad the input sequences with zeros
    padded_input = pad_sequence(input_seqs, batch_first=True)
    # Pad the target sequences with zeros
    padded_target = pad_sequence(target_seqs, batch_first=True)
    return padded_input, padded_target, input_language, out_language

In [15]:
train_loader = DataLoader(train_data_set, batch_size=8, shuffle=True, collate_fn=collate_fn)

val_loader = DataLoader(valid_data_set, batch_size=8, shuffle=True, collate_fn=collate_fn)

test_loader = DataLoader(test_data_set, batch_size=8, shuffle=True, collate_fn=collate_fn)

In [16]:
print('Total number of batches in train data loader: {}'.format(len(train_loader)))
print('Total number of batches in test data loader: {}'.format(len(test_loader)))
print('Total number of batches in validation data loader: {}'.format(len(val_loader)))

Total number of batches in train data loader: 13
Total number of batches in test data loader: 2
Total number of batches in validation data loader: 2


# Task 7: Displaying 1st sample in each batch

### Train data loader

In [17]:
for batch_index, packed in enumerate(train_loader):
    input_tensors, output_tensors, input_language, out_language = packed
    print("\033[1mTraining Batch number-----> {}\033[0m".format(batch_index+1))
    # print the first input and output tensors along with their respective languages
    print("Input Language:", input_language[0])
    print("Input Tensor Shape:", input_tensors[0].shape)
    print("Input Tensor:", input_tensors[0])
    
    print("Output Language:", out_language[0])
    print("Output Tensor Shape:", output_tensors[0].shape)
    print("Output Tensor:", output_tensors[0])
    print('------------------------------------------------------------------------------------------------------------')
    print("\n")

[1mTraining Batch number-----> 1[0m
Input Language: the yale school of art has a long and distinguished history of training artists of the highest caliber 
Input Tensor Shape: torch.Size([20])
Input Tensor: tensor([20,  2,  9, 10, 58, 59, 37, 60, 14, 61, 62, 10, 63, 64, 10, 20, 65, 66,
        18,  1])
Output Language: येल स्कूल ऑफ आर्ट में उच्चतम क्षमता के प्रशिक्षण कलाकारों का एक लंबा और विशिष्ट इतिहास है।
Output Tensor Shape: torch.Size([23])
Output Tensor: tensor([ 2,  5,  6, 67, 30, 68, 69, 14, 70, 71, 53, 57, 72, 10, 73, 74, 21,  1,
         0,  0,  0,  0,  0])
------------------------------------------------------------------------------------------------------------


[1mTraining Batch number-----> 2[0m
Input Language: yale is known for its residential college system , which provides students with a supportive community and numerous opportunities for social and intellectual engagement 
Input Tensor Shape: torch.Size([26])
Input Tensor: tensor([  2,  48, 460,  23,   7, 506, 

### Test data loader

In [18]:
for batch_index, packed in enumerate(test_loader):
    input_tensors, output_tensors, input_language, out_language = packed
    print("\033[1mTesting Batch number-----> {}\033[0m".format(batch_index+1))
    # print the first input and output tensors along with their respective languages
    print("Input Language:", input_language[0])
    print("Input Tensor Shape:", input_tensors[0].shape)
    print("Input Tensor:", input_tensors[0])
    
    print("Output Language:", out_language[0])
    print("Output Tensor Shape:", output_tensors[0].shape)
    print("Output Tensor:", output_tensors[0])
    print('------------------------------------------------------------------------------------------------------------')
    print("\n")

[1mTesting Batch number-----> 1[0m
Input Language: yale offers significant financial assistance to international students to cover tuition costs as it does with students from the u s 
Input Tensor Shape: torch.Size([23])
Input Tensor: tensor([  2,   3, 302, 287, 220,  35, 164, 142,  35, 303, 304, 305,  52, 306,
        307, 211, 142, 238,  20, 308,  38,  18,   1])
Output Language: येल अंतरराष्ट्रीय छात्रों को ट्यूशन की लागत को कवर करने के लिए महत्वपूर्ण वित्तीय सहायता प्रदान करता है जैसा कि यह यू एस  के छात्रों के साथ करता है।
Output Tensor Shape: torch.Size([31])
Output Tensor: tensor([  2, 195, 261,  37, 339, 129, 340,  37, 341, 117,  14,  32, 342, 323,
        249,  19,  20, 106, 343, 344, 345, 346, 347, 348,  14, 261,  14, 235,
         20,  21,   1])
------------------------------------------------------------------------------------------------------------


[1mTesting Batch number-----> 2[0m
Input Language: opportunities for study or research abroad as well as exchange progr

### Validation data loader

In [19]:
for batch_index, packed in enumerate(val_loader):
    input_tensors, output_tensors, input_language, out_language = packed
    print("\033[1mValidation Batch number-----> {}\033[0m".format(batch_index+1))
    # print the first input and output tensors along with their respective languages
    print("Input Language:", input_language[0])
    print("Input Tensor Shape:", input_tensors[0].shape)
    print("Input Tensor:", input_tensors[0])
    
    print("Output Language:", out_language[0])
    print("Output Tensor Shape:", output_tensors[0].shape)
    print("Output Tensor:", output_tensors[0])
    print('------------------------------------------------------------------------------------------------------------')
    print("\n")

[1mValidation Batch number-----> 1[0m
Input Language: the jackson school of global affairs trains and equips a new generation of leaders to devise thoughtful , evidence-based solutions for challenging global problems 
Input Tensor Shape: torch.Size([26])
Input Tensor: tensor([ 20, 114,   9,  10, 102, 115, 116,  14, 117,  37, 118, 119,  10,  72,
         35, 120, 121,  28, 122, 123,  23, 124, 102, 104,  18,   1])
Output Language: जैक्सन स्कूल ऑफ ग्लोबल अफेयर्स चुनौतीपूर्ण वैश्विक समस्याओं के लिए विचारशील , साक्ष्य-आधारित समाधान तैयार करने के लिए नेताओं की एक नई पीढ़ी को प्रशिक्षित और सुसज्जित करता है।
Output Tensor Shape: torch.Size([30])
Output Tensor: tensor([135,   5,   6, 136, 138, 139, 113, 115,  14,  32, 140,  24, 141, 116,
        142, 117,  14,  32,  82, 129,  57, 143, 144,  37, 145,  10, 146,  20,
         21,   1])
------------------------------------------------------------------------------------------------------------


[1mValidation Batch number-----> 2[0m
Input Langu