## Team members 
1. Sai Hruthik Gangapuram 
2. Sujith Kumar Gajarla

# Task 1: Importing packages

In [1]:
import re
import torch
import random
import pandas as pd
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
device = 'cpu'

# Task 2: Data Loading

In [2]:
data_frame = pd.read_excel('dataset.xlsx')
data_frame.head(6)

Unnamed: 0,English,Hindi
0,Yale offers advanced degrees through its Gradu...,येल अपने ग्रेजुएट स्कूल ऑफ आर्ट्स एंड साइंसेज ...
1,Browse the organizations below for information...,"अध्ययन के कार्यक्रमों, शैक्षणिक आवश्यकताओं और ..."
2,Graduate School of Arts & Sciences.,ग्रेजुएट स्कूल ऑफ आर्ट्स एंड साइंसेज।
3,Yale’s Graduate School of Arts & Sciences offe...,येल के ग्रेजुएट स्कूल ऑफ आर्ट्स एंड साइंसेज एम...
4,School of Architecture.,स्कूल ऑफ आर्किटेक्चर।
5,The Yale School of Architecture’s mandate is f...,येल स्कूल ऑफ आर्किटेक्चर का जनादेश प्रत्येक छा...


# Task 3: Data preprocessing
1. Word to Index
2. Index to word
3. Word counts
4. Normailizing the sentence

In [3]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = { "SOS": SOS_token, "EOS": EOS_token }
        self.word2count = {}
        self.index2word = { SOS_token: "SOS", EOS_token: "EOS" }
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [4]:
def normalizeString(sentence):
    sentence = sentence.lower().strip()
    sentence = sentence.replace('\xa0', ' ')
    sentence = re.sub(r"([,.!?])", r" \1", sentence)
    sentence = re.sub(r"[.!?]+", r"", sentence)
    return sentence
data_frame['English'] = data_frame['English'].apply(lambda sentence: normalizeString(sentence))
data_frame['Hindi'] = data_frame['Hindi'].apply(lambda sentence: normalizeString(sentence))
data_frame.head(5)

Unnamed: 0,English,Hindi
0,yale offers advanced degrees through its gradu...,येल अपने ग्रेजुएट स्कूल ऑफ आर्ट्स एंड साइंसेज ...
1,browse the organizations below for information...,"अध्ययन के कार्यक्रमों , शैक्षणिक आवश्यकताओं और..."
2,graduate school of arts & sciences,ग्रेजुएट स्कूल ऑफ आर्ट्स एंड साइंसेज।
3,yale’s graduate school of arts & sciences offe...,येल के ग्रेजुएट स्कूल ऑफ आर्ट्स एंड साइंसेज एम...
4,school of architecture,स्कूल ऑफ आर्किटेक्चर।


In [5]:
def readLangs(data_frame):
    pairs = [list(lang_pair) for index, lang_pair in data_frame.iterrows()]
    input_lang = Lang('English')
    output_lang = Lang('Hindi')
    return input_lang, output_lang, pairs

In [6]:
def prepareData(data_frame):
    input_lang, output_lang, pairs = readLangs(data_frame)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs
input_lang, output_lang, pairs = prepareData(data_frame)
print(random.choice(pairs))

Read 129 sentence pairs
Counting words...
Counted words:
English 533
Hindi 598
['access to high quality patient‐centered health care is a social right , not a privilege ', 'उच्च गुणवत्ता वाले रोगी-केंद्रित स्वास्थ्य देखभाल तक पहुंच एक सामाजिक अधिकार है , विशेषाधिकार नहीं।']


In [7]:
input_lang.index2word[21]# index to word example in input language

'organizations'

In [8]:
output_lang.index2word[89]# index to word example in output language

'ने'

# Task 4 : Creating Custom Dataset 

In [9]:
class CustomDataset(Dataset):
    
    def __init__(self, df):
        self.df=df
        
    def __len__(self):
        return len(self.df)
    
    def indexesFromSentence(self, lang, sentence):
        return [lang.word2index[word] for word in sentence.split(' ')]
    
    def tensorFromSentence(self, lang, sentence):
        indexes = self.indexesFromSentence(lang, sentence)
        indexes.append(EOS_token)
        return torch.tensor(indexes, dtype=torch.long, device=device)
    
    def __getitem__(self ,idx):
        languages = self.df.iloc[idx]
        input_tensor = self.tensorFromSentence(input_lang, languages['English'])
        target_tensor = self.tensorFromSentence(output_lang, languages['Hindi'])
        return input_tensor, target_tensor, languages['English'], languages['Hindi']

# Task 5: Spliting the dataset into training | testing| validation

In [10]:
training_data, testing_data = train_test_split(data_frame, test_size=0.2, random_state=42)

validation_data, testing_data = train_test_split(testing_data, test_size=0.5, random_state=42)

In [11]:
train_data_set = CustomDataset(training_data)
valid_data_set = CustomDataset(validation_data)
test_data_set = CustomDataset(testing_data)

In [12]:
print('Size of Training dataset: {}'.format(train_data_set.__len__()))
print('Size of Testing dataset: {}'.format(test_data_set.__len__()))
print('Size of Validation dataset: {}'.format(valid_data_set.__len__()))

Size of Training dataset: 103
Size of Testing dataset: 13
Size of Validation dataset: 13


In [20]:
train_data_set[50]# sample

(tensor([368,  78, 344, 369, 164, 366,  42, 370, 371,  18,   1]),
 tensor([420, 258, 217,  30, 195, 413,  53, 421, 251, 358, 171,   1]),
 'we have been expanding international collaborations in many areas ',
 'हम कई क्षेत्रों में अंतरराष्ट्रीय सहयोग का विस्तार कर रहे हैं।')

# Task 6: Loading dataset into Batches

In [14]:
def collate_fn(batch):
    batch = sorted(batch, key=lambda x: len(x[0]), reverse=True)
    input_seqs, target_seqs, input_language, out_language = zip(*batch)
    # Pad the input sequences with zeros
    padded_input = pad_sequence(input_seqs, batch_first=True)
    # Pad the target sequences with zeros
    padded_target = pad_sequence(target_seqs, batch_first=True)
    return padded_input, padded_target, input_language, out_language

In [15]:
train_loader = DataLoader(train_data_set, batch_size=8, shuffle=True, collate_fn=collate_fn)

val_loader = DataLoader(valid_data_set, batch_size=8, shuffle=True, collate_fn=collate_fn)

test_loader = DataLoader(test_data_set, batch_size=8, shuffle=True, collate_fn=collate_fn)

In [16]:
print('Total number of batches in train data loader: {}'.format(len(train_loader)))
print('Total number of batches in test data loader: {}'.format(len(test_loader)))
print('Total number of batches in validation data loader: {}'.format(len(val_loader)))

Total number of batches in train data loader: 13
Total number of batches in test data loader: 2
Total number of batches in validation data loader: 2


# Task 7: Displaying 1st sample in each batch

### Train data loader

In [23]:
for batch_index, packed in enumerate(train_loader):
    input_tensors, output_tensors, input_language, out_language = packed
    print("\033[1mTraining Batch number-----> {}\033[0m".format(batch_index+1))
    # print the first input and output tensors along with their respective languages
    print("Input Language:", input_language[0])
    print("Input Tensor Shape:", input_tensors[0].shape)
    print("Input Tensor:", input_tensors[0])
    
    print("Output Language:", out_language[0])
    print("Output Tensor Shape:", output_tensors[0].shape)
    print("Output Tensor:", output_tensors[0])
    print('------------------------------------------------------------------------------------------------------------')
    print("\n")

[1mTraining Batch number-----> 1[0m
Input Language: interested applicants must apply directly to the school , college , or program where the degree will be awarded 
Input Tensor Shape: torch.Size([21])
Input Tensor: tensor([272, 273, 274, 275, 276,  35,  20,   9,  28, 277,  28, 213, 278, 279,
         20, 280, 281, 269, 282,  18,   1])
Output Language: इच्छुक आवेदकों को सीधे स्कूल , कॉलेज या कार्यक्रम में आवेदन करना होगा जहां डिग्री प्रदान की जाएगी।
Output Tensor Shape: torch.Size([22])
Output Tensor: tensor([313, 314,  37, 315,   5,  24, 316, 239,  47,  30, 317, 175, 318, 319,
         18,  19, 129, 320,   1,   0,   0,   0])
------------------------------------------------------------------------------------------------------------


[1mTraining Batch number-----> 2[0m
Input Language: the yale school of architecture’s mandate is for each student to understand architecture as a creative , productive , innovative , and responsible practice 
Input Tensor Shape: torch.Size([26])
Input

### Test data loader

In [18]:
for batch_index, packed in enumerate(test_loader):
    input_tensors, output_tensors, input_language, out_language = packed
    print("\033[1mTesting Batch number-----> {}\033[0m".format(batch_index+1))
    # print the first input and output tensors along with their respective languages
    print("Input Language:", input_language[0])
    print("Input Tensor Shape:", input_tensors[0].shape)
    print("Input Tensor:", input_tensors[0])
    
    print("Output Language:", out_language[0])
    print("Output Tensor Shape:", output_tensors[0].shape)
    print("Output Tensor:", output_tensors[0])
    print('------------------------------------------------------------------------------------------------------------')
    print("\n")

[1mTesting Batch number-----> 1[0m
Input Language: opportunities for study or research abroad as well as exchange programs are managed by the individual schools and programs 
Input Tensor Shape: torch.Size([21])
Input Tensor: tensor([137,  23,  27, 213,  32, 202,  52, 214,  52, 203,  26, 144, 215, 216,
         20, 217,  17,  14,  26,  18,   1])
Output Language: विदेशों में अध्ययन या अनुसंधान के अवसरों के साथ-साथ विनिमय कार्यक्रमों का प्रबंधन व्यक्तिगत स्कूलों और कार्यक्रमों द्वारा किया जाता है।
Output Tensor Shape: torch.Size([22])
Output Tensor: tensor([238,  30,  22, 239,  28,  14, 240,  14, 241, 225,  23,  53, 164, 242,
         13,  10,  23, 243, 244, 245,  21,   1])
------------------------------------------------------------------------------------------------------------


[1mTesting Batch number-----> 2[0m
Input Language: yale offers significant financial assistance to international students to cover tuition costs as it does with students from the u s 
Input Tensor Shape: 

### Validation data loader

In [19]:
for batch_index, packed in enumerate(val_loader):
    input_tensors, output_tensors, input_language, out_language = packed
    print("\033[1mValidation Batch number-----> {}\033[0m".format(batch_index+1))
    # print the first input and output tensors along with their respective languages
    print("Input Language:", input_language[0])
    print("Input Tensor Shape:", input_tensors[0].shape)
    print("Input Tensor:", input_tensors[0])
    
    print("Output Language:", out_language[0])
    print("Output Tensor Shape:", output_tensors[0].shape)
    print("Output Tensor:", output_tensors[0])
    print('------------------------------------------------------------------------------------------------------------')
    print("\n")

[1mValidation Batch number-----> 1[0m
Input Language: the jackson school of global affairs trains and equips a new generation of leaders to devise thoughtful , evidence-based solutions for challenging global problems 
Input Tensor Shape: torch.Size([26])
Input Tensor: tensor([ 20, 114,   9,  10, 102, 115, 116,  14, 117,  37, 118, 119,  10,  72,
         35, 120, 121,  28, 122, 123,  23, 124, 102, 104,  18,   1])
Output Language: जैक्सन स्कूल ऑफ ग्लोबल अफेयर्स चुनौतीपूर्ण वैश्विक समस्याओं के लिए विचारशील , साक्ष्य-आधारित समाधान तैयार करने के लिए नेताओं की एक नई पीढ़ी को प्रशिक्षित और सुसज्जित करता है।
Output Tensor Shape: torch.Size([30])
Output Tensor: tensor([135,   5,   6, 136, 138, 139, 113, 115,  14,  32, 140,  24, 141, 116,
        142, 117,  14,  32,  82, 129,  57, 143, 144,  37, 145,  10, 146,  20,
         21,   1])
------------------------------------------------------------------------------------------------------------


[1mValidation Batch number-----> 2[0m
Input Langu