# Sequence to Sequence Network
This task is about implementing a sequence to sequence network. Basically following this [tutorial](https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html).

In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata 
import string 
import re 
import random

import torch 
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Downloading Data
The cell below downloads the data required for this project. 

In [7]:
!wget https://download.pytorch.org/tutorial/data.zip
!unzip data.zip
!rm data.zip
!ls

--2020-05-18 17:27:39--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 13.227.185.37, 13.227.185.93, 13.227.185.106, ...
Connecting to download.pytorch.org (download.pytorch.org)|13.227.185.37|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip.1’


2020-05-18 17:27:40 (13.7 MB/s) - ‘data.zip.1’ saved [2882130/2882130]

Archive:  data.zip
replace data/eng-fra.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C
SeqToSeq.ipynb	data  data.zip	data.zip.1


### Lang Class 
Lang class has word2index and index2word dictionaries. Also it has word2count to use to later replace rare words. 
We are using Onehot encoding for words language. 

In [13]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0:"SOS", 1:"EOS"}
        self.n_words = 2
        
    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)
    
    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else :
            self.word2count[word] += 1

### Preprocessing the data
The files are all in Unicode. To simplify we will turn Unicode characters to ASCII, make everything lowercase, and trim most punctuation. 
For now, we will consider setences only whose length is less than MAX_LENGTH and which start with eng_prefixes. 

In [22]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"[.!?])", r"\1", s)
    s = re.sub(r"[^a-zA-z.!?]+", r" ", s)
    return s

def readLangs(lang1, lang2, reverse=False) :
    print("Reading Lines....")
    
    lines = open('data/%s-%s.txt'%(lang1, lang2), encoding='utf-8').read().strip().split('\n')
    
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else: 
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
        
    return input_lang, output_lang, pairs
    

In [23]:
MAX_LENGTH = 10
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) <MAX_LENGTH and p[1].startwith(eng_prefixes)

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

### The Encoder 
The encoder of a seq2seq network is a RNN that ouputs some value for every word from teh input setence. For every input word the encoder outputs a vector and a hidden state and uses the hidden state for the next input word. 

In [24]:
class EncoderRNN(nn.Module):
    def __init__t(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded 
        output, hidden = self.gru(output, hidden)
        return ouput, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

### The Decoder 
The decoder is another RNN that takes the encoder output vectors and outputs a sequence of words to creat the translation. 

In [25]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, ouput_size)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)
    