# Seq2Seq with Attention

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import json
from pathlib import Path
import pandas as pd
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from asr_evaluation.asr_evaluation import get_error_count, get_match_count, print_diff

from functools import reduce
from collections import defaultdict
from edit_distance import SequenceMatcher

from termcolor import colored

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Import dataset with 70K pairs of commands

In [None]:
def replace_space(s: str):
    """Encode a sentence at character level.

    Every space becomes ``'_'`` and the resulting characters are joined with
    single spaces, e.g. ``"ab c"`` -> ``"a b _ c"``.
    """
    underscored = s.replace(' ', '_')
    return ' '.join(underscored)

def recover_space(s: str):
    """Undo ``replace_space``: drop the separator spaces, then turn ``'_'`` back into spaces."""
    return s.replace(' ', '').replace('_', ' ')

def read_data(path):
    """Load a CSV split, de-duplicate it and add character-level columns.

    The five columns of the file are renamed to ``id``, ``language``,
    ``src_token``, ``tgt_token`` and ``entities_dic``; ``tgt_char`` and
    ``src_char`` are derived with ``replace_space``.
    """
    df = pd.read_csv(path).drop_duplicates()
    df.columns = ['id', 'language', 'src_token', 'tgt_token', 'entities_dic']
    for token_col, char_col in (('tgt_token', 'tgt_char'), ('src_token', 'src_char')):
        df[char_col] = df[token_col].apply(replace_space)
    # SECURITY NOTE(review): eval() executes whatever text is stored in the
    # entities_dic column; only use this loader on trusted files.
    df['entities_dic'] = df['entities_dic'].apply(eval)
    return df

def make_command(exp_path, encoder_level, decoder_level, steps, rnn):
    """Print the OpenNMT commands for an experiment and save them to ``translate.sh``.

    Three commands are produced, in order: ``onmt_build_vocab``,
    ``onmt_train`` and ``onmt_translate``. Each command is printed (followed
    by a blank line) and written to ``translate.sh`` in the current working
    directory, one command per line.

    Args:
        exp_path: Root directory of the experiment.
        encoder_level: Source granularity used in file/model names (e.g. 'char', 'token').
        decoder_level: Target granularity used in the LSTM model name.
        steps: Checkpoint step used for the model and prediction file names.
        rnn: Model family, either 'lstm' or 'transformer'.

    Raises:
        ValueError: If ``rnn`` is not one of the supported model families.
    """
    if rnn == 'lstm':
        model_name = "BiLSTM_{encoder_level}_LSTM_{decoder_level}".format(
            encoder_level=encoder_level, decoder_level=decoder_level)
    elif rnn == 'transformer':
        model_name = 'transformer_{encoder_level}'.format(encoder_level=encoder_level)
    else:
        # Fail before touching translate.sh instead of hitting an unbound name later.
        raise ValueError(
            "unsupported rnn {!r}: expected 'lstm' or 'transformer'".format(rnn))

    commands = [
        "onmt_build_vocab --config {exp_path}/yaml/{model_name}_prep.yaml -n_sample -1".format(
            exp_path=exp_path, model_name=model_name),
        "onmt_train --config {exp_path}/yaml/{model_name}_train.yaml".format(
            exp_path=exp_path, model_name=model_name),
        "onmt_translate -model {exp_path}/{model_name}/model_step_{steps}.pt -src {exp_path}/data/src_test_{encoder_level}.txt -output {exp_path}/{model_name}/pred_{steps}.txt -gpu 0 -beam_size 5 -report_time".format(
            exp_path=exp_path, encoder_level=encoder_level, model_name=model_name, steps=steps),
    ]

    # The context manager closes the file; no explicit close() is needed.
    with open("translate.sh", 'w') as f:
        for command in commands:
            print(command)
            # Terminate every command with a newline so the script is runnable;
            # without it the three commands are fused into a single line.
            f.write(command + "\n")
            print()

def add_pred(df, exp_path, encoder_level, decoder_level, steps, rnn):
    """Attach model predictions to ``df`` and compute per-row error counts.

    Predictions are read from ``{exp_path}/{model_name}/pred_{steps}.txt``
    (one prediction per line) and matched to the rows of ``df`` by position
    after the index has been reset.

    Columns added: ``prediction``, ``prediction_char``, ``entity_errors``,
    ``entity_count``, ``token_errors``, ``token_matches``, ``token_length``,
    ``char_errors``, ``char_matches``, ``char_length``, ``sentence_count``
    and ``sentence_error``.

    Returns the re-indexed copy of ``df`` with the new columns.

    NOTE(review): a later cell of this notebook defines ``add_pred`` again
    with an extra ``plain`` parameter; once that cell has run it replaces
    this definition.
    NOTE(review): ``model_name`` is only bound for rnn in {'lstm',
    'transformer'}; any other value fails with an unbound-name error.
    """
    if rnn == 'lstm':
        model_name = "BiLSTM_{encoder_level}_LSTM_{decoder_level}".format( encoder_level=encoder_level, decoder_level=decoder_level)
    elif rnn == 'transformer':
        model_name = 'transformer_{encoder_level}'.format(encoder_level=encoder_level)
        
    path = "{exp_path}/{model_name}/pred_{steps}.txt".format(
        exp_path=exp_path, 
        model_name=model_name, 
        steps=steps)
    
    # One prediction per line; blank lines are kept and become empty strings.
    # NOTE(review): newer pandas releases reject sep="\n" -- verify against the installed version.
    data = pd.read_csv(path, sep="\n", header=None, skip_blank_lines=False)
    data = data.fillna('')
    data.columns = ["prediction"]

    # Reset the index so that rows line up with the prediction file by position.
    df = df.reset_index(drop=True)
    if decoder_level == 'char':
        # Model output is already character-level; derive the token-level string from it.
        df['prediction_char'] = data["prediction"]
        df["prediction"] = data["prediction"].apply(recover_space)
    else:
        # Model output is token-level; derive the character-level string from it.
        df['prediction_char'] = data["prediction"].apply(replace_space)
        df["prediction"] = data["prediction"]
    
    errors, matches, ref_length = [], [], []
    errors_char, matches_char, ref_length_char = [], [], []
    df['entity_errors'] = 0
    for index, row in df.iterrows():
        # token
        ref_line = row['tgt_token']
        hyp_line = row['prediction']
        ref = ref_line.split()
        hyp = hyp_line.split()
        sm = SequenceMatcher(a=ref, b=hyp)
        errors.append(get_error_count(sm))
        matches.append(get_match_count(sm))
        ref_length.append(len(ref))
        
        # char
        ref = row['tgt_char'].split()
        hyp = row['prediction_char'].split()
        sm = SequenceMatcher(a=ref, b=hyp)
        errors_char.append(get_error_count(sm))
        matches_char.append(get_match_count(sm))
        ref_length_char.append(len(ref))
        
        # entity
        # An entity counts as an error when its normalized key is not a substring of the
        # predicted sentence. normalizeString is defined in a later cell of this notebook.
        df.loc[index, 'entity_errors'] = sum([not normalizeString(s) in hyp_line for s in row['entities_dic'].keys()])
        
    
    df['entity_count'] = df['entities_dic'].apply(len)
    
    df['token_errors'] = errors
    df['token_matches'] = matches
    df['token_length'] = ref_length
    
    df['char_errors'] = errors_char
    df['char_matches'] = matches_char
    df['char_length'] = ref_length_char
    
    # A sentence is wrong as soon as it contains at least one token error.
    df['sentence_count'] = 1
    df['sentence_error'] = 0
    df.loc[df['token_errors'] > 0, 'sentence_error'] = 1
    return df

def analyze(df, groupby, sort_col):
    """Aggregate error statistics per group and derive error rates.

    The count columns produced by ``add_pred`` are summed per value of
    ``groupby``; four percentages rounded to two decimals are then added:
    ``wer`` (token), ``ser`` (sentence), ``eer`` (entity) and ``cer``
    (character).

    Args:
        df: DataFrame containing ``groupby`` and the ``*_errors`` /
            ``*_length`` / ``*_count`` columns.
        groupby: Name of the column to group on (e.g. 'id', 'language').
        sort_col: Column of the result to sort by, in descending order.

    Returns:
        One row per group, sorted by ``sort_col`` descending, with a fresh
        0..n-1 index.
    """
    stat_cols = ['char_errors', 'char_length', 'token_errors', 'token_length',
                 'sentence_error', 'sentence_count', 'entity_errors', 'entity_count']
    meta_group = df[[groupby] + stat_cols].groupby(groupby).sum()
    meta_group['wer'] = (100 * meta_group.token_errors / meta_group.token_length).round(2)
    meta_group['ser'] = (100 * meta_group.sentence_error / meta_group.sentence_count).round(2)
    meta_group['eer'] = (100 * meta_group.entity_errors / meta_group.entity_count).round(2)
    meta_group['cer'] = (100 * meta_group.char_errors / meta_group.char_length).round(2)
    meta_group = meta_group.reset_index().sort_values(sort_col, ascending=False)
    return meta_group.reset_index(drop=True)

def get_wer(df):
    """Overall word error rate in percent: total token errors over total reference tokens."""
    total_errors = sum(df.token_errors)
    total_tokens = sum(df.token_length)
    return round(100 * total_errors / total_tokens, 2)
def get_ser(df):
    """Overall sentence error rate in percent: erroneous sentences over all sentences."""
    wrong_sentences = sum(df.sentence_error)
    all_sentences = sum(df.sentence_count)
    return round(100 * wrong_sentences / all_sentences, 2)
def get_eer(df):
    """Overall entity error rate in percent: missed entities over all entities."""
    missed_entities = sum(df.entity_errors)
    all_entities = sum(df.entity_count)
    return round(100 * missed_entities / all_entities, 2)
def get_cer(df):
    """Overall character error rate in percent: total character errors over total reference characters."""
    total_errors = sum(df.char_errors)
    total_chars = sum(df.char_length)
    return round(100 * total_errors / total_chars, 2)

def print_errors(df, n, random_state=1):
    """Print the alignment diff for ``n`` randomly sampled erroneous predictions.

    Only rows with at least one token error are considered. For each sampled
    row the source sentence is printed, followed by the reference/hypothesis
    diff and a blank line.
    """
    wrong_rows = df[df.token_errors > 0]
    sampled = wrong_rows[['src_token', 'tgt_token', 'prediction']].sample(
        n=n, random_state=random_state)
    for src_line, ref_line, hyp_line in sampled.itertuples(index=False, name=None):
        ref_tokens = ref_line.split()
        hyp_tokens = hyp_line.split()
        matcher = SequenceMatcher(a=ref_tokens, b=hyp_tokens)
        print("SRC:", src_line)
        print_diff(matcher, ref_tokens, hyp_tokens)
        print()
        
def read_data_json(path):
    """Load a JSON split and add character-level source/target columns.

    Unlike ``read_data`` this loader neither drops duplicate rows nor
    evaluates the ``entities_dic`` column; both steps are disabled here.
    """
    df = pd.read_json(path)
    df.columns = ['id', 'language', 'src_token', 'tgt_token', 'entities_dic']
    for side in ('tgt', 'src'):
        df[side + '_char'] = df[side + '_token'].apply(replace_space)
    return df

In [None]:
# Load the train / test / validation splits from the JSON export.
# The CSV loader calls below are kept for reference but are disabled.
# train = read_data("data/nmt_data/train_train.csv")
# test = read_data("data/nmt_data/test_test.csv")
# valid = read_data("data/nmt_data/valid_valid.csv")

train = read_data_json("data/nmt_data_json/train_train.json")
test = read_data_json("data/nmt_data_json/test_test.json")
valid = read_data_json("data/nmt_data_json/valid_valid.json")

In [None]:
# augment training
# Resample the training rows using the `language` column as the class label
# (fixed random_state for reproducibility); the resampled labels are discarded.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
train, _= ros.fit_resample(train, train['language'])

In [None]:
# take specific lan
# Restrict all three splits to a single language code.
chosen_lan = 'fr'
train = train[train['language'] == chosen_lan]
test = test[test['language'] == chosen_lan]
valid = valid[valid['language'] == chosen_lan]

## Making dataset for ONMT

In [None]:
from tqdm import tqdm
from pathlib import Path

output_dir = '/mnt/workspace/project/exp/data/'
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Write one source file and one target file per split and per granularity:
#   src_{train,test,val}{_char,_token}.txt / tgt_{train,test,val}{_char,_token}.txt
# Each line holds one sentence. The files are opened with `with` so they are
# closed even if a write fails, and with an explicit UTF-8 encoding so the
# output does not depend on the machine's locale.
for appendix in ['_char', '_token']:
    for split, df in [('train', train), ('test', test), ('val', valid)]:
        src_path = output_dir + 'src_' + split + appendix + '.txt'
        tgt_path = output_dir + 'tgt_' + split + appendix + '.txt'
        with open(src_path, "w", encoding="utf-8") as f_src, \
                open(tgt_path, "w", encoding="utf-8") as f_tgt:
            for _, row in tqdm(df.iterrows()):
                f_src.write("{}\n".format(row['src' + appendix]))
                f_tgt.write("{}\n".format(row['tgt' + appendix]))



In [None]:
# switch test dataset
# Overwrite the character-level test files with the rows of another language
# taken from test_train.json.

chosen_lan = 'de'

from tqdm import tqdm
output_dir = '/mnt/workspace/project/exp/data/'
# test_new = pd.concat([test,valid])
t = read_data_json("data/nmt_data_json/test_train.json")
test_new = t[t['language'] == chosen_lan]

appendix = '_char'
# `with` guarantees both files are closed even if a write fails; UTF-8 is
# requested explicitly so the output does not depend on the locale.
with open(output_dir + 'src_test' + appendix + '.txt', "w", encoding="utf-8") as f_src_test, \
        open(output_dir + 'tgt_test' + appendix + '.txt', "w", encoding="utf-8") as f_tgt_test:
    # generate
    for _, row in tqdm(test_new.iterrows()):
        f_src_test.write("{}\n".format(row['src' + appendix]))
        f_tgt_test.write("{}\n".format(row['tgt' + appendix]))

## Making prediction

python build_vocab.py --config /mnt/workspace/project/exp/yaml/BiLSTM_char_LSTM_char_train.yaml -n_sample -1

In [None]:
# Experiment configuration shared by make_command and the evaluation cells below.
exp_path = '/mnt/workspace/project/exp'
encoder_level = 'char'
decoder_level = 'char'
rnn = 'lstm' #'transformer' 
steps = 100000
# Print the OpenNMT commands for this configuration and write them to translate.sh.
make_command(exp_path, encoder_level, decoder_level, steps, rnn)
##### run the printed command ####

onmt_translate -model /mnt/workspace/project/exp/BiLSTM_char_LSTM_char_de/model_step_100000.pt -src /mnt/workspace/project/exp/data/src_test_char.txt -output /mnt/workspace/project/exp/BiLSTM_char_LSTM_char/pred_100000.txt -gpu 0 -beam_size 5 -report_time

## Char2Char

In [None]:
# Score the char-to-char model on test_new: attach its predictions and print
# the overall word / sentence / entity / character error rates.
# `steps` and `rnn` come from the configuration cell above.
exp_path = '/mnt/workspace/project/exp'
encoder_level = 'char'
decoder_level = 'char'
char2char = add_pred(test_new, exp_path, encoder_level, decoder_level, steps, rnn)
print("Overall WER: {}%".format(get_wer(char2char)))
print("Overall SER: {}%".format(get_ser(char2char)))
print("Overall EER: {}%".format(get_eer(char2char)))
print("Overall CER: {}%".format(get_cer(char2char)))
char2char.head()

In [None]:
# Baseline: with plain=True add_pred uses the unchanged source text as the
# "prediction", so these rates show the error of doing no translation at all.
# NOTE: this needs the add_pred variant that accepts `plain`, which is defined
# in a later cell of this notebook; run that cell first.
exp_path = '/mnt/workspace/project/exp'
encoder_level = 'char'
decoder_level = 'char'
char2char = add_pred(test_new, exp_path, encoder_level, decoder_level, steps, rnn, plain=True)
print("Overall WER: {}%".format(get_wer(char2char)))
print("Overall SER: {}%".format(get_ser(char2char)))
print("Overall EER: {}%".format(get_eer(char2char)))
print("Overall CER: {}%".format(get_cer(char2char)))
char2char.head()

In [None]:
def add_pred(df, exp_path, encoder_level, decoder_level, steps, rnn, plain=False):
    """Attach predictions to ``df`` and compute per-row error counts.

    Args:
        df: DataFrame with ``src_char``, ``tgt_token``, ``tgt_char`` and
            ``entities_dic`` columns. It is not modified.
        exp_path: Root directory of the experiment.
        encoder_level: Source granularity used in the model name.
        decoder_level: Granularity of the prediction lines; ``'char'`` means
            the lines are character-level, anything else token-level.
        steps: Checkpoint step used in the prediction file name.
        rnn: Model family, ``'lstm'`` or ``'transformer'``.
        plain: If True, no prediction file is read and the unchanged
            ``src_char`` column is used as the prediction (a "do nothing"
            baseline).

    Returns:
        A re-indexed copy of ``df`` with these columns added:
        ``prediction_char``, ``prediction``, ``entity_errors``,
        ``entity_count``, ``token_errors``, ``token_matches``,
        ``token_length``, ``char_errors``, ``char_matches``,
        ``char_length``, ``sentence_count`` and ``sentence_error``.

    Raises:
        ValueError: If ``rnn`` is unsupported, or if the prediction file does
            not contain exactly one line per row of ``df``.
    """
    if rnn == 'lstm':
        model_name = "BiLSTM_{encoder_level}_LSTM_{decoder_level}".format(
            encoder_level=encoder_level, decoder_level=decoder_level)
    elif rnn == 'transformer':
        model_name = 'transformer_{encoder_level}'.format(encoder_level=encoder_level)
    else:
        raise ValueError(
            "unsupported rnn {!r}: expected 'lstm' or 'transformer'".format(rnn))

    # Positional index so that row i lines up with prediction line i.
    df = df.reset_index(drop=True)

    if plain:
        predictions = df['src_char'].fillna('')
    else:
        path = "{exp_path}/{model_name}/pred_{steps}.txt".format(
            exp_path=exp_path,
            model_name=model_name,
            steps=steps)
        # Read the file line by line rather than through a CSV parser: every
        # line stays a plain string (no quote handling, no "nan"/number
        # coercion) and blank lines are kept as empty predictions.
        with open(path, encoding="utf-8") as f:
            lines = [line.rstrip('\n') for line in f]
        if len(lines) != len(df):
            # A silent mismatch would misalign predictions and references.
            raise ValueError(
                "{} contains {} predictions but the data has {} rows".format(
                    path, len(lines), len(df)))
        predictions = pd.Series(lines, index=df.index, dtype=object)

    if decoder_level == 'char':
        # Output is already character-level; derive the token-level string.
        df['prediction_char'] = predictions
        df['prediction'] = predictions.apply(recover_space)
    else:
        # Output is token-level; derive the character-level string.
        df['prediction_char'] = predictions.apply(replace_space)
        df['prediction'] = predictions

    errors, matches, ref_length = [], [], []
    errors_char, matches_char, ref_length_char = [], [], []
    entity_errors = []
    rows = zip(df['tgt_token'], df['prediction'], df['tgt_char'],
               df['prediction_char'], df['entities_dic'])
    for ref_line, hyp_line, ref_chars, hyp_chars, entities in rows:
        # token level
        ref = ref_line.split()
        sm = SequenceMatcher(a=ref, b=hyp_line.split())
        errors.append(get_error_count(sm))
        matches.append(get_match_count(sm))
        ref_length.append(len(ref))

        # character level
        ref = ref_chars.split()
        sm = SequenceMatcher(a=ref, b=hyp_chars.split())
        errors_char.append(get_error_count(sm))
        matches_char.append(get_match_count(sm))
        ref_length_char.append(len(ref))

        # entity level: an entity is missed when its normalized key is not a
        # substring of the predicted sentence.
        entity_errors.append(
            sum(normalizeString(entity) not in hyp_line for entity in entities))

    # Columns are assigned once after the loop instead of cell by cell.
    df['entity_errors'] = entity_errors
    df['entity_count'] = df['entities_dic'].apply(len)

    df['token_errors'] = errors
    df['token_matches'] = matches
    df['token_length'] = ref_length

    df['char_errors'] = errors_char
    df['char_matches'] = matches_char
    df['char_length'] = ref_length_char

    # A sentence is wrong as soon as it contains at least one token error.
    df['sentence_count'] = 1
    df['sentence_error'] = (df['token_errors'] > 0).astype(int)
    return df

In [None]:
# Keep only the rows whose source differs from the target, i.e. where a rewrite is actually required.
t = char2char[char2char['src_token'] != char2char['tgt_token']]

In [None]:
# Overall error rates restricted to `t` (rows where source and target differ).
print("Overall WER: {}%".format(get_wer(t)))
print("Overall SER: {}%".format(get_ser(t)))
print("Overall EER: {}%".format(get_eer(t)))
print("Overall CER: {}%".format(get_cer(t)))

In [None]:
# Per-language error rates for `t`, highest sentence error rate first.
analyze(t,groupby='language', sort_col='ser').head(10)

### Group by id

In [None]:
# The ten `id` groups with the highest sentence error rate.
analyze(char2char,groupby='id', sort_col='ser').head(10)

### Group by language

In [None]:
# Error rates per language, ordered by language code (descending).
analyze(char2char,groupby='language', sort_col='language').head(10)

In [None]:
# Inspect the row(s) for one specific source utterance.
char2char[char2char['src_token'] =='turn on forty fist pumping power ballads']

In [None]:
# Show alignment diffs for a random sample of wrong predictions.
# Alternative selections are kept below for reference.
# print_errors(char2char, n=30, random_state=3)
# print_errors(char2char[char2char['id'] == 'Tv.TvChannelChange.Init.Utterance'], n=10, random_state=4)
print_errors(char2char[char2char['language'] == 'en'], n=10, random_state=5)

## Token2Char

In [None]:
# Score the token-to-char model on the test split.
encoder_level = 'token'
decoder_level = 'char'
# add_pred requires the model family; `rnn` comes from the configuration cell.
token2char = add_pred(test, exp_path, encoder_level, decoder_level, steps, rnn)
print("Overall WER: {}%".format(get_wer(token2char)))
print("Overall SER: {}%".format(get_ser(token2char)))
token2char.head()

In [None]:
# Per-id statistics sorted by WER. `analyze_by_id` is not defined in this
# notebook; `analyze` with groupby='id' is the equivalent call.
analyze(token2char, groupby='id', sort_col='wer')#.head(10)

## Token2Token

In [None]:
# Score the token-to-token model on the test split.
encoder_level = 'token'
decoder_level = 'token'
# add_pred requires the model family; `rnn` comes from the configuration cell.
token2token = add_pred(test, exp_path, encoder_level, decoder_level, steps, rnn)
print("Overall WER: {}%".format(get_wer(token2token)))
print("Overall SER: {}%".format(get_ser(token2token)))
token2token.head()

In [None]:
# Per-id statistics sorted by WER. `analyze_by_id` is not defined in this
# notebook; `analyze` with groupby='id' is the equivalent call.
analyze(token2token, groupby='id', sort_col='wer')#.head(10)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")

# Per-id statistics sorted by WER. `analyze_by_id` is not defined in this
# notebook; `analyze` with groupby='id' is the equivalent call.
df = analyze(char2char, groupby='id', sort_col='wer')
# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(6, 15))

# Background bars: total number of reference tokens per id
sns.set_color_codes("pastel")
sns.barplot(x="token_length", y="id", data=df,
            label="Total", color="b")

# Foreground bars: number of token errors per id
sns.set_color_codes("muted")
sns.barplot(x="token_errors", y="id", data=df,
            label="errors", color="b")

# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(ylabel="",
       xlabel="Token_errors") #xlim=(0, 2000), 
# ax.set_xscale('log')
sns.despine(left=True, bottom=True)

In [None]:
from asr_evaluation.asr_evaluation import *

from functools import reduce
from collections import defaultdict
from edit_distance import SequenceMatcher

from termcolor import colored
def process_line_pair(ref_line, hyp_line, case_insensitive=False, remove_empty_refs=False):
    """Given a pair of strings corresponding to a reference and hypothesis,
    compute the edit distance, print if desired, and keep track of results
    in global variables.
    Return true if the pair was counted, false if the pair was not counted due
    to an empty reference string.

    Notes on this copy of the function:
    - ``case_insensitive`` and ``remove_empty_refs`` are accepted but not used
      in the body, and the function always returns True.
    - Besides the four names declared ``global`` below, the body also reads
      ``print_instances_p`` / ``print_errors_p`` and appends to ``lengths``,
      ``error_rates`` and ``wer_bins``; all of them must exist at module
      level before the first call (presumably some come from the star-import
      of asr_evaluation above -- verify).
    """
    # I don't believe these all need to be global.  In any case, they shouldn't be.
    global error_count
    global match_count
    global ref_token_count
    global sent_error_count

    # Split into tokens by whitespace
    ref = ref_line.split()
    hyp = hyp_line.split()
    id_ = None

    # Create an object to get the edit distance, and then retrieve the
    # relevant counts that we need.
    sm = SequenceMatcher(a=ref, b=hyp)
    errors = get_error_count(sm)
    matches = get_match_count(sm)
    ref_length = len(ref)

    # Increment the total counts we're tracking
    error_count += errors
    match_count += matches
    ref_token_count += ref_length

    # A sentence counts as wrong when it has at least one token error
    if errors != 0:
        sent_error_count += 1

    # If we're printing instances, do it here (in roughly the align.c format)
    if print_instances_p or (print_errors_p and errors != 0):
        print_instances(ref, hyp, sm, id_=id_)

    # Keep track of the individual error rates, and reference lengths, so we
    # can compute average WERs by sentence length
    lengths.append(ref_length)
    if len(ref) > 0:
        error_rate = errors * 1.0 / len(ref)
    else:
        # An empty reference has no defined error rate; recorded as infinity
        error_rate = float("inf")
    error_rates.append(error_rate)
    wer_bins[len(ref)].append(error_rate)
    return True


In [None]:
# Per-sentence edit statistics between the references and the char2char
# predictions. Rows are collected in a list and the frame is built once:
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas 2.x.
rows = []
for ref_line, hyp_line in zip(test.tgt_token.values, test.prediction_char2char.values):
    ref = ref_line.split()
    hyp = hyp_line.split()
    sm = SequenceMatcher(a=ref, b=hyp)
    errors = get_error_count(sm)
    matches = get_match_count(sm)
    ref_length = len(ref)
    rows.append({'errors': errors, 'matches': matches, 'length': ref_length})
df = pd.DataFrame(rows, columns=['errors', 'matches', 'length'])
df

In [None]:
# Show the alignment for `ref` / `hyp` / `sm`, which still hold the values of
# the last pair processed by the loop that defined them.
print_instances(ref, hyp, sm)

In [None]:
# Corpus-level WER / WRR / SER using process_line_pair.
# The counters below are module-level globals that process_line_pair updates in place.
counter = 0
ref_token_count = 0
error_count = 0
match_count = 0
counter = 0  # NOTE(review): repeated initialisation of `counter`; harmless
sent_error_count = 0
print_instances_p = False  # do not print every sentence pair ...
print_errors_p = True      # ... only the pairs that contain errors

# Loop through each line of the reference and hyp file
for ref_line, hyp_line in zip(test.tgt_token.values, test.prediction_char2char.values):
    processed_p = process_line_pair(ref_line, hyp_line)
    if processed_p:
        counter += 1
# WRR = matched tokens / reference tokens, WER = token errors / reference tokens
if ref_token_count > 0:
    wrr = match_count / ref_token_count
    wer = error_count / ref_token_count
else:
    wrr = 0.0
    wer = 0.0
# Compute SER
if counter > 0:
    ser = sent_error_count / counter
else:
    ser = 0.0
print('Sentence count: {}'.format(counter))
print('WER: {:10.3%} ({:10d} / {:10d})'.format(wer, error_count, ref_token_count))
print('WRR: {:10.3%} ({:10d} / {:10d})'.format(wrr, match_count, ref_token_count))
print('SER: {:10.3%} ({:10d} / {:10d})'.format(ser, sent_error_count, counter))


In [None]:
# Display the asr_evaluation module object.
# NOTE(review): only `from asr_evaluation.asr_evaluation import ...` is visible in this
# notebook; unless `import asr_evaluation` ran elsewhere this raises NameError.
asr_evaluation.asr_evaluation

In [None]:
# Hold out about 5% of the distinct source sentences as test data; all other rows form the training set.
# NOTE(review): `data` is not defined in the visible part of this notebook.
random.seed(30)
# NOTE(review): random.choices samples WITH replacement, so `li` can contain repeated
# sentences and len(li) may overstate the number of unique held-out sentences.
li = random.choices(data.src.unique(), k = int(len(data.src.unique()) *0.05 ))
train_df = data[~data.src.isin(li)]
test_df = data[data.src.isin(li)].drop_duplicates()

print("test size: {} unique sentences".format(len(li)))
print("train size: {} sample senteces (with duplication)".format(sum(~data.src.isin(li))) )

In [None]:
from tqdm import tqdm
mid = len(test_df) // 2
output_dir = '/home/zhechensu/exp/'

# Only the source-side files are written by this cell. To also emit targets,
# open the matching tgt_*_char.txt files and write row['tgt'] next to row['src'].
# The files are opened with `with` so they are closed even if a write fails.

# Held-out rows are divided by the parity of their index label:
# odd labels go to the test file, even labels to the validation file.
with open(output_dir + 'src_test_char.txt', "w", encoding="utf-8") as f_src_test, \
        open(output_dir + 'src_val_char.txt', "w", encoding="utf-8") as f_src_val:
    for index, row in tqdm(test_df.iterrows()):
        if index % 2:
            f_src_test.write("{}\n".format(row['src']))
        else:
            f_src_val.write("{}\n".format(row['src']))

with open(output_dir + 'src_train_char.txt', "w", encoding="utf-8") as f_src_train:
    for index, row in tqdm(train_df.iterrows()):
        f_src_train.write("{}\n".format(row['src']))



In [None]:
# Reserved vocabulary indices; they match the "SOS" / "EOS" entries that Lang pre-populates.
SOS_token = 0  # start-of-sequence marker, fed to the decoder as its first input
EOS_token = 1  # end-of-sequence marker, appended to every sentence tensor


class Lang:
    """Vocabulary of one side of the parallel data.

    Keeps a word -> index map, an index -> word map and word frequencies.
    Indices 0 and 1 are reserved for the "SOS" and "EOS" markers.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        # Next free index; starts at 2 because SOS and EOS are already present.
        self.n_words = 2

    def addSentence(self, sentence):
        """Register every space-separated word of ``sentence``."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``, assigning it the next free index on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [None]:
# Accent stripping, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Remove combining marks (accents) from ``s``.

    The string is decomposed with NFD and every character of category ``Mn``
    is dropped. Characters without a decomposition are left untouched, so
    the result is not guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, strip accents and space out a few punctuation patterns


def normalizeString(s):
    """Normalize a string for substring comparison against predictions.

    The input is lower-cased, stripped of surrounding whitespace and of
    accents, then the substitutions listed below are applied in order.
    """
    substitutions = (
        (r"([a-zA-Z]+)[\:\-]", r"\1 "),   # "colun:dsfa" -> "colun dsfa"
        (r"(\d+)[\:](\d+)", r"\1 : \2"),  # "15:30" -> "15 : 30"
        (r"([\.\+])", r" \1 "),           # put spaces around '.' and '+'
        (r"([\-])", r" "),                # any remaining '-' becomes a space
    )
    # A former catch-all step (r"[^a-zA-Z!?]+" -> " ") is intentionally not applied.
    text = unicodeToAscii(s.lower().strip())
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [None]:
def readLangs(lang1, lang2, lang_class=Lang):
    """Pair up two parallel sequences of sentences and create empty vocabularies.

    Returns ``(input_lang, output_lang, pairs)`` where ``pairs`` is a list of
    ``[source, target]`` lists. The sentences are taken as they are; no
    normalization is applied here.
    """
    print("Reading lines...")

    pairs = [[source, target] for source, target in zip(lang1, lang2)]

    return lang_class('src'), lang_class('tgt'), pairs

In [None]:
def prepareData(lang1, lang2, lang_class=Lang):
    """Build the source/target vocabularies from two parallel sentence lists.

    Returns ``(input_lang, output_lang, pairs)``; both vocabularies have
    been filled with every word of their side. Pair filtering is disabled.
    """
    input_lang, output_lang, pairs = readLangs(lang1, lang2, lang_class)
    print("Read %s sentence pairs" % len(pairs))
    print("Counting words...")
    for source, target in pairs:
        input_lang.addSentence(source)
        output_lang.addSentence(target)
    print("Counted words:")
    for vocab in (input_lang, output_lang):
        print(vocab.name, vocab.n_words)
    return input_lang, output_lang, pairs


# Build both vocabularies from the training split and show one random training pair as a sanity check.
input_lang, output_lang, pairs = prepareData(train_df.src.tolist(), train_df.tgt.tolist())
print(random.choice(pairs))

In [None]:
from attentionRNN import DynamicEncoder, BahdanauAttnDecoderRNN

In [None]:
class EncoderRNN(nn.Module):
    """Token-by-token GRU encoder.

    The recurrent layer is a single-layer, unidirectional GRU. This is the
    configuration the rest of the notebook relies on: ``initHidden`` returns
    one ``(1, 1, hidden_size)`` tensor, the training/evaluation loops store
    ``hidden_size``-wide encoder outputs, and the GRU-based decoders are
    initialised with the encoder's final hidden state. A bidirectional
    multi-layer LSTM needs an ``(h, c)`` tuple of a different shape and
    produces ``2 * hidden_size`` outputs, so it fails on the first call.
    """

    def __init__(self, input_size, hidden_size):
        """
        Args:
            input_size: Size of the source vocabulary.
            hidden_size: Width of the embeddings and of the GRU state.
        """
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Encode one token.

        Args:
            input: Tensor holding a single token index.
            hidden: Previous hidden state, shape ``(1, 1, hidden_size)``.

        Returns:
            ``(output, hidden)``, both of shape ``(1, 1, hidden_size)``.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size) as expected by the GRU.
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state on the module-level ``device``."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
class DecoderRNN(nn.Module):
    """Plain GRU decoder without attention."""

    def __init__(self, hidden_size, output_size):
        """
        Args:
            hidden_size: Width of the embeddings and of the GRU state.
            output_size: Size of the target vocabulary.
        """
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        # The right-hand side of this assignment had been commented out,
        # which left a syntax error; forward() needs this layer.
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step.

        Args:
            input: Tensor holding the previous target token index.
            hidden: Previous hidden state, shape ``(1, 1, hidden_size)``.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has shape
            ``(1, output_size)``.
        """
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state on the module-level ``device``."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed-length buffer of encoder outputs.

    Attention weights are predicted from the current input embedding and the
    previous hidden state, and always span ``max_length`` positions.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=30):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(output_size, hidden_size)
        # Scores one weight per encoder position from [embedding ; hidden].
        self.attn = nn.Linear(hidden_size * 2, max_length)
        # Projects [embedding ; context] back to hidden_size.
        self.attn_combine = nn.Linear(hidden_size * 2, hidden_size)
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Returns ``(log_probs, hidden, attn_weights)`` with shapes
        ``(1, output_size)``, ``(1, 1, hidden_size)`` and ``(1, max_length)``.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        attn_scores = self.attn(torch.cat((embedded[0], hidden[0]), 1))
        attn_weights = F.softmax(attn_scores, dim=1)
        # Weighted sum of the encoder outputs: (1, 1, L) x (1, L, H) -> (1, 1, H)
        context = torch.bmm(attn_weights.unsqueeze(0),
                            encoder_outputs.unsqueeze(0))

        combined = self.attn_combine(torch.cat((embedded[0], context[0]), 1))
        gru_input = F.relu(combined.unsqueeze(0))
        output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero initial hidden state on the module-level ``device``."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
def indexesFromSentence(lang, sentence):
    """Map each space-separated word of ``sentence`` to its index in ``lang``.

    Raises ``KeyError`` for a word that is not in ``lang.word2index``.
    """
    lookup = lang.word2index
    tokens = sentence.split(' ')
    return [lookup[token] for token in tokens]


def tensorFromSentence(lang, sentence):
    """Return ``sentence`` as a ``(length + 1, 1)`` long tensor of word indices ending with EOS."""
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    column = torch.tensor(token_ids, dtype=torch.long, device=device)
    return column.view(-1, 1)


def tensorsFromPair(pair):
    """Convert a ``[source, target]`` pair into ``(input_tensor, target_tensor)``.

    The source is indexed with the module-level ``input_lang`` and the target
    with ``output_lang``.
    """
    return (tensorFromSentence(input_lang, pair[0]),
            tensorFromSentence(output_lang, pair[1]))

In [None]:
teacher_forcing_ratio = 0.5  # probability that a step feeds the gold target token as the next decoder input


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=30):
    """Run one optimisation step on a single (input, target) pair.

    The input is encoded token by token, then the decoder is unrolled over
    the target, with or without teacher forcing (chosen at random per call
    according to ``teacher_forcing_ratio``). The summed loss is
    back-propagated and both optimizers take one step.

    Returns the summed loss divided by the target length.

    NOTE: defining this function rebinds the name ``train``, which earlier
    cells of the notebook use for the training DataFrame.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Fixed-size buffer of encoder outputs; rows beyond input_length stay zero.
    # An input longer than max_length fails with an index error in the loop below.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    # The decoder starts from the encoder's final hidden state.
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            # Stop as soon as the model predicts end-of-sequence.
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    # Normalised by the full target length, even if decoding stopped early.
    return loss.item() / target_length

In [None]:
import time
import math


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` for a task started at ``since``.

    ``percent`` is the fraction of the work done so far; the remaining time
    is extrapolated linearly from the elapsed time.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train on ``n_iters`` randomly drawn pairs with plain SGD.

    Every ``print_every`` steps the elapsed/remaining time and the average
    loss since the last report are printed; every ``plot_every`` steps the
    average loss is recorded, and the recorded curve is plotted at the end.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    # All training samples are drawn (with replacement) up front.
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for _ in range(n_iters)]
    criterion = nn.NLLLoss()

    for step, (input_tensor, target_tensor) in enumerate(training_pairs, start=1):
        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if step % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = step / n_iters
            print('%s (%d %d%%) %.4f' % (timeSince(start, progress),
                                         step, progress * 100, print_loss_avg))

        if step % plot_every == 0:
            plot_losses.append(plot_loss_total / plot_every)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
import matplotlib.pyplot as plt
# plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot a sequence of loss values with a y-axis tick every 0.2."""
    # plt.subplots() creates the figure itself; calling plt.figure() first
    # only left an extra, empty figure behind.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [None]:
def evaluate(encoder, decoder, sentence, max_length=30, show_topk=0):
    """Greedily decode ``sentence`` with the given encoder/decoder.

    Returns ``(decoded_words, attentions)``: the predicted words (ending with
    ``'<EOS>'`` when the model emitted it within ``max_length`` steps) and
    the attention weights of the decoding steps that were run.

    ``show_topk`` is accepted but not used in this body.
    A word missing from ``input_lang`` raises ``KeyError`` (via
    ``tensorFromSentence``).
    """
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        # Fixed-size buffer of encoder outputs; rows beyond input_length stay zero.
        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        # The decoder starts from the encoder's final hidden state.
        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            # Greedy choice: take the single most likely token.
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        # Keep only the attention rows of the steps that were actually decoded.
        return decoded_words, decoder_attentions[:di + 1]

In [None]:
def evaluateRandomly(encoder, decoder, data, n=10):
    """Decode ``n`` randomly chosen pairs from ``data`` and print them.

    For each pair the source (``>``), the reference (``=``) and the model
    output (``<``) are printed, followed by an empty line.
    """
    for _ in range(n):
        sample = random.choice(data)
        print('>', sample[0])
        print('=', sample[1])
        decoded_words, _attentions = evaluate(encoder, decoder, sample[0])
        print('<', ' '.join(decoded_words))
        print('')

In [None]:
# Instantiate the encoder and the attention decoder (vocabulary sizes come from the
# Lang objects) and train them for 50000 single-pair steps, reporting every 3000 steps.
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1, 50000, print_every=3000, learning_rate=0.01)

In [None]:
# Build [source, target] pairs from the de-duplicated held-out rows and decode a few random ones.
test_pairs = [[s,  t]for s, t in list(zip(test_df.drop_duplicates().src, test_df.drop_duplicates().tgt))]
evaluateRandomly(encoder1, attn_decoder1, test_pairs)

In [None]:
# evaluate in validation set
from jiwer import wer
from tqdm import tqdm 

def evaluateWER(encoder, decoder, n=10):
    """Print the mean per-sentence word error rate over ``test_pairs``.

    Each pair's source (``test_pair[0]``) is decoded with ``evaluate`` and
    the decoded sentence is scored with ``wer`` against the target
    (``test_pair[1]``). The per-sentence scores are averaged.

    Pairs for which decoding or scoring raises are skipped (best effort),
    but the number of skipped pairs is reported instead of being hidden.

    Args:
        encoder: encoder model forwarded to ``evaluate``.
        decoder: decoder model forwarded to ``evaluate``.
        n: unused; kept so that existing calls keep working.
    """
    total_wer = 0
    scored = 0
    skipped = 0
    for test_pair in tqdm(test_pairs):
        try:
            output_words, _attentions = evaluate(encoder, decoder, test_pair[0])
            # `evaluate` appends '<EOS>' only when the decoder predicts the
            # end token; if decoding stopped at the length limit the last
            # item is a real word and must not be dropped.
            if output_words and output_words[-1] == '<EOS>':
                output_words = output_words[:-1]
            sentence_wer = wer(test_pair[1], ' '.join(output_words))
        except Exception:
            # Best effort: one bad pair must not abort the whole run.
            # KeyboardInterrupt / SystemExit still propagate.
            skipped += 1
            continue
        total_wer += sentence_wer
        scored += 1

    if skipped:
        print("Skipped {} of {} pairs because of errors".format(
            skipped, scored + skipped))
    if scored == 0:
        # Avoid dividing by zero when nothing could be scored.
        print("WER: n/a (no pair could be scored)")
        return
    print("WER: {}".format(total_wer / scored))
        
# Baseline: how far the untouched source text is from the target text.
# `wer` takes the reference first, so the target is passed as reference and
# the source as hypothesis -- the same orientation evaluateWER uses.
# NOTE(review): with list inputs jiwer appears to aggregate over the whole
# corpus, whereas evaluateWER averages per-sentence scores -- confirm before
# comparing the two numbers directly.
dedup_test_df = test_df.drop_duplicates()
wer_base = wer(dedup_test_df.tgt.tolist(), dedup_test_df.src.tolist())
print("WER_base: {}".format(wer_base))
evaluateWER(encoder1, attn_decoder1)

In [None]:
def showAttention(input_sentence, output_words, attentions):
    """Draw an attention matrix as a heat map with word labels.

    Args:
        input_sentence: space-separated source sentence; its words (plus
            '<EOS>') label the x axis.
        output_words: list of decoded words labelling the y axis.
        attentions: tensor of attention weights; converted with ``.numpy()``
            before plotting.
    """
    # Figure with a single subplot and a colour bar for the weights.
    figure = plt.figure()
    axes = figure.add_subplot(111)
    heatmap = axes.matshow(attentions.numpy(), cmap='bone')
    figure.colorbar(heatmap)

    # Axis labels; each list starts with an empty label
    # (presumably for the tick that precedes the first row/column -- confirm).
    x_labels = [''] + input_sentence.split(' ') + ['<EOS>']
    axes.set_xticklabels(x_labels, rotation=90)
    y_labels = [''] + output_words
    axes.set_yticklabels(y_labels)

    # One major tick per unit so that every label is shown.
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    """Decode one sentence with the global models and plot its attention.

    Uses the notebook-level ``encoder1`` and ``attn_decoder1``. Prints the
    input and the space-joined output, then hands the words and attention
    weights to ``showAttention``.

    Args:
        input_sentence: source sentence to decode.
    """
    result = evaluate(encoder1, attn_decoder1, input_sentence)
    decoded_words, attention_weights = result
    print('input =', input_sentence)
    print('output =', ' '.join(decoded_words))
    showAttention(input_sentence, decoded_words, attention_weights)
    
# Hand-picked sentences used to inspect the attention maps.
for demo_sentence in (
    "is it going to be nice outside at seventeen fifteen in milan",
    "tell me something about real betis sevilla",
    "is it raining on fifteenth september",
    "four hundred and twenty eight",
    "is it raining in plateau from seven twenty five to three fifteen",
):
    evaluateAndShowAttention(demo_sentence)

## GloVe

In [None]:
import bcolz
import pickle
# The commented-out code in this cell is a one-off preprocessing step: it
# parses the GloVe text file at `glove_path` and writes, under `rootdir`, the
# vector array ('6B.50.dat'), the word list ('6B.50_words.pkl') and the
# word -> index mapping ('6B.50_idx.pkl').
# Only the two path assignments are live.
# NOTE(review): if re-enabled, `np.float` no longer exists in NumPy >= 1.24;
# use `float` instead.
# words = []
# idx = 0
# word2idx = {}
glove_path = "/home/zhechensu/glove/glove.6B.50d.txt"
rootdir = f'/home/zhechensu/glove/glove.6B/'
# vectors = bcolz.carray(np.zeros(1), rootdir=rootdir + '6B.50.dat', mode='w')

# with open(glove_path, 'rb') as f:
#     for l in f:
#         line = l.decode().split()
#         word = line[0]
#         words.append(word)
#         word2idx[word] = idx
#         idx += 1
#         vect = np.array(line[1:]).astype(np.float)
#         vectors.append(vect)
     
# `vectors[1:]` skips the single zero the carray was initialised with.
# vectors = bcolz.carray(vectors[1:].reshape((400000, 50)), rootdir= rootdir + '6B.50.dat', mode='w')
# vectors.flush()
# pickle.dump(words, open(rootdir + '6B.50_words.pkl', 'wb'))
# pickle.dump(word2idx, open(rootdir + '6B.50_idx.pkl', 'wb'))

In [None]:
# Load the GloVe vectors, the word list and the word -> index mapping
# from `rootdir`.
vectors = bcolz.open(rootdir + '6B.50.dat')[:]
# Context managers make sure the pickle files are closed again.
# NOTE: pickle can execute arbitrary code while loading; only load files
# you created yourself.
with open(rootdir + '6B.50_words.pkl', 'rb') as words_file:
    words = pickle.load(words_file)
with open(rootdir + '6B.50_idx.pkl', 'rb') as idx_file:
    word2idx = pickle.load(idx_file)

# word -> embedding vector lookup.
glove = {w: vectors[word2idx[w]] for w in words}

In [None]:
# Move the 'sos' token to index 0 and the 'eos' token to index 1 by swapping
# them with the words that currently occupy those positions. Both `words`
# (index -> word) and `word2idx` (word -> index) are updated.
# The `vectors` array is not permuted here.

# Remember the current positions and the words that will be displaced.
sos_index = word2idx['sos']
eos_index = word2idx['eos']
sos_swap_word = words[0]
eos_swap_word = words[1]
 
# Swap inside the word list ...
words[0], words[sos_index] = words[sos_index], words[0]
words[1], words[eos_index] = words[eos_index], words[1]
# ... and mirror the swap in the word -> index mapping.
word2idx[sos_swap_word], word2idx['sos'] = word2idx['sos'], word2idx[sos_swap_word]
word2idx[eos_swap_word], word2idx['eos'] = word2idx['eos'], word2idx[eos_swap_word]


In [None]:
# Rebuild word2idx so that its entries are ordered by ascending index.
import operator
word2idx = dict(sorted(word2idx.items(), key=operator.itemgetter(1)))


In [None]:
class LangGlove:
    """Vocabulary that starts from the global GloVe word list.

    On construction the vocabulary is seeded from the notebook-level
    ``word2idx`` and ``words``; words seen later through ``addSentence`` /
    ``addWord`` are appended with new indices.
    """

    def __init__(self, name):
        """Seed the vocabulary from the global ``word2idx`` / ``words``.

        Args:
            name: label stored on the instance as ``self.name``.
        """
        self.name = name
        # word -> index, entries ordered by index.
        index_ordered = sorted(word2idx.items(), key=operator.itemgetter(1))
        self.word2index = dict(index_ordered)
        # Every seeded word starts with a count of 1.
        self.word2count = dict.fromkeys(words, 1)
        # index -> word (reverse mapping).
        self.index2word = {idx: token for token, idx in word2idx.items()}
        # Index handed to the next unseen word.
        # NOTE(review): hard-coded; presumably tied to the 400000-word GloVe
        # vocabulary -- confirm.
        self.n_words = 400001

    def addSentence(self, sentence):
        """Register every space-separated token of ``sentence``."""
        tokens = sentence.split(' ')
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Count ``word``; assign it the next free index if it is new."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [None]:
# Rebuild the vocabularies and training pairs from the training data frame,
# passing LangGlove as third argument.
# presumably prepareData uses it as the vocabulary class for both languages
# -- confirm against prepareData's definition.
input_lang, output_lang, pairs = prepareData(train_df.src.tolist(), train_df.tgt.tolist(), LangGlove)
# Show one random pair as a sanity check.
print(random.choice(pairs))

In [None]:
# Width passed to both model constructors below.
hidden_size = 256
# Re-create encoder1 / attn_decoder1 (replacing any earlier instances) using
# the current input_lang / output_lang, and move them to `device`.
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

# presumably 60000 is the number of training iterations, with progress
# reported every 3000 -- confirm against trainIters' signature.
trainIters(encoder1, attn_decoder1, 60000, print_every=3000, learning_rate=0.01)

In [None]:
# Print the WER of the current encoder1 / attn_decoder1; evaluateWER reads
# the global `test_pairs`.
evaluateWER(encoder1, attn_decoder1)

In [None]:
# Print a few random test pairs together with the current model's output.
evaluateRandomly(encoder1, attn_decoder1, test_pairs)

In [None]:
import torch.nn.functional as F
def showAttention(input_sentence, output_words, attentions):
    """Draw an attention matrix as a heat map with word labels.

    Args:
        input_sentence: space-separated source sentence; its words (plus
            '<EOS>') label the x axis.
        output_words: list of decoded words labelling the y axis.
        attentions: tensor of attention weights; converted with ``.numpy()``
            before plotting.
    """
    # Figure with a single subplot and a colour bar for the weights.
    figure = plt.figure()
    axes = figure.add_subplot(111)
    heatmap = axes.matshow(attentions.numpy(), cmap='bone')
    figure.colorbar(heatmap)

    # Axis labels; each list starts with an empty label
    # (presumably for the tick that precedes the first row/column -- confirm).
    x_labels = [''] + input_sentence.split(' ') + ['<EOS>']
    axes.set_xticklabels(x_labels, rotation=90)
    y_labels = [''] + output_words
    axes.set_yticklabels(y_labels)

    # One major tick per unit so that every label is shown.
    for axis in (axes.xaxis, axes.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    """Decode one sentence with the global models and plot its attention.

    Uses the notebook-level ``encoder1`` and ``attn_decoder1``. Prints the
    input and the space-joined output, then hands the words and attention
    weights to ``showAttention``.

    Args:
        input_sentence: source sentence to decode.
    """
    result = evaluate(encoder1, attn_decoder1, input_sentence)
    decoded_words, attention_weights = result
    print('input =', input_sentence)
    print('output =', ' '.join(decoded_words))
    showAttention(input_sentence, decoded_words, attention_weights)
    
# Hand-picked sentences used to inspect the attention maps.
for demo_sentence in (
    "is it going to be nice outside at seventeen fifteen in milan",
    "tell me something about real betis sevilla",
    "is it raining on fifteenth september",
    "four hundred and twenty eight",
    "is it raining in plateau from seven twenty five to three fifteen",
):
    evaluateAndShowAttention(demo_sentence)
# evaluateAndShowAttention("i am looking for star wars  revenge of the sit")

In [None]:
import pickle
def save_variable(v, filename):
    """Pickle ``v`` to ``filename`` and return the file name.

    The file is opened in binary write mode inside a context manager, so it
    is closed even when pickling fails.

    Args:
        v: any picklable object.
        filename: path of the file to (over)write.

    Returns:
        ``filename``, unchanged.
    """
    with open(filename, 'wb') as f:
        pickle.dump(v, f)
    return filename
 
def load_variable(filename):
    """Load and return the object pickled in ``filename``.

    The file is opened in binary read mode inside a context manager, so it
    is closed even when unpickling fails.

    NOTE: unpickling can execute arbitrary code; only load files that come
    from a trusted source.

    Args:
        filename: path of a pickle file.

    Returns:
        The unpickled object.
    """
    with open(filename, 'rb') as f:
        return pickle.load(f)

# Pickle the current encoder and decoder objects as a whole.
# The 'models/' directory must already exist.
save_variable(encoder1, 'models/s2s_250k+glove_encoder.pkl') 
save_variable(attn_decoder1, 'models/s2s_250k+glove_decoder.pkl') 

Evaluation for 70K model

In [None]:
# Select which saved model pair to load; `name` is substituted into the
# file names below.
name = '60k+1k+glove' #{'70k', '60k+1k', '60k+1k+glove'}
# Replace the in-memory encoder1 / attn_decoder1 with the unpickled objects.
encoder1 = load_variable('models/s2s_{}_encoder.pkl'.format(name))
attn_decoder1 = load_variable('models/s2s_{}_decoder.pkl'.format(name))
# evaluateRandomly(a, b)