# Attention layers

In [None]:
import numpy as np
from utils.activations import softmax_forward_npdl, softmax_backward_npdl
from layers.Attention import SelfAttention, MultiHeadAttention
from utils.model_loss import cross_entropy_loss_npdl

In [None]:
# Toy batch for the attention smoke tests. Q, K and V are the same 3x3x4 literal,
# so the layer below is exercised on identical query/key/value inputs.
Q = np.array([[[0.1,0.2,0.1,0.5], [0,0,0,0], [0.1,0.2,0.3,0.2]], [[0.2,0.3,0.1,0.1], [0.1,0.1,0.1,0.1], [0.1,0.2,0.3,0.2]], [[0,0,0,0], [0.2,0.1,0.1,1.5], [0.1,0.2,0.3,0.2]]])
K = np.array([[[0.1,0.2,0.1,0.5], [0,0,0,0], [0.1,0.2,0.3,0.2]], [[0.2,0.3,0.1,0.1], [0.1,0.1,0.1,0.1], [0.1,0.2,0.3,0.2]], [[0,0,0,0], [0.2,0.1,0.1,1.5], [0.1,0.2,0.3,0.2]]])
V = np.array([[[0.1,0.2,0.1,0.5], [0,0,0,0], [0.1,0.2,0.3,0.2]], [[0.2,0.3,0.1,0.1], [0.1,0.1,0.1,0.1], [0.1,0.2,0.3,0.2]], [[0,0,0,0], [0.2,0.1,0.1,1.5], [0.1,0.2,0.3,0.2]]]) 

# One 0/1 flag per row of the inputs (3x3x1). The zero flags line up with the
# all-zero rows of Q/K/V; presumably those rows are padding — confirm.
mask = np.array([[[1],[0],[1]],[[1],[1],[1]],[[0],[1],[1]]])

In [None]:
# Seed NumPy's global RNG before construction so that any random draws made by
# the constructor are reproducible.
np.random.seed(0)
layer = SelfAttention(4, 3, 3, weight_scale=None)
# Re-seed before the forward pass so any randomness drawn there is reproducible as well.
np.random.seed(0)
A = layer.forward_npdl(Q, K, V, output_mask=mask)

In [None]:
# Draw a reproducible random upstream gradient and back-propagate it through the layer.
# NOTE(review): the (3, 3, 3) shape is assumed to match the forward output A — confirm.
np.random.seed(0)
dA = np.random.randn(3, 3, 3)
np.random.seed(0)
dQ_p, dK_p, dV_p = layer.backward_npdl(dA)

In [None]:
# Display the layer's dWV attribute (presumably the gradient w.r.t. WV set by backward_npdl — confirm).
layer.dWV

In [None]:
# Display the layer's WV attribute (presumably the value-projection weights — confirm).
layer.WV

In [None]:
# Build a multi-head attention layer and a reproducible random input of shape (3, 2, 4).
# The seed is set once, so the constructor and randn consume the same RNG stream in this order.
np.random.seed(0)
multi = MultiHeadAttention(4, 4, weight_scale=None)
a = np.random.randn(3,2,4)
print(a)

In [None]:
# Forward pass with the same array used as query, key and value.
out = multi.forward_npdl(a, a, a)
print(out)

In [None]:
# Smoke test of the backward pass: the forward output itself is reused as the upstream gradient.
print(multi.backward_npdl(out))

# Encoder

In [None]:
from model.Encoder import Encoder

In [None]:
# Toy 3x3x4 input for the encoder smoke test, with a 3x3x1 0/1 mask whose zero
# entries line up with the all-zero rows of X.
X = np.array([[[0.1,0.2,0.1,0.5], [0,0,0,0], [0.1,0.2,0.3,0.2]], [[0.2,0.3,0.1,0.1], [0.1,0.1,0.1,0.1], [0.1,0.2,0.3,0.2]], [[0,0,0,0], [0.2,0.1,0.1,1.5], [0.1,0.2,0.3,0.2]]])
mask = np.array([[[1],[0],[1]],[[1],[1],[1]],[[0],[1],[1]]])

In [None]:
# NOTE(review): 4 matches the last dimension of X; 8 is presumably the hidden size — confirm against model.Encoder.
encoder = Encoder(4, 8)

In [None]:
# Forward pass through the encoder with the mask defined above.
A = encoder.forward_npdl(X, output_mask=mask)
print(A)

In [None]:
# Random (unseeded, uniform [0, 1)) upstream gradient with the same shape as the
# forward output, pushed back through the encoder with the same mask.
dA = np.random.rand(*(A.shape))
dX = encoder.backward_npdl(dA, output_mask=mask)
print(dX)

# Transformer encoder

In [None]:
from model.Transformer import TransformerEncoder
from utils.model_loss import cross_entropy_loss_npdl

In [None]:
# Toy 3x3x4 input batch and one integer target per sample (values in {0, 1, 2}).
X = np.array([[[0.1,0.2,0.1,0.5], [0,0,0,0], [0.1,0.2,0.3,0.2]], [[0.2,0.3,0.1,0.1], [0.1,0.1,0.1,0.1], [0.1,0.2,0.3,0.2]], [[0,0,0,0], [0.2,0.1,0.1,1.5], [0.1,0.2,0.3,0.2]]])
t = np.array([1, 0, 2])

In [None]:
# NOTE(review): the positional arguments presumably mean (number of blocks, sequence
# length, embedding dim, hidden dim, number of classes) — confirm against model.Transformer.
model = TransformerEncoder(2, 3, 4, 8, 3)
# Register cross-entropy as the loss used by model.calculate_loss.
model.add_loss(cross_entropy_loss_npdl)

In [None]:
# Forward pass of the toy batch; A is later fed to calculate_loss as the scores.
A = model.forward_npdl(X)
print(A)

In [None]:
# Random (unseeded) upstream gradient with the same shape as the model output.
dA = np.random.rand(*A.shape)
print(dA)

In [None]:
# Back-propagate the random gradient through the whole model.
dX = model.backward_npdl(dA)
print(dX)

In [None]:
# Collect whatever model.parameters() exposes (presumably the trainable weights — confirm).
params = model.parameters()

In [None]:
# Evaluate the registered loss on the scores A against targets t.
# NOTE(review): the trailing 0.0 is presumably a regularisation strength — confirm.
loss, dScores, softmax_output = model.calculate_loss(A, t, 0.0)

# Sentiment analysis

In [None]:
import numpy as np
import pandas as pd
from bpemb import BPEmb

In [None]:
# Dataset: Stanford Sentiment Treebank V1.0
# dictionary.txt is read without a header row; its two '|'-separated columns are
# named phrase / phrase_id.
dictionary = pd.read_csv('datasets/stanfordSentimentTreebank/dictionary.txt', header=None, sep='|')
dictionary = dictionary.rename(columns={0:'phrase', 1:'phrase_id'})

# Train/valid/test assignment; the later cells rely on its sentence_index and
# splitset_label columns.
dataset_split = pd.read_csv('datasets/stanfordSentimentTreebank/datasetSplit.txt', sep=',')

# Tab-separated sentences; the later cells rely on its sentence_index and sentence columns.
dataset_sentences = pd.read_csv('datasets/stanfordSentimentTreebank/datasetSentences.txt', sep='\t')

# Sentiment score per phrase id; the columns are renamed so they can be joined
# on phrase_id and read as `sentiment`.
dataset_labels = pd.read_csv('datasets/stanfordSentimentTreebank/sentiment_labels.txt', sep='|')
dataset_labels = dataset_labels.rename(columns={'phrase ids':'phrase_id', 'sentiment values':'sentiment'})

In [None]:
# Look up the phrase_id of every sentence by exact text match against the
# phrase dictionary; the redundant `phrase` column is dropped afterwards.
sentences_merged = dataset_sentences.merge(
    dictionary,
    how='left',
    left_on='sentence',
    right_on='phrase',
).drop(columns=['phrase'])

# Discard sentences that found no match in the dictionary (phrase_id is null
# after the left join).
sentences_clean = sentences_merged[sentences_merged.phrase_id.notnull()]

In [None]:
# Get the labels: attach the sentiment score of each sentence through its
# phrase_id, then drop the join key.
sentences_with_labels = sentences_clean.merge(dataset_labels, on='phrase_id', how='left').drop(columns=['phrase_id'])

In [None]:
# Train / validation / test split: attach each sentence's split assignment
# (inner join on sentence_index, pandas' default `how`).
sentences_split = sentences_with_labels.merge(dataset_split, on='sentence_index')

# Embeddings

In [None]:
# English BPEmb model: 50-dimensional vectors (dim=50), vocabulary size 100000 (vs=100000).
bpemb_en = BPEmb(lang="en", dim=50, vs=100000)

In [None]:
def call_embed(value, embedder, max_length):
    """Embed `value` and zero-pad the result to `max_length` rows.

    Args:
        value: Input forwarded unchanged to ``embedder.embed``.
        embedder: Object exposing ``embed(value)`` that returns a 2-D NumPy array.
        max_length: Number of rows the returned array must have.

    Returns:
        The embedding with ``max_length - rows`` all-zero rows appended at the
        end; the second axis is left untouched.

    Raises:
        ValueError: Propagated from ``np.pad`` when the embedding already has
            more than ``max_length`` rows (the pad width would be negative).
    """
    vectors = embedder.embed(value)
    missing_rows = max_length - vectors.shape[0]
    row_padding = (0, missing_rows)
    column_padding = (0, 0)
    return np.pad(vectors, (row_padding, column_padding), mode='constant', constant_values=0)

def get_longest(value, embedder):
    """Return the number of rows in the embedding of `value`.

    Args:
        value: Input forwarded unchanged to ``embedder.embed``.
        embedder: Object exposing ``embed(value)`` that returns an array with a
            ``shape`` attribute.

    Returns:
        ``shape[0]`` of the embedding produced for `value`.
    """
    return embedder.embed(value).shape[0]

def convert_sentiment(value):
    """Bucket a continuous sentiment score into a 3-way class id.

    Args:
        value: Sentiment score to bucket.

    Returns:
        0 when ``value <= 0.4``, 1 when ``0.4 < value <= 0.6``, otherwise 2.
    """
    # Inclusive upper bounds of classes 0 and 1, checked in ascending order;
    # anything that fits neither falls through to class 2.
    for label, upper_bound in enumerate((0.4, 0.6)):
        if value <= upper_bound:
            return label
    return 2
    

# First pass: embed every sentence only to record how many rows its embedding has.
sentences_split['len'] = sentences_split.apply(lambda x: get_longest(x['sentence'], bpemb_en), axis=1)

# The longest sentence defines the common padded length used below.
max_len = sentences_split.len.max()
print(max_len)

# Second pass: embed again, this time zero-padding each sentence to max_len rows,
# and bucket the continuous sentiment score into a class id (0, 1 or 2).
sentences_split['embedding'] = sentences_split.apply(lambda x: call_embed(x['sentence'], bpemb_en, max_len), axis=1)
sentences_split['sentiment_label'] = sentences_split.apply(lambda x: convert_sentiment(x['sentiment']), axis=1)

In [None]:
# Split rows by splitset_label (1 -> train, 2 -> valid, 3 -> test) and drop the
# label column from each resulting frame.
train, valid, test = (
    sentences_split[sentences_split.splitset_label == split_id].drop(columns=['splitset_label'])
    for split_id in (1, 2, 3)
)

In [None]:
# Stack the per-sentence padded embeddings of each split into a single array.
train_data = np.array(train.embedding.tolist())
valid_data = np.array(valid.embedding.tolist())
test_data = np.array(test.embedding.tolist())

# Matching integer class labels for each split.
train_labels = np.array(train.sentiment_label.tolist())
valid_labels = np.array(valid.sentiment_label.tolist())
test_labels = np.array(test.sentiment_label.tolist())

# NOTE(review): the test split is appended to the training set here, so test_data
# is no longer held out — any later evaluation on test_data is on data seen in
# training. Confirm this is intentional.
train_data = np.concatenate((train_data, test_data), axis=0)
train_labels = np.concatenate((train_labels, test_labels), axis=0)

# Transformer encoder model

In [None]:
from model.Transformer import TransformerEncoder
from utils.model_loss import cross_entropy_loss_npdl
from model.Solver import check_accuracy

def create_transformer_network():
    """Build a fresh TransformerEncoder classifier with cross-entropy loss attached.

    NOTE(review): the positional constants are presumably (number of blocks,
    sequence length, embedding dim, hidden dim, number of classes); 50 matches
    the BPEmb `dim` used in this section and 58 is presumably the printed
    max_len — confirm against model.Transformer and the data.

    Returns:
        The newly constructed model, with ``cross_entropy_loss_npdl`` registered
        through ``add_loss``.
    """
    network = TransformerEncoder(2, 58, 50, 100, 3, num_heads=5)
    network.add_loss(cross_entropy_loss_npdl)
    return network

In [None]:
# Fresh, untrained model used for the sanity checks in the next cells.
model = create_transformer_network()

In [None]:
# Sanity check: predictions of the untrained model on the first 32 test samples.
predictions = model.predict(test_data[:32])
print(predictions)

In [None]:
# Sanity check: loss of the untrained model on the first 32 training samples.
# NOTE(review): the trailing 0.0 is presumably a regularisation strength — confirm.
scores = model.forward_npdl(train_data[:32])
loss, dScores, softmax_output = model.calculate_loss(scores, train_labels[:32], 0.0)
print(loss)

In [None]:
# Baseline accuracy of the untrained model on both sets.
# NOTE(review): 16 is presumably the evaluation batch size — confirm against model.Solver.check_accuracy.
train_accuracy = check_accuracy(train_data, train_labels, 16, model)
val_accuracy = check_accuracy(valid_data, valid_labels, 16, model)
print('Initial training accuracy: ' + str(train_accuracy))
print('Initial validation accuracy: ' + str(val_accuracy))

In [None]:
from model.Solver import epoch_solver_npdl, Adam, SGD

# Start from a freshly constructed model rather than the one used for the sanity checks.
model = create_transformer_network()

# NOTE(review): 1e-3 is presumably the learning rate — confirm against model.Solver.Adam.
optimizer = Adam(1e-3, model)

# NOTE(review): the positional 2e-3 differs from the 1e-3 given to Adam; it is
# presumably a regularisation strength rather than a learning rate — confirm
# against the signature of epoch_solver_npdl.
loss_history, train_accuracy_history, val_accuracy_history = epoch_solver_npdl(train_data, 
                                                                          train_labels,
                                                                          valid_data,
                                                                          valid_labels,
                                                                          2e-3,
                                                                          optimizer,
                                                                          lr_decay=0.99,
                                                                          batch_size=16,
                                                                          epochs=10)

# Seq2seq

In [None]:
import numpy as np
import pandas as pd
from bpemb import BPEmb
from model.Solver import epoch_solver_seq2seq

In [None]:
# Dataset: Twitter customer support
# Only the columns left after the drop (tweet_id, text, response_tweet_id are the
# ones used later) are kept.
dataset = pd.read_csv('datasets/twcs/twcs.csv').drop(columns=['author_id', 'inbound', 'created_at', 'in_response_to_tweet_id'])
dataset = dataset.set_index('tweet_id')
# Alias (not a copy) of the full frame. Later cells rebind `dataset` to a filtered
# frame, so this name keeps pointing at the unfiltered tweets used to look up replies.
dataset_responses = dataset

In [None]:
# Remove tweets with NaN response_tweet_id from dataset.
# .copy() makes the filtered frame an independent object, so the in-place
# assignment to dataset['response_tweet_id'] done on it afterwards does not
# trigger pandas' SettingWithCopyWarning.
dataset = dataset[dataset.response_tweet_id.notnull()].copy()

In [None]:
# If multiple responses to one tweet, only keep the first
def split_fct(x):
    """Return the first id of a possibly comma-separated id list, as an int.

    Args:
        x: A single id, or a string holding several ids separated by commas.

    Returns:
        ``int`` of the text before the first comma when the string form of `x`
        contains a comma, otherwise ``int(x)``.
    """
    first_id, comma, _ = str(x).partition(',')
    # Without a comma, convert the original value (not its string form), so
    # numeric inputs such as floats are handled by int() directly.
    return int(first_id) if comma else int(x)

# Normalise every response_tweet_id to a single integer (first reply only).
dataset['response_tweet_id'] = dataset['response_tweet_id'].apply(split_fct)

In [None]:
# Merge responses with questions: left-join every tweet with the tweet whose
# tweet_id equals its response_tweet_id. Columns present on both sides receive
# pandas' default suffixes: _x for the left frame (the tweet) and _y for the
# right frame (its reply). The suffixed response_tweet_id columns are then dropped.
pair_dataset = dataset.merge(dataset_responses, left_on='response_tweet_id', right_on='tweet_id', how='left').drop(columns=['response_tweet_id_x', 'response_tweet_id_y'])

In [None]:
# Keep only pairs where both the tweet text and the reply text are present.
has_question = pair_dataset.text_x.notnull()
pair_dataset = pair_dataset[has_question]
has_answer = pair_dataset.text_y.notnull()
pair_dataset = pair_dataset[has_answer]

In [None]:
# Remove @ mentions from text
def remove_mentions(x):
    """Strip every whitespace-delimited token that begins with '@' from `x`.

    Args:
        x: Text to clean.

    Returns:
        The remaining tokens joined by single spaces (runs of whitespace in the
        input are therefore collapsed).
    """
    # str.split() never yields empty tokens, so checking the prefix is safe.
    kept_words = [word for word in x.split() if not word.startswith('@')]
    return ' '.join(kept_words)

# Strip @-mentions from both the tweet (text_x) and its reply (text_y).
pair_dataset['text_x'] = pair_dataset['text_x'].apply(remove_mentions)
pair_dataset['text_y'] = pair_dataset['text_y'].apply(remove_mentions)

In [None]:
# Reduce dataset size to 5k (for speed purposes): keep the first 5000 pairs.
pair_dataset = pair_dataset[:5000]

# Embeddings (seq2seq)

In [None]:
# Smaller BPEmb model for seq2seq: 25-dimensional vectors, vocabulary size 25000.
# This rebinds the bpemb_en name used by the sentiment-analysis section.
bpemb_en = BPEmb(lang="en", dim=25, vs=25000, preprocess=True)

In [None]:
def call_embed(value, embedder, max_length):
    """Embed a source text and pad it with zero rows up to `max_length`.

    Args:
        value: Input forwarded unchanged to ``embedder.embed``.
        embedder: Object exposing ``embed(value)`` that returns a 2-D NumPy array.
        max_length: Target number of rows.

    Returns:
        The embedding followed by as many all-zero rows as needed to reach
        `max_length` rows.

    Raises:
        ValueError: Propagated from ``np.pad`` when the embedding has more than
            `max_length` rows.
    """
    embedded = embedder.embed(value)
    pad_rows = max_length - embedded.shape[0]
    return np.pad(embedded, ((0, pad_rows), (0, 0)), mode='constant', constant_values=0)

def call_embed_target(value, embedder, max_length):
    """Embed a target text including BOS/EOS markers and zero-pad it.

    Args:
        value: Text passed to ``embedder.encode_ids_with_bos_eos``.
        embedder: Object exposing ``encode_ids_with_bos_eos(value)`` and an
            ``emb.vectors`` array indexable by the returned ids.
        max_length: Target number of rows.

    Returns:
        The rows of ``embedder.emb.vectors`` selected by the ids, followed by
        all-zero rows up to `max_length` rows.

    Raises:
        ValueError: Propagated from ``np.pad`` when more than `max_length` ids
            are produced.
    """
    token_ids = embedder.encode_ids_with_bos_eos(value)
    vectors = embedder.emb.vectors[token_ids]
    missing_rows = max_length - len(vectors)
    return np.pad(vectors, ((0, missing_rows), (0, 0)), mode='constant', constant_values=0)

def tokenize(value, embedder, max_length, pad_id=1):
    """Encode `value` to token ids (with EOS) and right-pad to `max_length`.

    The ids come from the `embedder` argument; the function no longer reads the
    module-level ``bpemb_en``, so it works with whichever embedder the caller passes.

    Args:
        value: Text passed to ``embedder.encode_ids_with_eos``.
        embedder: Object exposing ``encode_ids_with_eos(value)``.
        max_length: Length of the returned id vector.
        pad_id: Id used to fill the positions after the encoded ids. Defaults
            to 1, the value that was previously hard-coded.

    Returns:
        1-D integer NumPy array of length `max_length`.

    Raises:
        ValueError: Propagated from ``np.pad`` when more than `max_length` ids
            are produced.
    """
    token_ids = embedder.encode_ids_with_eos(value)
    n_padding = max_length - len(token_ids)
    return np.pad(token_ids, (0, n_padding), 'constant', constant_values=pad_id).astype(int)

def get_longest(value, embedder):
    """Return how many rows the embedding of a source text has.

    Args:
        value: Input forwarded unchanged to ``embedder.embed``.
        embedder: Object exposing ``embed(value)`` that returns an array with a
            ``shape`` attribute.

    Returns:
        ``shape[0]`` of the embedding produced for `value`.
    """
    n_rows = embedder.embed(value).shape[0]
    return n_rows

def get_longest_target(value, embedder):
    """Return the number of token ids (BOS and EOS included) for a target text.

    The row count of ``embedder.emb.vectors[ids]`` always equals the number of
    ids, so the count is taken directly from the id list instead of building an
    embedding matrix that is thrown away immediately.

    Args:
        value: Text passed to ``embedder.encode_ids_with_bos_eos``.
        embedder: Object exposing ``encode_ids_with_bos_eos(value)``.

    Returns:
        Number of ids produced for `value`.
    """
    return len(embedder.encode_ids_with_bos_eos(value))
    

# Length of every source embedding and of every target id sequence (BOS/EOS included).
pair_dataset['len_x'] = pair_dataset.apply(lambda x: get_longest(x['text_x'], bpemb_en), axis=1)
pair_dataset['len_y'] = pair_dataset.apply(lambda x: get_longest_target(x['text_y'], bpemb_en), axis=1)

# A single padded length shared by sources and targets: the longest sequence on either side.
max_len = max(pair_dataset.len_x.max(), pair_dataset.len_y.max())
print(max_len)

# Padded embeddings: sources without special tokens, targets with BOS and EOS.
pair_dataset['embedding_x'] = pair_dataset.apply(lambda x: call_embed(x['text_x'], bpemb_en, max_len), axis=1)
pair_dataset['embedding_y'] = pair_dataset.apply(lambda x: call_embed_target(x['text_y'], bpemb_en, max_len), axis=1)

# Target token ids with EOS but without BOS, i.e. the target sequence shifted by
# one position relative to embedding_y (presumably for teacher forcing — confirm).
pair_dataset['token_id_y'] = pair_dataset.apply(lambda x: tokenize(x['text_y'], bpemb_en, max_len), axis=1)

In [None]:
# Display the prepared frame for a visual check.
pair_dataset

In [None]:
# Positional split: rows 0-3999 for training, rows 4000-4999 for validation.
train = pair_dataset[:4000]
valid = pair_dataset[4000:5000]

# Padded source embeddings.
train_data = np.array(train.embedding_x.tolist())
valid_data = np.array(valid.embedding_x.tolist())

# Padded target embeddings (with BOS/EOS).
train_targets = np.array(train.embedding_y.tolist())
valid_targets = np.array(valid.embedding_y.tolist())

# Padded target token ids, used as labels.
train_labels = np.array(train.token_id_y.tolist())
valid_labels = np.array(valid.token_id_y.tolist())

In [None]:
# Print the array shapes for a visual check against the constants hard-coded in the model below.
print(train_data.shape)
print(train_targets.shape)
print(train_labels.shape)

In [None]:
# Drop the name now that the arrays are built. The train/valid slices above may
# still reference the underlying data, so this alone may not free the memory.
del pair_dataset

# Transformer model (seq2seq)

In [None]:
from model.Transformer import Transformer
from utils.model_loss import td_cross_entropy_loss_npdl

def create_transformer_network():
    """Build a fresh seq2seq Transformer with time-distributed cross-entropy loss.

    NOTE(review): the positional constants are presumably (number of blocks,
    sequence length, embedding dim, hidden dim, vocabulary size); 25 and 25000
    match the `dim` and `vs` of the BPEmb model loaded for this section, and 99
    matches the reshape used when running single samples — confirm against
    model.Transformer.

    Returns:
        The newly constructed model, with ``td_cross_entropy_loss_npdl``
        registered through ``add_loss``.
    """
    network = Transformer(2, 99, 25, 50, 25000, num_heads=5)
    network.add_loss(td_cross_entropy_loss_npdl)
    return network

In [None]:
# Fresh, untrained seq2seq model used for the sanity checks in the next cells.
model = create_transformer_network()

In [None]:
# Sanity check on two samples: the forward pass takes the source embeddings and
# the target embeddings; the loss compares the scores with the target token ids.
# NOTE(review): the trailing 0.0 is presumably a regularisation strength — confirm.
scores = model.forward_npdl(train_data[:2], train_targets[:2])
loss, dScores, softmax_output = model.calculate_loss(scores, train_labels[:2], 0.0)
print(loss)

In [None]:
# Inspect the shape of the gradient returned by calculate_loss.
dScores.shape

In [None]:
# Back-propagate the loss gradient; the model returns two gradients
# (presumably w.r.t. the source and target inputs — confirm).
dA, dE = model.backward_npdl(dScores)

In [None]:
from model.Solver import epoch_solver_seq2seq, Adam, SGD

# Start from a freshly constructed model rather than the one used for the sanity checks.
model = create_transformer_network()

# NOTE(review): 1e-3 is presumably the learning rate — confirm against model.Solver.Adam.
optimizer = Adam(1e-3, model)

# NOTE(review): valid_targets is not passed, only valid_data and valid_labels;
# the positional 0.0 is presumably a regularisation strength — confirm both
# against the signature of epoch_solver_seq2seq.
loss_history, train_accuracy_history, val_accuracy_history = epoch_solver_seq2seq(train_data,
                                                                                  train_targets,
                                                                                  train_labels,
                                                                                  valid_data,
                                                                                  valid_labels,
                                                                                  0.0,
                                                                                  optimizer,
                                                                                  lr_decay=0.99,
                                                                                  batch_size=16,
                                                                                  epochs=4)

In [None]:
from visualization.utils import visualize_loss
# Plot the loss values recorded by the solver during training.
visualize_loss(loss_history)

In [None]:
# We can see that training would still have a long way to go and that the model
# probably does not have enough capacity for this task.
# Slicing with 1:2 selects sample 1 while keeping the leading batch axis, which
# gives the same array as reshaping to (1, 99, 25) without repeating the
# sequence length and embedding size here.
scores = model.forward_npdl(train_data[1:2], train_targets[1:2])
np.argmax(scores, axis=2)

In [None]:
# Decode training sample 1. Slicing with 1:2 keeps the leading batch axis, which
# gives the same array as reshaping to (1, 99, 25) without hard-coding the shape.
# NOTE(review): 1 and 2 are presumably the BOS and EOS token ids — confirm against Transformer.predict.
model.predict(train_data[1:2], bpemb_en, 1, 2)