# Sequence2Sequence experiments

Seguindo a ideia em https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html cada linha do código fonte é alinhada com sua possível refatoração:

import example            no
//comment                 no
public class HelloWorld { no
                          no
public void method1() {   no
System.out.print("");     no
}                         no
                          no
public void method2() {   Rename_Method
System.out.print("");     no
}                         no
}                         no

Como um código-fonte deve ser uma sequência a ser mapeada para outra sequência (rótulos das refatorações), ele é tokenizado e linearizado. Para manter a indicação das linhas, o token "NEWLINE" é adicionado a cada quebra de linha:

```import example NEWLINE //comment NEWLINE public class HelloWorld { NEWLINE  NEWLINE public void method1 ( ) { NEWLINE System . out . print (" ") ; NEWLINE } NEWLINE  NEWLINE public void method2 ( ) { NEWLINE System . out . print (" ") ; NEWLINE } NEWLINE } NEWLINE```

que será mapeado para 

```no NEWLINE no NEWLINE no NEWLINE no NEWLINE no NEWLINE no NEWLINE no NEWLINE no NEWLINE Rename_Method NEWLINE no NEWLINE no NEWLINE no NEWLINE``` 

In [None]:
import pandas as pd
import numpy as np
import nltk

In [None]:
# Load the two CSV exports from the commons-csv output directory.
# NOTE(review): the column schemas are not visible here; later cells read
# dataset/project/refactorCommit/method/path/line from df_yes_method and
# dataset/project/parentCommit/method/refactoring from df_targets.
df_yes_method = pd.read_csv('../data/root/output/commons-csv/yes-method.csv')
df_targets = pd.read_csv('../data/root/output/commons-csv/yes.csv')

# Adds an empty "refactoring" column; nothing visible in this notebook reads
# it back (labels are taken from df_targets instead).
df_yes_method["refactoring"] = None

In [None]:
def get_refactoring_row(row):
    """Return the rows of ``df_targets`` that describe the same method as ``row``.

    A target row matches when its ``dataset``, ``project`` and ``method``
    equal those of ``row`` and its ``parentCommit`` equals
    ``row["refactorCommit"]``.  The result is a DataFrame and may be empty.
    """
    same_dataset = df_targets.dataset == row["dataset"]
    same_project = df_targets.project == row["project"]
    same_commit = df_targets.parentCommit == row["refactorCommit"]
    same_method = df_targets.method == row["method"]
    return df_targets[same_dataset & same_project & same_commit & same_method]

def tokenize_line(line):
    """Tokenize ``line`` with NLTK's ``wordpunct_tokenize`` and return the
    tokens joined by single spaces."""
    tokens = nltk.tokenize.wordpunct_tokenize(line)
    return ' '.join(tokens)

def get_file_lines(row):
    """Read the pre-refactoring copy of the file that ``row`` refers to.

    The path is built from ``row["project"]`` (with ".git" removed),
    ``row["refactorCommit"]`` and ``row["path"]``.  Returns one tokenized
    string per line of the file, in file order.
    """
    project_dir = row["project"].replace(".git", "")
    file_path = '../data/root/output/{}/storage/{}/before-refactoring/{}'.format(
        project_dir, row["refactorCommit"], row["path"])
    with open(file_path, 'r') as text_file:
        raw_lines = text_file.readlines()
    return [tokenize_line(raw_line) for raw_line in raw_lines]

def label_file(row, label):
    """Record ``label`` on the source line that ``row`` points to.

    The file is identified by "{project}-{refactorCommit}-{path}".  On first
    sight its lines are loaded with ``get_file_lines`` and every line starts
    with the label 'no'; afterwards the stored entry in ``trainset`` is reused
    so that several refactorings can be marked on the same file.

    Args:
        row: mapping with "project", "refactorCommit", "path" and a 1-based
            "line" number.
        label: label to store for that line.

    A line number outside ``1..len(lines)`` (or one that is not usable as an
    index, e.g. NaN) leaves the labels untouched and prints a warning instead
    of raising.  The entry is written to ``trainset`` in every case.
    """
    hash_label = "{}-{}-{}".format(row["project"], row["refactorCommit"], row["path"])
    entry = trainset.get(hash_label)
    if entry is None:
        lines = get_file_lines(row)
        labels = ['no'] * len(lines)
    else:
        lines = entry['lines']
        labels = entry['labels']

    labelled = False
    try:
        position = row['line'] - 1
        # Reject negative positions explicitly: Python would otherwise wrap
        # around and silently label a line counted from the end of the file.
        if 0 <= position < len(labels):
            labels[position] = label
            labelled = True
    except TypeError:
        # Line number is not an integer (e.g. NaN or a float); reported below.
        pass
    if not labelled:
        print("Could not label {}: line {!r} is outside 1..{}".format(
            hash_label, row['line'], len(labels)))

    trainset[hash_label] = {'lines': lines, 'labels': labels}

In [None]:
# Maps a file key (built in label_file as "{project}-{refactorCommit}-{path}")
# to {'lines': [...], 'labels': [...]} for that file.
trainset = dict()

In [None]:
# Walk every row and record its refactoring label on the affected file.
for index, row in df_yes_method.iterrows():
    # Rows whose path contains 'Test.java' are ignored.
    if 'Test.java' in row["path"]:
        continue
    refactoring_row = get_refactoring_row(row)
    # No matching target row: nothing to label.
    if len(refactoring_row["refactoring"]) == 0:
        continue
    # When several target rows match, only the first refactoring is used.
    label = list(refactoring_row["refactoring"])[0]
    label_file(row, label.replace(" ", "_"))

In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np

In [None]:
# Build the parallel corpora: for every file in `trainset`, one linearised
# source string and one linearised label string (lines joined by the
# ' NEWLINE ' marker), plus the token vocabulary of each side.
input_source_codes = []    # per file: list of tokenized source lines
target_source_codes = []   # per file: list of per-line labels
input_tokens = set()       # vocabulary of the source side
target_tokens = set()      # vocabulary of the label side
linear_source_codes = []   # per file: source lines joined into one string
linear_targets = []        # per file: labels joined into one string

# Each file is linearised exactly once.  The earlier version re-walked every
# previously collected file on each iteration, which produced n*(n+1)/2
# heavily duplicated samples instead of n.
for data in trainset.values():
    input_source_codes.append(data['lines'])
    target_source_codes.append(data['labels'])

    linear_source_code = ' NEWLINE '.join(data['lines'])
    linear_source_codes.append(linear_source_code)
    input_tokens.update(linear_source_code.split())

    linear_target = ' NEWLINE '.join(data['labels'])
    linear_targets.append(linear_target)
    target_tokens.update(linear_target.split())

In [None]:
# Freeze the vocabularies in a deterministic (sorted) order so that token
# indices are reproducible between runs.
input_tokens = sorted(input_tokens)
target_tokens = sorted(target_tokens)
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)
# Sequence positions are whitespace-separated tokens, not characters, so the
# lengths must be measured on the split strings.  Counting characters would
# over-allocate the time axis of the one-hot tensors several times over.
max_encoder_seq_length = max(len(txt.split()) for txt in linear_source_codes)
max_decoder_seq_length = max(len(txt.split()) for txt in linear_targets)

print('Number of samples:', len(linear_source_codes))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

In [None]:
# Token -> integer index lookups, following the sorted vocabulary order.
input_token_index = {token: position for position, token in enumerate(input_tokens)}
target_token_index = {token: position for position, token in enumerate(target_tokens)}

# Zero-initialised one-hot tensors of shape (samples, timesteps, vocabulary).
# Positions that are never set stay all-zero and act as padding.
encoder_input_data = np.zeros(
    shape=(len(linear_source_codes), max_encoder_seq_length, num_encoder_tokens),
    dtype=np.float32)
decoder_input_data = np.zeros(
    shape=(len(linear_source_codes), max_decoder_seq_length, num_decoder_tokens),
    dtype=np.float32)
decoder_target_data = np.zeros(
    shape=(len(linear_source_codes), max_decoder_seq_length, num_decoder_tokens),
    dtype=np.float32)

In [None]:
# Bare expression: shows the label-token -> index mapping as the cell output.
target_token_index

In [None]:
# One-hot encode every (source, labels) pair into the pre-allocated tensors.
# NOTE(review): the label sequences carry no dedicated start/end token, so the
# first label only ever serves as decoder input and is never a target.
for i, (linear_source_code, linear_target) in enumerate(zip(linear_source_codes, linear_targets)):
    source_sequence = linear_source_code.split()
    label_sequence = linear_target.split()

    for t, token in enumerate(source_sequence):
        encoder_input_data[i, t, input_token_index[token]] = 1.

    # The decoder input holds the label sequence as-is ...
    for t, token in enumerate(label_sequence):
        decoder_input_data[i, t, target_token_index[token]] = 1.

    # ... while the decoder target is the same sequence shifted one step
    # ahead: position t holds the label found at position t + 1 of the input.
    for t, token in enumerate(label_sequence[1:]):
        decoder_target_data[i, t, target_token_index[token]] = 1.

In [None]:
# Hyper-parameters and definition of the encoder-decoder LSTM used for
# training.  The layers created here (encoder_inputs, encoder_states,
# decoder_inputs, decoder_lstm, decoder_dense) are reused by the inference
# cell, so their names must stay as they are.
batch_size = 5  # Batch size for training.
epochs = 10  # Number of epochs to train for.
latent_dim = 128  # Latent dimensionality of the encoding space.

# Define an input sequence and process it.
# The time dimension is None so sequences of any length are accepted.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Softmax over the label vocabulary at every decoder timestep.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Configure optimizer and loss only; the actual training is started by the
# `model.fit` call in the following cell.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [None]:
# Train on the one-hot tensors: the decoder receives the true label sequence
# as input and learns to emit the same sequence shifted by one step.
# 20% of the samples are held out for validation.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)
# Persist the trained model to disk.
model.save('s2s.h5')

# inference mode

In [None]:
# Inference-time encoder: maps an input sequence to the LSTM state pair.
encoder_model = Model(encoder_inputs, encoder_states)

# Inference-time decoder: takes the previous token plus the previous states
# and returns the token distribution together with the updated states, so it
# can be driven one step at a time.  It reuses the trained decoder layers.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs,
    initial_state=decoder_states_inputs,
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs, *decoder_states_inputs],
    [decoder_outputs, *decoder_states],
)

# Index -> token lookups, used to turn predicted indices back into
# readable tokens.
reverse_input_token_index = {
    position: token for token, position in input_token_index.items()}
reverse_target_token_index = {
    position: token for token, position in target_token_index.items()}

In [None]:
def decode_sequence(input_seq):
    """Greedily decode the label sequence predicted for one encoded input.

    Args:
        input_seq: one-hot encoded source sequence for a single sample
            (a batch of size 1), as accepted by ``encoder_model.predict``.

    Returns:
        The predicted label tokens joined by single spaces, i.e. the same
        textual form as the entries of ``linear_targets``.

    Decoding is seeded with the 'NEWLINE' token and emits at most
    ``max_decoder_seq_length`` tokens.  The label vocabulary is built with
    ``str.split()`` and therefore contains no end-of-sequence token (a '\\n'
    token can never occur), so the length bound is the only exit condition.

    NOTE(review): training sequences start with the first label rather than
    with 'NEWLINE'; seeding with 'NEWLINE' may not match what the decoder saw
    during training.  Confirm the intended start token.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Target sequence of length 1 holding the seed token.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['NEWLINE']] = 1.

    # Sampling loop (batch of size 1).  Tokens are collected in a list and
    # counted as tokens; concatenating them without a separator and counting
    # characters made the output unreadable and the bound meaningless.
    decoded_tokens = []
    while len(decoded_tokens) < max_decoder_seq_length:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Greedy choice: most probable token at the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        decoded_tokens.append(reverse_target_token_index[sampled_token_index])

        # Feed the sampled token back in as the next decoder input.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.

        # Carry the decoder states over to the next step.
        states_value = [h, c]

    return ' '.join(decoded_tokens)


# Decode up to two samples from the training set as a quick sanity check.
# The count is capped by the number of available samples so the cell also
# runs when fewer than two files were labelled.
for seq_index in range(min(2, len(linear_source_codes))):
    # Slice (rather than index) to keep the leading batch dimension of 1.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', linear_source_codes[seq_index])
    print('Decoded sentence:', decoded_sentence)