# NLP Project Team 4

In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', 500)
from sklearn.model_selection import train_test_split
from ktext.preprocess import processor

# Read Data And Preview
## Split the data into train and test sets

In [None]:
# Seed both the row sampling and the train/test split so that a fresh
# "Restart Kernel -> Run All" trains and evaluates on the same rows every time.
RANDOM_SEED = 42

# Work on a 40,000-row random sample of the issues file, then hold out 10% for testing.
issues_sample = pd.read_csv('../input/github_issues.csv').sample(n=40000, random_state=RANDOM_SEED)
traindf, testdf = train_test_split(issues_sample, test_size=.10, random_state=RANDOM_SEED)

# Raw issue bodies (model input) and issue titles (model target) as plain Python lists.
train_body_raw = traindf.body.tolist()
train_title_raw = traindf.issue_title.tolist()
traindf.head()

## Issue bodies and titles are stored in separate lists using `tolist`. The following code shows the first issue title in the list:

In [None]:
# Preview the first raw issue title in the training list
train_title_raw[0]

## Use `ktext` to pre-process data

In [None]:
# Vocabulary size requested for the issue bodies (passed to ktext as `keep_n`)
num_encoder_tokens = 10000
# Text processor for the issue bodies, configured with padding_maxlen=50
# NOTE(review): presumably bodies longer than 50 tokens are truncated -- confirm in the ktext docs
body_pp = processor(keep_n=num_encoder_tokens, padding_maxlen=50)
# Fit the processor on the training bodies and vectorize them in one step
train_body_vecs = body_pp.fit_transform(train_body_raw)

## An example of processed issue bodies

In [None]:
# Show the first training body before and after pre-processing, side by side
print('\noriginal string:\n', train_body_raw[0], '\n')
print('after pre-processing:\n', train_body_vecs[0], '\n')

In [None]:
# Instantiate a text processor for the titles, with some different parameters:
# append_indicators=True appends the tokens '_start_' and '_end_' to each document
# padding='post' means that zero padding is appended to the end of the document (default is 'pre')

# Vocabulary size requested for the issue titles (passed to ktext as `keep_n`)
num_decoder_tokens=9000
title_pp = processor(append_indicators=True, keep_n=num_decoder_tokens, 
                     padding_maxlen=12, padding ='post')

# Fit the processor on the training titles and vectorize them in one step
train_title_vecs = title_pp.fit_transform(train_title_raw)

In [None]:
# Largest token id known to the fitted title processor
max(title_pp.id2token.keys())

# Create the encoder decoder model

In [None]:
def load_encoder_inputs(vectorized_body):
    """
    Return the encoder input array together with its document length.

    Inputs: vectorized_body - 2-D array of vectorized issue bodies
    Outputs: (encoder_input_data, doc_length), where encoder_input_data is the
             array that was passed in and doc_length is the size of its second axis.
    The shape of the encoder input is printed as a side effect.
    """
    input_shape = vectorized_body.shape
    padded_len = input_shape[1]
    print(f'Shape of encoder input: {input_shape}')
    return vectorized_body, padded_len


def load_decoder_inputs(vectorized_title):
    """
    Split the vectorized titles into decoder inputs and decoder targets (teacher forcing).

    Inputs: vectorized_title - 2-D array of vectorized issue titles
    Outputs: (decoder_input_data, decoder_target_data)
             decoder_input_data  - every column except the last one
             decoder_target_data - every column except the first one, i.e. the
                                   input sequence shifted one time step ahead
    The shapes of both arrays are printed as a side effect.
    """
    # The decoder never needs the last token as an input: it only exists to be predicted.
    without_last = vectorized_title[:, :-1]
    # The target at step t is the token the decoder should emit after seeing input t.
    without_first = vectorized_title[:, 1:]

    print(f'Shape of decoder input: {without_last.shape}')
    print(f'Shape of decoder target: {without_first.shape}')
    return without_last, without_first

In [None]:
import numpy as np
# Build the arrays the seq2seq model is trained on
encoder_input_data, doc_length = load_encoder_inputs(train_body_vecs)
decoder_input_data, decoder_target_data = load_decoder_inputs(train_title_vecs)
# Re-derive both vocabulary sizes from the fitted processors as (largest token id + 1).
# This overwrites the values assigned earlier, which were only used as `keep_n`.
num_encoder_tokens = max(body_pp.id2token.keys()) + 1
num_decoder_tokens = max(title_pp.id2token.keys()) + 1

In [None]:
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding, Bidirectional, BatchNormalization
from keras import optimizers

In [None]:
# Latent dimension, chosen arbitrarily; shared by the embeddings and the GRU hidden units
latent_dim = 80

##### Define Model Architecture ######

########################
#### Encoder Model ####
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

# Word embedding for encoder (ex: Issue Body)
x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

# We do not need the `encoder_output` just the hidden state.
_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

# Encapsulate the encoder as a separate entity so we can just encode without decoding if we want to.
encoder_model = Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')

seq2seq_encoder_out = encoder_model(encoder_inputs)

########################
#### Decoder Model ####
decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

# Word Embedding For Decoder (ex: Issue Titles)
dec_emb = Embedding(num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder, using the encoder's final hidden state as its initial state.
decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense layer for prediction: exactly one output unit per decoder token id.
# `num_decoder_tokens` is already (largest title token id + 1), so it covers every id.
# Extra units would correspond to ids that are missing from `title_pp.id2token`, and
# decoding would fail with a KeyError if one of them were ever the argmax.
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
# softmax turns the layer's raw scores into a probability distribution over the token ids
decoder_outputs = decoder_dense(x)

########################
#### Seq2Seq Model ####

seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# sparse_categorical_crossentropy compares the predicted distribution with integer token-id
# labels, so the targets do not have to be one-hot encoded
seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')

# Training the model

In [None]:
from keras.callbacks import CSVLogger, ModelCheckpoint

# Prefix for the checkpoint files written during training
script_name_base = 'tutorial_seq2seq'
# The checkpoint file name embeds the epoch number and the validation loss;
# save_best_only=True is passed so that only improved models are written.
model_checkpoint = ModelCheckpoint('{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(script_name_base),
                                   save_best_only=True)

batch_size = 100
epochs = 4
# The targets get a trailing axis of size 1 via np.expand_dims before being passed to fit.
# 10% of the training data is held out for validation.
history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.10, callbacks=[model_checkpoint])

# Inference model

In [None]:
!pip install annoy
from annoy import AnnoyIndex
from tqdm import tqdm
import logging
import nltk
from nltk.translate.bleu_score import corpus_bleu

In [None]:
def extract_decoder_model(model):
    """
    Build a stand-alone decoder out of the trained seq2seq model.

    Inputs: keras model object (the trained seq2seq model)
    Outputs: a Keras model object that reuses the trained decoder layers.
    Inputs of the returned model:
    1: the embedding index for the last predicted word or the <Start> indicator
    2: the last hidden state
    Outputs of the returned model:
    1: prediction (class probabilities) for the next word
    2: the hidden state of the decoder, to be fed back into the decoder at the next time step
    """
    layer = model.get_layer

    # Size of the hidden-state input, read from the decoder embedding's output width
    state_size = layer('Decoder-Word-Embedding').output_shape[-1]

    # Re-create the path from the token input up to the GRU using the trained layers
    token_in = layer('Decoder-Input').input
    embedded = layer('Decoder-Word-Embedding')(token_in)
    normalized = layer('Decoder-Batchnorm-1')(embedded)

    # New input through which the previous hidden state is fed back into the GRU
    state_in = Input(shape=(state_size,), name='hidden_state_input')

    # Calling the trained GRU layer reuses its weights; it yields the output
    # sequence and the updated hidden state
    seq_out, state_out = layer('Decoder-GRU')([normalized, state_in])

    # Re-create the prediction head on top of the GRU output
    seq_normalized = layer('Decoder-Batchnorm-2')(seq_out)
    token_probs = layer('Final-Output-Dense')(seq_normalized)

    return Model([token_in, state_in], [token_probs, state_out])

def extract_encoder_model(model):
    """
    Pull the encoder sub-model out of the Sequence to Sequence model.

    Input: keras model object that contains a layer named 'Encoder-Model'
    Returns: that layer, i.e. the keras model that encodes an issue body
             into the encoder's last hidden state
    """
    return model.get_layer('Encoder-Model')

In [None]:
class Seq2Seq_Inference(object):
    """
    Inference helper around a trained seq2seq model.

    Provides title generation for a raw issue body, pretty-printed demos,
    an annoy-based recommender of similar issues built on the encoder
    embeddings, and BLEU evaluation on a hold-out set.
    """

    def __init__(self,
                 encoder_preprocessor,
                 decoder_preprocessor,
                 seq2seq_model):
        """
        Inputs: encoder_preprocessor - fitted text processor for issue bodies
                decoder_preprocessor - fitted text processor for issue titles
                seq2seq_model - trained keras seq2seq model
        """
        self.pp_body = encoder_preprocessor
        self.pp_title = decoder_preprocessor
        self.seq2seq_model = seq2seq_model
        self.encoder_model = extract_encoder_model(seq2seq_model)
        self.decoder_model = extract_decoder_model(seq2seq_model)
        self.default_max_len_title = self.pp_title.padding_maxlen
        # Both are populated by prepare_recommender(); until then no recommendations are shown.
        self.nn = None
        self.rec_df = None

    def generate_issue_title(self,
                             raw_input_text,
                             max_len_title=None):
        """
        Generate a title given the body of an issue using the seq2seq model.
        Inputs: raw_input_text - the body of the issue as a string
                max_len_title - the maximum number of words the model will generate
                                (defaults to the title processor's padding_maxlen)
        Returns: (encoder embedding of the body, generated title as a string)
        """
        if max_len_title is None:
            max_len_title = self.default_max_len_title
        # get the encoder's features for the decoder
        raw_tokenized = self.pp_body.transform([raw_input_text])
        body_encoding = self.encoder_model.predict(raw_tokenized)
        # Keep a reference to the encoder's embedding: `body_encoding` is rebound to the
        # decoder state below, and the embedding is reused for other tasks.
        original_body_encoding = body_encoding
        state_value = np.array(self.pp_title.token2id['_start_']).reshape(1, 1)

        decoded_sentence = []
        while True:
            preds, st = self.decoder_model.predict([state_value, body_encoding])

            # We are going to ignore indices 0 (padding) and indices 1 (unknown)
            # Argmax will return the integer index corresponding to the prediction + 2 since we chopped off first two
            pred_idx = np.argmax(preds[:, :, 2:]) + 2

            # retrieve word from index prediction
            pred_word_str = self.pp_title.id2token[pred_idx]

            # Stop on the end marker or once the title is long enough
            if pred_word_str == '_end_' or len(decoded_sentence) >= max_len_title:
                break
            decoded_sentence.append(pred_word_str)

            # update the decoder for the next word
            body_encoding = st
            state_value = np.array(pred_idx).reshape(1, 1)

        return original_body_encoding, ' '.join(decoded_sentence)


    def print_example(self,
                      i,
                      body_text,
                      title_text,
                      url,
                      threshold):
        """
        Print one issue, its generated title and (if a recommender was prepared) similar issues.
        Inputs: i - example number; the banner is skipped when it is falsy (e.g. 0 or None)
                body_text - the body of the issue
                title_text - the original title (skipped when falsy)
                url - the issue url (skipped when falsy)
                threshold - maximum distance for an issue to be listed as similar
        """
        if i:
            print('\n\n==============================================')
            print(f'============== Example # {i} =================\n')

        if url:
            print(url)

        print(f"Issue Body:\n {body_text} \n")

        if title_text:
            print(f"Original Title:\n {title_text}")

        emb, gen_title = self.generate_issue_title(body_text)
        print(f"\n****** Machine Generated Title (Prediction) ******:\n {gen_title}")

        # Explicit None check: do not rely on the truthiness of the index object
        if self.nn is not None:
            # return neighbors and distances
            n, d = self.nn.get_nns_by_vector(emb.flatten(), n=4,
                                             include_distances=True)
            # The closest hit is dropped.
            # NOTE(review): this presumably assumes the query issue itself is in the index;
            # for an issue that is not indexed it discards the best real match -- confirm intent.
            neighbors = n[1:]
            dist = d[1:]

            # `dist` is empty when the index holds fewer than two items; min() would raise then
            if dist and min(dist) <= threshold:
                cols = ['issue_url', 'issue_title', 'body']
                dfcopy = self.rec_df.iloc[neighbors][cols].copy(deep=True)
                dfcopy['dist'] = dist
                similar_issues_df = dfcopy.query(f'dist <= {threshold}')

                print("\n** Similar Issues (using encoder embedding) **:\n")
                display(similar_issues_df)


    def demo_model_predictions(self,
                               n,
                               issue_df,
                               threshold=1):
        """
        Pick n random Issues and display predictions.
        Input: n- Number of issues to display from issue_df
               issue_df- pandas DataFrame that contains the columns `body`, `issue_title` and `issue_url`.
               threshold- float distance threshold for recommendation of similar issues.
        Output: Prints the original issue body and the model's prediction.
        """
        # Extract body and title from DF
        body_text = issue_df.body.tolist()
        title_text = issue_df.issue_title.tolist()
        url = issue_df.issue_url.tolist()

        if len(body_text) == 1:
            # A single-row frame is always shown once, whatever n is
            demo_list = [0]
        else:
            # NOTE(review): low=1 means row 0 is never sampled (rows are drawn with replacement).
            # Kept as-is because print_example suppresses its banner for i == 0.
            demo_list = np.random.randint(low=1, high=len(body_text), size=n)
        for i in demo_list:
            self.print_example(i,
                               body_text=body_text[i],
                               title_text=title_text[i],
                               url=url[i],
                               threshold=threshold)

    def prepare_recommender(self, vectorized_array, original_df):
        """
        Use the annoy library to build recommender
        Parameters
        ----------
        vectorized_array : 2-D array of int
            The vectorized issue bodies (same format that is fed into the
            seq2seq model for training); one row per issue.
        original_df : pandas.DataFrame
            This is the original dataframe that has the columns
            ['issue_url', 'issue_title', 'body']
        Returns
        -------
        annoy.AnnoyIndex  object (see https://github.com/spotify/annoy)
        """
        self.rec_df = original_df
        # Roughly 200 batches; clamp to 1 because fewer than 200 rows would give batch_size 0
        batch_size = max(1, vectorized_array.shape[0] // 200)
        emb = self.encoder_model.predict(x=vectorized_array,
                                         batch_size=batch_size)

        f = emb.shape[1]
        # Name the metric explicitly: 'angular' is what annoy used when none was given,
        # and relying on that implicit default is deprecated.
        self.nn = AnnoyIndex(f, 'angular')
        logging.warning('Adding embeddings')
        for i in tqdm(range(len(emb))):
            self.nn.add_item(i, emb[i])
        logging.warning('Building trees for similarity lookup.')
        self.nn.build(50)
        return self.nn

    def evaluate_model(self, holdout_bodies, holdout_titles):
        """
        Method for calculating BLEU Score.
        Parameters
        ----------
        holdout_bodies : List[str]
            These are the issue bodies that we want to summarize
        holdout_titles : List[str]
            This is the ground truth we are trying to predict --> issue titles
        Returns
        -------
        bleu : float
            The corpus-level BLEU score exactly as returned by nltk's corpus_bleu
            (no rescaling is applied)
        """
        actual, predicted = list(), list()
        assert len(holdout_bodies) == len(holdout_titles)
        num_examples = len(holdout_bodies)

        logging.warning('Generating predictions.')
        # step over the whole set TODO: parallelize this
        for i in tqdm(range(num_examples)):
            _, yhat = self.generate_issue_title(holdout_bodies[i])

            # Tokenize reference and prediction with the same title processor
            actual.append(self.pp_title.process_text([holdout_titles[i]])[0])
            predicted.append(self.pp_title.process_text([yhat])[0])
        # calculate BLEU score
        logging.warning('Calculating BLEU.')

        # must be careful with nltk api for corpus_bleu!
        # expects List[List[List[str]]] for ground truth, using List[List[str]] will give you
        # erroneous results.
        bleu = corpus_bleu([[a] for a in actual], predicted)
        # Return the score unscaled: multiplying it by an arbitrary factor would inflate the metric.
        return bleu

# Testing results

In [None]:
# Wrap the trained model and both fitted text processors in the inference helper
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                 decoder_preprocessor=title_pp,
                                 seq2seq_model=seq2seq_Model)

In [None]:
# Show generated titles for 5 randomly chosen issues from the held-out test split
seq2seq_inf.demo_model_predictions(n=5, issue_df=testdf)

In [None]:
# Preview the first rows of the held-out test split
testdf.head()

# Custom Input

In [None]:
# Hand-written example issue, packed into a one-row frame with the
# columns that demo_model_predictions reads: issue_url, issue_title and body
url='https://github.com/github/hub/issues/2634'
title='React-native cant run on AVD'
body='Hi there I recently started working with react native when I started I completely follow the setup docs at https://reactnative.dev/docs/getting-started so after that I tried running the app on an android emulator everything is good I also installed the SDK and have android revision 29 also installed intel 86_64 system image now I got this error also have the environment variables set up even though getting this and im using vs code for devloping'
data = [[url, title, body]]
customdf=pd.DataFrame(data, columns = ['issue_url','issue_title', 'body'])
customdf.head()

In [None]:
# Generate a title for the custom issue currently held in `customdf`
seq2seq_inf.demo_model_predictions(n=1, issue_df=customdf)

In [None]:
# Second hand-written example issue; this rebinds `url`, `title`, `body`, `data` and `customdf`
url='https://github.com/github/hub/issues/2627'
title='How to display the data from the api response in flutter if its not in array format'
body='The problem Im trying to solve:I have problem in displaying the response data in flutter and i am not able to display it without list format How I imagine hub could expose this functionality:'
data = [[url, title, body]]
customdf=pd.DataFrame(data, columns = ['issue_url','issue_title', 'body'])
customdf.head()

In [None]:
# Generate a title for the custom issue currently held in `customdf`
seq2seq_inf.demo_model_predictions(n=1, issue_df=customdf)

# Similar titles prediction

In [None]:
# Read All 5M data points
all_data_df = pd.read_csv('../input/github_issues.csv').sample(n=200)
# Extract the bodies from this dataframe
all_data_bodies = all_data_df['body'].tolist()

# transform all of the data using the ktext processor
all_data_vectorized = body_pp.transform_parallel(all_data_bodies)

In [None]:
import dill as dpickle
# Save the vectorized bodies to disk with dill
with open('all_data_vectorized.dpkl', 'wb') as f:
    dpickle.dump(all_data_vectorized, f)

In [None]:
# Second inference helper, this one with a similar-issue recommender attached
seq2seq_inf_rec = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                    decoder_preprocessor=title_pp,
                                    seq2seq_model=seq2seq_Model)
# Index the encoder embeddings of the sampled issues for nearest-neighbour lookup
recsys_annoyobj = seq2seq_inf_rec.prepare_recommender(all_data_vectorized, all_data_df)

In [None]:
# Generate a title for the custom issue and list indexed issues within distance 1 of it
seq2seq_inf_rec.demo_model_predictions(n=1, issue_df=customdf, threshold=1)

# BLEU Score Calculation

In [None]:
# One hand-written (title, body) pair, wrapped in single-element lists for evaluate_model
title='Have an option to Choose from one or more github accounts or a login feature and a logout feature '
body='I use two GitHub accounts in my system when I do hub create the hub is creating in the repo in my work account. to switch between them I need to remove the hub file and then re-auth. is there a fix for it already?'
title_list=[title]
body_list=[body]

In [None]:
# Score the generated title against the reference title for this single example
seq2seq_inf.evaluate_model(holdout_bodies=body_list, holdout_titles = title_list)