# In [None]:
import random
import json
import pickle
import numpy as np
import tensorflow as tf
from nltk.stem import WordNetLemmatizer

class Chatbot:
    """Trains a bag-of-words intent classifier from a JSON intents file.

    The public entry point is :meth:`create_model`, which loads the intents,
    builds the vocabulary and training matrix, trains a small dense network
    and saves the model together with the vocabulary and class list.
    """

    def __init__(self):
        # instantiates the lemmatizer, which reduces each word to its base form
        self.__lemmatizer = WordNetLemmatizer()
        self.__vocab = []  # holds all the words that are understood by the model
        self.__classes = []  # holds all the classes (tags) from the intents file
        # pairs each tokenised pattern with its tag: ([<tokens>], <tag>)
        self.__documents = []
        # tokens to ignore (they add noise without adding value)
        self.__IGNORE_CHAR = ["?", "!", ".", ","]
        # vectorised form of all the recognised inputs, used for training
        self.__training = []
        self.__model = None  # holds the actual trained model
        self.hist = None  # value returned by the last call to model.fit()

    def create_model(self, file_to_load_from: str, file_to_save_to="model.h5"):
        """
        Creates a model from a file and saves that model to another file.

        Args:
            file_to_load_from (str): The name of the .json file the intents are loaded from, shaped as
            {"intents": [{"tag": <tag-name>, "patterns": [<inputs-to-look-for>], "responses": [<responses-of-the-machine>]}, ...]}

            file_to_save_to (str, optional): The file the model is saved to. Defaults to 'model.h5'.

        Raises:
            ValueError: If the intents file contains no patterns to train on.
        """

        self.__file_to_load_from = file_to_load_from

        self.__load_intents()  # loads the intents from the file
        self.__preprocess_data()  # preprocesses and cleans the data
        self.__create_training_data()  # creates the training data
        self.__create_model()  # creates the model
        self.__train_model()  # trains the model
        self.__save_model(model_file=file_to_save_to)  # saves the model

    def __load_intents(self):
        """
        Loads the intents from the .json file into ``self.__intents``.
        """

        # explicit encoding so the result does not depend on the platform default
        with open(self.__file_to_load_from, "r", encoding="utf-8") as file:
            self.__intents = json.load(file)

    def __preprocess_data(self) -> None:
        """
        Tokenises every pattern, builds the vocabulary and class list, and
        records each tokenised pattern together with its tag.
        """

        # start from a clean slate so create_model() can be called repeatedly
        self.__vocab = []
        self.__classes = []
        self.__documents = []

        for intent in self.__intents["intents"]:
            tag = intent["tag"]

            for pattern in intent["patterns"]:
                # NOTE: whitespace tokenisation keeps punctuation attached to
                # words ("you?" stays one token); only stand-alone punctuation
                # tokens are dropped by the ignore list below.
                word_list = pattern.split()
                self.__vocab.extend(word_list)
                self.__documents.append((word_list, tag))

                # a tag becomes a class once it has at least one pattern
                if tag not in self.__classes:
                    self.__classes.append(tag)

        # Lower-case before lemmatizing so the vocabulary matches the tokens
        # produced in __create_training_data (which are lower-cased too);
        # otherwise capitalised words could never be marked as present.
        self.__vocab = sorted(
            {
                self.__lemmatizer.lemmatize(word.lower())
                for word in self.__vocab
                if word not in self.__IGNORE_CHAR
            }
        )
        self.__classes = sorted(self.__classes)

    def __create_training_data(self):
        """
        Vectorises the documents and splits them into inputs (``train_x``,
        the bags of words) and one-hot targets (``train_y``).

        Raises:
            ValueError: If there are no documents to vectorise.
        """
        if not self.__documents:
            raise ValueError("The intents file contains no patterns to train on.")

        rows = []
        for words, tag in self.__documents:
            # lower-cased, lemmatized tokens of this pattern (set => O(1) lookups)
            present = {self.__lemmatizer.lemmatize(word.lower()) for word in words}

            # one entry per vocabulary word: 1 if it occurs in the pattern, else 0
            bag = [1 if word in present else 0 for word in self.__vocab]

            # one-hot row: the index of this document's class is set to 1
            output_row = [0] * len(self.__classes)
            output_row[self.__classes.index(tag)] = 1

            rows.append(bag + output_row)

        # shuffle so the model does not see the samples grouped by class
        random.shuffle(rows)

        # a numpy array allows the column slicing below
        self.__training = np.array(rows)

        # the first len(vocab) columns are the bag, the rest the one-hot class
        vocab_size = len(self.__vocab)
        self.__train_x = self.__training[:, :vocab_size]
        self.__train_y = self.__training[:, vocab_size:]

    def __create_model(self):
        """
        Builds and compiles the network (two hidden dense layers with dropout
        and a softmax output), ready for training.
        """

        self.__model = tf.keras.Sequential(
            [
                # first layer with 128 neurons; input_shape is the bag length
                tf.keras.layers.Dense(
                    128, input_shape=(len(self.__train_x[0]),), activation="relu"
                ),
                # dropout with rate 0.5 to reduce overfitting
                tf.keras.layers.Dropout(0.5),
                # second layer with 64 neurons
                tf.keras.layers.Dense(64, activation="relu"),
                tf.keras.layers.Dropout(0.5),
                # output layer: one neuron per class, softmax gives
                # probabilities over the classes that sum to 1
                tf.keras.layers.Dense(len(self.__train_y[0]), activation="softmax"),
            ]
        )

        # stochastic gradient descent with Nesterov momentum
        sgd = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True)

        # categorical cross-entropy matches the one-hot targets and softmax output
        self.__model.compile(
            loss="categorical_crossentropy", optimizer=sgd, metrics=["accuracy"]
        )

    def __train_model(self):
        """
        Trains the model on the prepared data and stores the result in ``self.hist``.
        """

        # epochs     => how many times the model iterates over the training set
        # batch_size => the weights are updated after every 5 samples
        # verbose    => 1 shows a progress bar during training
        self.hist = self.__model.fit(
            self.__train_x, self.__train_y, epochs=200, batch_size=5, verbose=1
        )

    def __save_model(self, model_file="model.h5"):
        """
        Saves the model, vocabulary and classes, which are needed for generating responses.

        The vocabulary is written to 'vocab.pkl' and the classes to 'classes.pkl'.

        Args:
            model_file (str, optional): The name of the file that the model will be saved to. Defaults to 'model.h5'.
        """

        # Only the path is passed: the second positional parameter of
        # Model.save() is `overwrite`, not the training history.
        self.__model.save(model_file)
        with open("vocab.pkl", "wb") as file:
            pickle.dump(self.__vocab, file)
        with open("classes.pkl", "wb") as file:
            pickle.dump(self.__classes, file)




class ResponseGeneration:
    """Generates chatbot replies using a previously trained intent model.

    A message may contain several requests joined by connectives
    (``<request> ::= <request><connective><request>``); each request is
    classified separately and the replies are joined with newlines.
    """

    # words/phrases that separate two independent requests in one message
    __CONNECTIVES = (
        "also",
        "furthermore",
        "moreover",
        "additionally",
        "in addition",
        "likewise",
        "as well as",
        "together with",
        "alongside",
        "besides",
        "plus",
        "too",
        "in the same way",
        "similarly",
        "coupled with",
        "equally",
        "and",
    )

    # reply used whenever no intent can be determined
    __FALLBACK = "I am sorry, I do not understand the question. Could you try and rephrase it for me please."

    def __init__(self, intents='intents.json', model_file='model.h5'):
        """
        Loads the intents, vocabulary, classes and trained model.

        Args:
            intents (str, optional): Path of the intents .json file. Defaults to 'intents.json'.
            model_file (str, optional): Path of the saved model. Defaults to 'model.h5'.
        """

        # instantiates the lemmatizer, which reduces each word to its base form
        self.__lemmatizer = WordNetLemmatizer()

        # reads the intents, vocabulary and classes files; the context managers
        # make sure every file handle is closed again
        with open(intents, encoding="utf-8") as file:
            self.__intents = json.load(file)

        # SECURITY NOTE: unpickling executes arbitrary code from the file, so
        # 'vocab.pkl' and 'classes.pkl' must come from a trusted source.
        with open('vocab.pkl', 'rb') as file:
            self.__vocab = pickle.load(file)
        with open('classes.pkl', 'rb') as file:
            self.__classes = pickle.load(file)

        # also loads the model from the file
        self.__model = tf.keras.models.load_model(model_file)

    def __clean(self, sentence: str) -> list:
        """
        Cleanses and tokenises the data.

        Args:
            sentence (str): The sentence to cleanse.

        Returns:
            list: The lower-cased, lemmatized tokens of the sentence.
        """

        # lower-case before lemmatizing, mirroring how the training bags were built
        return [self.__lemmatizer.lemmatize(word.lower()) for word in sentence.split()]

    def __bag_of_words(self, sentence):
        """
        Vectorises a sentence against the vocabulary.

        Args:
            sentence (str): The user input.

        Returns:
            numpy.ndarray: One entry per vocabulary word, 1 if the word occurs in the sentence, else 0.
        """

        # a set gives constant-time membership tests
        present = set(self.__clean(sentence))
        return np.array([1 if word in present else 0 for word in self.__vocab])

    def __predict_class(self, sentence):
        """
        Predicts the intents of a sentence.

        Args:
            sentence (str): The user input.

        Returns:
            list: Dictionaries {'intent': <tag>, 'probability': <str>} for every class whose
            probability exceeds 0.25, most probable first. May be empty.
        """

        # turns the input into a vector
        bow = self.__bag_of_words(sentence)

        # one probability per class; verbose=0 keeps the progress bar out of the chat
        res = self.__model.predict(np.array([bow]), verbose=0)[0]

        # only classes with a probability > 0.25 are kept
        error_thresh = 0.25
        results = [[i, r] for i, r in enumerate(res) if r > error_thresh]

        # sorts so the greatest probability is at index 0
        results.sort(key=lambda x: x[1], reverse=True)

        return [{'intent': self.__classes[r[0]], 'probability': str(r[1])} for r in results]

    def __find_response(self, intents_list):
        """
        Picks a random response for the most probable intent.

        Args:
            intents_list (list): Output of ``__predict_class``.

        Returns:
            str: A response for the top intent, or the fallback message when the list is
            empty or the tag is not present in the intents data.
        """

        # no class was confident enough
        if not intents_list:
            return self.__FALLBACK

        # gets the tag of the most probable intent
        tag = intents_list[0]['intent']

        for intent in self.__intents['intents']:
            if intent['tag'] == tag:
                return random.choice(intent['responses'])

        # cannot find
        return self.__FALLBACK

    def __split_requests(self, message):
        """
        Splits a message into individual requests at the connectives.

        Connectives are matched case-insensitively, ignoring punctuation attached to the
        token, and may span several words. Empty requests are dropped.

        Args:
            message (str): The full user message.

        Returns:
            list: The non-empty requests, in order.
        """

        tokens = message.split()
        # normalised copies used only for recognising connectives
        normalised = [token.lower().strip(",.;:!?") for token in tokens]
        # longest phrases first so "as well as" wins over a shorter connective
        phrases = sorted(
            (connective.split() for connective in self.__CONNECTIVES),
            key=len,
            reverse=True,
        )

        requests, current = [], []
        index = 0
        while index < len(tokens):
            # length of the connective starting at this token, 0 if there is none
            matched = next(
                (
                    len(phrase)
                    for phrase in phrases
                    if normalised[index:index + len(phrase)] == phrase
                ),
                0,
            )
            if matched:
                # a connective ends the current request
                if current:
                    requests.append(" ".join(current))
                    current = []
                index += matched
            else:
                current.append(tokens[index])
                index += 1

        # add the trailing request
        if current:
            requests.append(" ".join(current))

        return requests

    def get_response(self, message):
        """
        Generates the reply to a message, handling multiple requests.

        Args:
            message (str): The user message.

        Returns:
            str: One response per request, joined by newlines; the fallback message when
            the message contains no request at all.
        """

        requests = self.__split_requests(message)
        if not requests:
            return self.__FALLBACK

        # classify each request and look up a response for it
        full_response = [
            self.__find_response(self.__predict_class(request)) for request in requests
        ]

        return "\n".join(full_response)

if __name__ == "__main__":
    # Uses the default files: intents.json, model.h5, vocab.pkl and classes.pkl
    # must exist in the working directory (produced by Chatbot.create_model).
    rg = ResponseGeneration()

    # The original string literal was unterminated (a SyntaxError); it is
    # closed here and the reply is printed instead of being discarded.
    print(rg.get_response("Hello, who are you?"))
