### Imports

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import numpy as np
import os
import json 
import glob
import random

### Data Parsing

In [None]:
def printConvo(userData, systemData):
    """
    Print one dialogue to the terminal, alternating system and user turns.

    userData   -- dict holding the session id, the task goal text and the
                  user turns (each with a "transcription").
    systemData -- dict holding the system turns (each with an
                  "output" -> "transcript").

    The system turns drive the loop: after every system line the user turn
    at the same position is printed, so the user data must contain at least
    as many turns as the system data.
    """
    print("session id:", userData["session-id"])

    print(userData["task-information"]["goal"]["text"])

    for turnIndex, systemTurn in enumerate(systemData["turns"]):
        print("system:", systemTurn["output"]["transcript"])
        print("user:", userData["turns"][turnIndex]["transcription"])

def writeFiles(userData, systemData, conversationsFile, utterancesFile):
    """
    Append one dialogue to the two open output files.

    conversationsFile receives the readable transcript: session id, goal
    text, then alternating "system:" / "user:" lines, followed by two blank
    lines as a separator.

    utterancesFile receives one line per user turn in the form
    "<dialog act> <lower-cased utterance>", where the dialog act is the part
    of the turn's "cam" semantics that precedes the first "(".

    The system turns drive the loop; the user turn at the same position is
    looked up by index.
    """
    conversationsFile.write("session id: %s\n" % userData["session-id"])
    conversationsFile.write("%s\n" % userData["task-information"]["goal"]["text"])

    for turnIndex, systemTurn in enumerate(systemData["turns"]):
        userTurn = userData["turns"][turnIndex]
        dialogAct = userTurn["semantics"]["cam"]
        utteranceContent = userTurn["transcription"]

        conversationsFile.write("system: %s\n" % systemTurn["output"]["transcript"])
        conversationsFile.write("user: %s\n" % utteranceContent)

        # Keep only the act name, e.g. "inform(food=thai)" -> "inform".
        actName = dialogAct.split("(")[0]
        utterancesFile.write("%s %s\n" % (actName, utteranceContent.lower()))

    conversationsFile.write("\n\n")

def writeFile(waiting):
    """
    Walk every dialogue found below the current directory and either print
    it or write it to disk.

    waiting -- if true, each conversation is printed with printConvo and the
               function waits for Enter before showing the next one.
               If false, "convo.txt" and "utterances.txt" are (re)created in
               the current directory and filled through writeFiles.

    Every 'label.json' (user side) is paired with the 'log.json' (system
    side) located in the same directory. Sessions are processed in sorted
    path order so the generated files are reproducible from run to run.

    Raises FileNotFoundError if a directory holds a label.json but no
    log.json.
    """
    # glob gives no ordering guarantee, so two independent glob calls cannot
    # be matched up by position; sort the labels and derive the log path
    # from each label's own directory instead.
    userFiles = sorted(glob.glob('**/label.json', recursive=True))

    conversationsFile = None
    utterancesFile = None
    try:
        if not waiting:
            conversationsFile = open("convo.txt", "w+")
            utterancesFile = open("utterances.txt", "w+")

        for userFile in userFiles:
            systemFile = os.path.join(os.path.dirname(userFile), "log.json")
            with open(userFile) as f:
                userData = json.load(f)
            with open(systemFile) as f:
                systemData = json.load(f)

            if waiting:
                printConvo(userData, systemData)
                input('press Enter to display another chat both conversation')
            else:
                writeFiles(userData, systemData, conversationsFile, utterancesFile)
    finally:
        # Close the output files even when a session fails to load or parse.
        for handle in (conversationsFile, utterancesFile):
            if handle is not None:
                handle.close()

# Write mode: (re)generate convo.txt and utterances.txt for the cells below.
writeFile(waiting=False)

### Splitting Sets

In [None]:
def createDataSets():
    """
    Build the training and test sets from "utterances.txt".

    Each line of the file is expected to look like
    "<dialog act> <utterance>". Utterances are grouped by dialog act in file
    order, and every group is split 85-15: the first 85% (rounded down) go
    to the training set, the remainder to the test set.

    Lines without a space separator (e.g. blank lines) carry no utterance
    and are skipped.

    Returns a tuple (trainingData, testData); both are dictionaries mapping
    a dialog act to a list of utterance strings.
    """
    utterancesData = {}
    # The context manager closes the file even if reading fails part-way.
    with open("utterances.txt", "r") as utterancesFile:
        for line in utterancesFile:
            parts = line.split(" ", 1)
            if len(parts) < 2:
                continue
            dialogType, utterance = parts
            utterancesData.setdefault(dialogType, []).append(utterance.strip())

    trainingData = {}
    testData = {}
    for dialogType, utterances in utterancesData.items():
        cut = int(len(utterances) * 0.85)
        trainingData[dialogType] = utterances[:cut]
        testData[dialogType] = utterances[cut:]

    return trainingData, testData

# Run the split once; the returned (trainingData, testData) tuple is not stored here.
createDataSets()

### Rule Based Classifier

In [None]:
def ruleBasedClassifier(sentence):
    """
    Classify a sentence with a fixed keyword table.

    The categories are tried in the order in which they are listed below;
    the first category that has one of its keywords in the sentence wins.
    Keywords are matched case-insensitively and only as whole words or
    phrases, so "no" does not fire on "north" and "hi" does not fire on
    "chinese" or "anything".

    Returns the name of the matching category, or None when no keyword is
    found.
    """
    import re  # local import: the block needs nothing beyond this function

    # NOTE(review): "deny" lists "no" and is checked before "negate", whose
    # only keyword is "no", so "negate" can never be returned. Which of the
    # two should own "no" depends on the labelling scheme -- confirm.
    keywords = {"affirm":["yes", "yeah", "yea"],
                "confirm":["is it", "does it"],
                "bye": ["bye"],
                "deny":["no", "do not", "dont"],
                "hello":["hi", "hello"],
                "inform":["any", "dont care", "do not care", "doesnt matter", "does not matter","south",
                "north","east","west","centre", "center", "expensive", "moderately","moderate","cheap",
                "creative", "christmas","halal", "vegetarian","indian","cantonese","american","persian",
                "european","chinese","sea food","spanish","portuguese","italian","mediterranean","gastropub",
                "steak","bistro","british","japanese","danish","lebanese","caribbean","thai","asian","welsh",
                "french","australian","brazilian","irish","english","polynesian","corsica","vietnamese","turkish",
                "mexican","moroccan"],
                "negate":["no"],
                "repeat":["repeat"],
                "requalts": ["what about","how about","else","anything","different"],
                "reqmore":["more"],
                "request":["address","area","location","type", "type of food","price","phone","phonenumber","telephone","post code", "postcode"],
                "restart":["start over", "restart"],
                "ack" : ["okay"],
                "thankyou":["thank you", "thanks"],
                "null": ["noise", "unintelligible"]}

    normalized = sentence.lower()

    # One alternation per category: any keyword of the category, delimited
    # by word boundaries on both sides.
    for category, categoryKeywords in keywords.items():
        pattern = r"\b(?:%s)\b" % "|".join(re.escape(keyword) for keyword in categoryKeywords)
        if re.search(pattern, normalized):
            return category

    return None

### Rule Based Classifier Test

In [2]:
def testRuleBasedClassifier(testData):
    """
    Iterates though the input data, calls the function ruleBasedClassifier for each of the sentences and checks
    if it was classified correctly
    Takes the correctly classified points and divides it with the total amount of points and prints the 
    percentage  of correctly classified points with ruleBasedClassifier
   
    """

    total = 0
    amountRight = 0
    for actCategory in testData:
        total += len(testData[actCategory])
        for sentence in testData[actCategory]:
            category = ruleBasedClassifier(sentence)
            if category == actCategory:
                amountRight += 1
                
    proportion = (amountRight / total) * 100
    print("Accuracy = ", proportion, "%")

def manualRuleBasedClassifier():
    """
    Interactive loop around ruleBasedClassifier.

    Prompts for a sentence, prints the category the rule based classifier
    assigns to it, and repeats until the user types exactly 'exit'.
    """
    while True:
        print("Write the next sentence: ")
        userSentence = input()
        if userSentence == 'exit':
            return
        predictedCategory = ruleBasedClassifier(userSentence)
        print("The category of your sentence is: ", predictedCategory)
        
    
# Rebuild the 85-15 split from utterances.txt.
trainingData, testData = createDataSets()
# Print the rule based accuracy on the held-out part.
testRuleBasedClassifier(testData)
# Start the interactive prompt; it runs until 'exit' is typed.
manualRuleBasedClassifier()

NameError: name 'createDataSets' is not defined

### Proportional Based Classifier

In [None]:
def calculateProportions(trainingData):
    """
    Compute the share of every category in the training data.

    trainingData -- dictionary mapping a category to its list of sentences.

    Returns a dictionary mapping each category to the percentage (0-100) of
    all sentences that belong to it.
    """
    sentenceCount = sum(len(sentences) for sentences in trainingData.values())

    return {
        actCategory: len(sentences) * 100 / sentenceCount
        for actCategory, sentences in trainingData.items()
    }
    

def proportionalBasedClassifier(proportions):
    """
    Draw a random category, weighted by the given proportions.

    proportions -- dictionary mapping a category to its weight.

    Returns a list holding the single drawn category.
    """
    categories = list(proportions.keys())
    weights = list(proportions.values())
    return random.choices(categories, weights)
    
# Rebuild the data sets and derive the per-category percentages from the training part.
trainingData, testData = createDataSets()
proportions = calculateProportions(trainingData)

### Proportional Based Classifier Test

In [3]:
def testProportionalBasedClassifier(testData, proportions):
    """
    The function takes as in input the data and the calculated proportions 
    Loops though the data and classify each sentence with the proportionalBasedClassifier
    prints the accuracy percentage of correctly classified sentences
    """

    total = 0
    amountRight = 0
    for actCategory in testData:
        total += len(testData[actCategory])
        for sentence in testData[actCategory]:
            category = proportionalBasedClassifier(proportions)
            if category[0] == actCategory:
                amountRight += 1
                
    proportion = (amountRight / total) * 100
    print("Accuracy = ", proportion, "%")

def manualProportionalBasedClassifier(proportions):
    """
    Interactive loop around proportionalBasedClassifier.

    Prompts for a sentence and prints a category drawn at random according
    to `proportions`; the typed text is only checked against 'exit', which
    ends the loop.
    """
    while True:
        print("Write the next sentence: ")
        userSentence = input()
        if userSentence == 'exit':
            return
        drawnCategory = proportionalBasedClassifier(proportions)[0]
        print("The category of your sentence is: ", drawnCategory)
        
# Rebuild the 85-15 split and the category percentages of its training part.
trainingData, testData = createDataSets()
proportions = calculateProportions(trainingData)
# Print the accuracy of the weighted random baseline on the held-out part.
testProportionalBasedClassifier(testData, proportions)
# Start the interactive prompt; it runs until 'exit' is typed.
manualProportionalBasedClassifier(proportions)

NameError: name 'createDataSets' is not defined

### ML classifier

In [None]:
# The string literal below holds two disabled helper drafts (tokenizeSentence and
# manualMLClassifier); being a bare string, none of it is executed.
# NOTE(review): the draft refers to `numpy` and `word_extraction`, neither of which
# is bound under that name in the visible part of this file (numpy is imported as
# `np`) -- it would need fixing before being re-enabled.
'''
def tokenizeSentence(vocab, sentence):
    words = word_extraction(sentence)
    bag_vector = numpy.zeros(len(vocab))
    for w in words:
        for i,word in enumerate(vocab):
            if word == w:
                bag_vector[i] += 1
    
    return bag_vector

def manualMLClassifier(classifier, vocab):
    
    while True:
        print("Write the next sentence: ")
        sentence = input()
        if (sentence == 'exit'):
            break
        vector = tokenizeSentence(vocab, sentence)
        print("The category of your sentence is: ", classifier.predict(vector))
'''

def tokenizeData(trainingData, testData):
    """
    Turn both data sets into one bag-of-words matrix with matching labels.

    trainingData, testData -- dictionaries mapping a dialog type to a list
                              of sentences.

    The sentences of the training set are collected first, followed by those
    of the test set; a single CountVectorizer is fitted on all of them.

    Returns a tuple (X, Y): X is the dense bag-of-words array with one row
    per sentence, Y the list of dialog types in the same order.
    """
    sentences = []
    labels = []

    for dataSet in (trainingData, testData):
        for dialogType, utterances in dataSet.items():
            sentences.extend(utterances)
            labels.extend([dialogType] * len(utterances))

    bagOfWords = CountVectorizer().fit_transform(sentences)

    return bagOfWords.toarray(), labels

trainingData, testData = createDataSets()
# tokenizeData merges both sets into one matrix, so the split made below is
# independent of the 85-15 split produced by createDataSets.
X, Y = tokenizeData(trainingData, testData)
# Splits the data into train and test parts; the seed makes the split -- and
# therefore the reported accuracy -- reproducible, in line with the seeded
# classifier below.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=0)
# Creates the decision tree classifier
clf = DecisionTreeClassifier(random_state=0)
# Fits the model on the training part
clf = clf.fit(X_train, Y_train)
# Prints the accuracy of the classifier on the held-out part
print("Accuracy: ", clf.score(X_test, Y_test) * 100 , "%")