In [1]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [2]:
# Load the dialog-act corpus. Every non-blank line is treated as
# "<label> <utterance>": the first whitespace-delimited token is the label and
# the rest of the line is the utterance. `prediction` starts empty and is
# filled in later by the classifiers.
data = {'label': [], 'sentence': [], 'prediction': []}

with open("dialog_acts.dat", 'r') as file:
    for line in file:
        words = line.split(maxsplit=1)
        if not words:
            # Blank / whitespace-only line: there is no label to record, and
            # indexing words[0] would raise IndexError.
            continue
        data['label'].append(words[0])
        # split(maxsplit=1) leaves the trailing newline on the remainder, so
        # strip it; a label-only line gets an empty utterance.
        data['sentence'].append(words[1].rstrip() if len(words) > 1 else '')

In [3]:
# Constant baseline: predict "inform" for every utterance.
# Assigning a fresh list (instead of appending inside a loop) keeps this cell
# idempotent: re-running it can no longer grow `prediction` beyond the number
# of sentences.
data["prediction"] = ["inform"] * len(data["sentence"])

In [4]:
# Exact-match accuracy of the current predictions against the gold labels.
# The comparison only makes sense when both lists line up one-to-one.
if len(data['label']) == len(data['prediction']):
    total = len(data['label'])
    correct = sum(
        1
        for gold, guess in zip(data['label'], data['prediction'])
        if gold == guess
    )

    accuracy = correct / total * 100
    print(f"Accuracy: {accuracy:.2f}%")
else:
    print("Error: The number of labels and predictions do not match.")

Accuracy: 39.84%


# Keywords

In [5]:
def count_labels(data):
    """Print how many times each label occurs in ``data['label']``.

    Labels are reported as ``"<label>: <count>"``, one per line, in the order
    in which each label is first encountered. Nothing is returned.
    """
    tally = {}

    # Accumulate one count per distinct label.
    for name in data['label']:
        tally[name] = tally.get(name, 0) + 1

    # Report in first-seen order (dicts preserve insertion order).
    for name, occurrences in tally.items():
        print(f"{name}: {occurrences}")

In [6]:
# Show the label distribution of the loaded corpus.
count_labels(data)

inform: 10160
confirm: 172
affirm: 1156
request: 6494
thankyou: 3259
null: 1612
bye: 266
reqalts: 1747
negate: 435
hello: 93
repeat: 33
ack: 28
restart: 14
deny: 27
reqmore: 5


In [7]:
# keywords = {
#     'ack': ['kay', 'okay'],
#     'affirm': ['yes', 'right', 'yeah'],
#     'bye': ['bye'],
#     'confirm': ['is it', 'does it', 'do they'],
#     'deny': ['wrong', 'dont want', 'no']
# }

In [8]:
# Hand-written keyword lists for the rule-based classifier, keyed by
# dialog-act label. The classify_sentence functions in this notebook test each
# keyword with a plain substring check against the lower-cased sentence, so
# short entries also match inside longer words (e.g. 'no' inside 'north',
# 'hi' inside 'this').
# 'null' has no keywords.
# NOTE(review): 'eirtrean' looks like a misspelling of 'eritrean' -- confirm
# against the corpus before changing it, since it affects matching.
keywords = {
    'ack': ['kay', 'okay'],
    'affirm': ['yes', 'right', 'yeah'],
    'thankyou': ['thank'],
    'bye': ['bye', 'goodbye'],
    'confirm': ['is it', 'does it', 'do they'],
    'deny': ['wrong', 'dont', 'not'],
    'hello': ['hi', 'hello', 'halo', 'welcome'],
    'inform': [
        'looking', 'restaurant', 'any', 'food', 'part', 'town', 'cheap', 'expensive', 
        'mediterranean', 'seafood', 'east', 'west', 'north', 'south', 'asian', 
        'oriental', 'scottish', 'matter', 'european', 'want', 'care', 'austrian', 
        'center', 'corsica', 'international', 'priced', 'moderately', 'moderate', 
        'central', 'eirtrean', 'spanish', 'venue', 'australian', 'turkish'
    ],
    'negate': ['no'],
    'null': [],
    'repeat': ['repeat', 'again', 'back'],
    'reqalts': ['how about', 'what about', 'is there', 'anything else'],
    'reqmore': ['more'],
    'request': ['address', 'phone', 'number', 'post code', 'how much', 'where', 'whats', 'what is', 'price range'],
    'restart': ['start', 'reset'],
}


In [9]:
# Superseded version: it matches against single-word tokens, so multi-word keywords (e.g. 'is it') can never match.

# def classify_sentence(sentence, keywords):
#     # Convert the sentence to lowercase and tokenize it
#     tokens = re.findall(r'\b\w+\b', sentence.lower())
    
#     # Check each label and its associated keywords
#     for label, kws in keywords.items():
#         # Check if any keyword is in the tokens
#         if any(keyword in tokens for keyword in kws):
#             return label
            
#     # Return 'null' if no keywords are found
#     return 'null'

In [10]:
def classify_sentence(sentence, keywords):
    """Label a sentence by the longest keyword it contains.

    Args:
        sentence: Utterance text; matching is case-insensitive on its side
            (the sentence is lower-cased, the keywords are used as given).
        keywords: Mapping of label -> list of keyword strings.

    Returns:
        The label owning the longest keyword that occurs as a substring of the
        lower-cased sentence. Keywords of equal length are tried in the order
        they appear in ``keywords``. ``'null'`` if nothing matches.
    """
    text = sentence.lower()

    # Flatten the mapping into (label, keyword) pairs, then order them so that
    # longer phrases are tried before shorter ones. list.sort is stable, so
    # equal-length keywords keep their original mapping order.
    candidates = [
        (label, phrase)
        for label, phrases in keywords.items()
        for phrase in phrases
    ]
    candidates.sort(key=lambda pair: len(pair[1]), reverse=True)

    # First hit wins; fall back to 'null' when no keyword is present.
    return next((label for label, phrase in candidates if phrase in text), 'null')

In [11]:
# Replace the stored predictions with the keyword classifier's label for each sentence.
data['prediction'] = [classify_sentence(sentence, keywords) for sentence in data['sentence']]

In [12]:
def calculate_accuracy_filtered(labels, predictions):
    """Return the exact-match accuracy of ``predictions`` as a percentage.

    Args:
        labels: Sequence of gold labels.
        predictions: Sequence of predicted labels, aligned with ``labels``.

    Returns:
        ``100 * correct / total`` as a float, or ``0`` when there is nothing
        to score.

    Raises:
        ValueError: If the two inputs do not have the same length.
    """
    # Materialise both inputs so their lengths can be compared directly.
    # Checking the inputs (rather than copies built from zip, which always
    # have equal length) is what makes a mismatch detectable instead of being
    # silently truncated.
    labels = list(labels)
    predictions = list(predictions)

    if len(labels) != len(predictions):
        raise ValueError("Filtered labels and predictions lists must be of the same length.")

    total = len(labels)
    if total == 0:
        return 0

    correct = sum(1 for l, p in zip(labels, predictions) if l == p)
    return (correct / total) * 100

# Overall accuracy (in percent) of the predictions currently stored in data['prediction'].
accuracy = calculate_accuracy_filtered(data['label'], data['prediction'])
print(accuracy)

81.48307909493745


In [13]:
from collections import defaultdict

def calculate_accuracy_per_label(labels, predictions):
    """Compute exact-match accuracy separately for every gold label.

    Args:
        labels: Sequence of gold labels.
        predictions: Sequence of predicted labels, paired with ``labels``
            position by position.

    Returns:
        Dict mapping each gold label (in first-seen order) to the percentage
        of its occurrences whose prediction equals the label.
    """
    seen = defaultdict(int)  # occurrences of each gold label
    hits = defaultdict(int)  # correct predictions for each gold label

    for gold, guess in zip(labels, predictions):
        seen[gold] += 1
        if gold == guess:
            hits[gold] += 1

    # One percentage per gold label, keeping the order labels were first seen.
    return {
        gold: (hits[gold] / occurrences) * 100 if occurrences > 0 else 0
        for gold, occurrences in seen.items()
    }

# Per-label accuracy of the predictions currently stored in data['prediction'].
accuracy_per_label = calculate_accuracy_per_label(data['label'], data['prediction'])

# Print the accuracy for each label.
# NOTE: the loop variables rebind the notebook-level names `label` and
# `accuracy`; after this cell `accuracy` holds the last per-label value, not
# the overall accuracy.
for label, accuracy in accuracy_per_label.items():
    print(f"Label: {label}, Accuracy: {accuracy:.2f}%")


Label: inform, Accuracy: 79.60%
Label: confirm, Accuracy: 51.16%
Label: affirm, Accuracy: 80.62%
Label: request, Accuracy: 88.10%
Label: thankyou, Accuracy: 87.11%
Label: null, Accuracy: 61.29%
Label: bye, Accuracy: 74.44%
Label: reqalts, Accuracy: 85.00%
Label: negate, Accuracy: 70.11%
Label: hello, Accuracy: 49.46%
Label: repeat, Accuracy: 100.00%
Label: ack, Accuracy: 60.71%
Label: restart, Accuracy: 92.86%
Label: deny, Accuracy: 81.48%
Label: reqmore, Accuracy: 100.00%


In [14]:
# for label, sentence, prediction in zip(data['label'], data['sentence'], data['prediction']):
#     if label == 'confirm':
#         print(f"Sentence: {sentence}, Prediction: {prediction}")

In [15]:
def classify_sentence(sentence, keyword_map=None):
    """Return the first label whose keyword list matches ``sentence``.

    This definition replaces the earlier two-argument ``classify_sentence`` in
    the notebook namespace. Unlike that version it tries labels in mapping
    order (not longest keyword first) and falls back to ``'Unknown'``.

    Args:
        sentence: Utterance text; it is lower-cased before matching.
        keyword_map: Optional mapping of label -> list of keyword strings.
            Defaults to the notebook-level ``keywords`` dict. Accepting it
            positionally keeps existing ``classify_sentence(sentence, keywords)``
            calls working after this cell has run.

    Returns:
        The first label (in mapping order) that has at least one keyword
        occurring as a substring of the lower-cased sentence, or ``'Unknown'``
        if none does.
    """
    if keyword_map is None:
        keyword_map = keywords

    # Lower-case once instead of once per keyword.
    sentence_lower = sentence.lower()

    for label, words in keyword_map.items():
        if any(word in sentence_lower for word in words):
            return label
    return 'Unknown'  # Default label if no keywords are matched

In [16]:
# Interactive demo of the keyword classifier; typing 'exit' (any case) ends it.
while True:
    sentence = input("Enter a sentence to classify (or type 'exit' to stop): ")

    if sentence.lower() != 'exit':
        # Anything other than the exit command gets classified and echoed.
        label = classify_sentence(sentence)
        print(f"Classified as: {label}")
        continue

    print("Exiting the classifier.")
    break

Enter a sentence to classify (or type 'exit' to stop):  exit


Exiting the classifier.


# Feed Forward Neural Network

In [17]:
# Convert all labels to numbers with a LabelEncoder; `le` is kept so that
# predictions can be mapped back with le.inverse_transform later.
# NOTE: this overwrites data['label'] in place, so the original label values
# are gone afterwards. Re-running earlier cells that compare data['label']
# with string predictions, or re-running this cell, will not behave as on the
# first pass -- restart and run all instead.
le = LabelEncoder()
data['label'] = le.fit_transform(data['label'])

In [18]:
# Shared settings: max_words is passed to the Tokenizer (num_words) and the
# Embedding layer (input_dim); max_len is the padded sequence length.
max_words = 10000  # Maximum number of words
max_len = 128  # Maximum sentence length

In [19]:
# Tokenize all words: build the vocabulary, turn each sentence into a list of
# word indices, then pad every sequence to max_len (padding='post').
# NOTE(review): the tokenizer is fit on every sentence before the train/test
# split, so the vocabulary also covers the held-out sentences -- consider
# fitting on the training portion only.
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(data['sentence'])
sequences = tokenizer.texts_to_sequences(data['sentence'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

In [20]:
# Hold out 20% of the padded sequences (and their encoded labels) for testing;
# random_state fixes the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified, and some labels are very rare in
# the counts printed earlier (e.g. reqmore: 5), so they may be missing from
# one side of the split -- confirm whether stratify= is wanted.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    padded_sequences, data['label'], test_size=0.2, random_state=42
)

In [21]:
def create_model(num_classes=15, embedding_dim=128, hidden_units=256, learning_rate=0.001):
    """Build and compile the feed-forward utterance classifier.

    Architecture: Embedding -> Flatten -> Dense(relu) -> Dense(softmax).
    The vocabulary size and input length are read from the notebook-level
    ``max_words`` and ``max_len`` settings.

    Args:
        num_classes: Width of the softmax output layer. The default of 15
            matches the number of distinct labels printed by ``count_labels``.
        embedding_dim: Size of each word embedding vector.
        hidden_units: Width of the hidden fully connected layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model using sparse categorical cross-entropy
        (integer class ids as targets) and reporting accuracy.
    """
    model = models.Sequential([
        # Embedding Layer
        # NOTE(review): newer Keras releases reportedly deprecate
        # `input_length`; confirm against the installed version before
        # removing it, since Flatten/Dense need a known sequence length.
        layers.Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),

        # Fully connected layers
        layers.Flatten(),
        layers.Dense(hidden_units, activation='relu'),
        layers.Dense(num_classes, activation='softmax'),  # one unit per class
    ])

    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

In [22]:
# Instantiate the classifier and print its layer / parameter summary.
model = create_model()
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 128, 128)          1280000   
                                                                 
 flatten (Flatten)           (None, 16384)             0         
                                                                 
 dense (Dense)               (None, 256)               4194560   
                                                                 
 dense_1 (Dense)             (None, 15)                3855      
                                                                 
Total params: 5478415 (20.90 MB)
Trainable params: 5478415 (20.90 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [23]:
# Train for 5 epochs, holding back 20% of the training split for validation;
# the returned history is kept in `history`.
# NOTE(review): no TensorFlow random seed is set anywhere in this notebook, so
# results may differ between runs -- confirm whether reproducibility matters.
history = model.fit(train_sentences, train_labels, epochs=5, batch_size=16, validation_split=0.2)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [24]:
# Score the trained model on the held-out test split and report accuracy as a percentage.
test_loss, test_acc = model.evaluate(test_sentences, test_labels)
print(f"Test Accuracy: {test_acc * 100:.2f}%")

Test Accuracy: 98.29%


In [25]:
# Distinct values of data['label'], in ascending order.
unique_labels = sorted(set(data['label']))

In [26]:
def classify_sentence_fnn(sentence):
    """Classify one sentence with the trained feed-forward network.

    Relies on the notebook-level ``tokenizer``, ``max_len``, ``model`` and
    ``le`` objects created in earlier cells.

    Args:
        sentence: Raw utterance text.

    Returns:
        The label obtained by passing the index of the model's
        highest-scoring class through ``le.inverse_transform``.
    """
    # Tokenize and pad the input sentence exactly as the training data was
    # prepared: use the shared max_len rather than a hard-coded length so the
    # two cannot drift apart.
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')

    # Get prediction from the model (one row of class scores for the one input)
    prediction = model.predict(padded_sequence)

    # Get the index of the highest probability and map it back to its label
    predicted_label_index = np.argmax(prediction, axis=1)[0]
    predicted_label = le.inverse_transform([predicted_label_index])[0]

    return predicted_label

In [28]:
# Interactive demo of the neural classifier; typing 'exit' (any case) ends it.
while True:
    sentence = input("Enter a sentence to classify (or type 'exit' to stop): ")

    if sentence.lower() != 'exit':
        # Classify the sentence using the FNN model
        label = classify_sentence_fnn(sentence)
        print(f"Classified as: {label}")
        continue

    print("Exiting the classifier.")
    break

Enter a sentence to classify (or type 'exit' to stop):  exit


Exiting the classifier.
