In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer




In [2]:
# Load the corpus: every line holds a label followed by the utterance text.
with open("dialog_acts.dat", 'r') as file:
    data = {'label': [], 'sentence': [], 'prediction': []}

    for line in file:
        # The first whitespace-separated token is the label; the rest of the
        # line (if any) is the utterance.
        words = line.split(maxsplit=1)
        if not words:
            # Blank or whitespace-only line: there is no label to read, and
            # indexing words[0] would raise IndexError, so skip it.
            continue

        data['label'].append(words[0])
        # A label without an utterance is stored with an empty sentence.
        # The remainder still carries the line's trailing newline, so strip it.
        data['sentence'].append(words[1].rstrip() if len(words) > 1 else '')

In [3]:
# Baseline: predict "inform" for every utterance.
# A fresh list is assigned (instead of appending to the existing one) so that
# re-running this cell cannot grow 'prediction' past the number of sentences.
data["prediction"] = ["inform"] * len(data["sentence"])

In [4]:
# Exact-match accuracy (in percent) of the current predictions.
if len(data['label']) != len(data['prediction']):
    print("Error: The number of labels and predictions do not match.")
else:
    total = len(data['label'])
    correct = sum(
        1
        for gold, predicted in zip(data['label'], data['prediction'])
        if gold == predicted
    )

    # With no examples the ratio is undefined; report 0 instead of raising
    # ZeroDivisionError.
    accuracy = correct / total * 100 if total > 0 else 0
    print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 39.84%


In [5]:
def count_labels(data):
    """Print how often each label occurs in ``data['label']``.

    One ``label: count`` line is printed per distinct label, in the order in
    which the labels are first encountered. Nothing is returned.
    """
    tally = {}

    # dict.get supplies 0 the first time a label is seen.
    for name in data['label']:
        tally[name] = tally.get(name, 0) + 1

    for name in tally:
        print(f"{name}: {tally[name]}")

In [6]:
# Show how many utterances carry each label in the loaded corpus.
count_labels(data)

inform: 10160
confirm: 172
affirm: 1156
request: 6494
thankyou: 3259
null: 1612
bye: 266
reqalts: 1747
negate: 435
hello: 93
repeat: 33
ack: 28
restart: 14
deny: 27
reqmore: 5


In [7]:
# Keyword lists for the rule-based classifier, keyed by dialog-act label.
# classify_sentence tests each keyword with a substring check and returns the
# first label (in this insertion order) that has a match. Entry order therefore
# matters, and short keywords match inside longer words: 'no' also occurs in
# "know", and 'kay' already matches every sentence that contains 'okay'.
keywords = {
    'ack': ['kay', 'okay'],
    'affirm': ['yes', 'right', 'yeah'],
    'bye': ['bye'],
    'confirm': ['is it', 'does it', 'do they'],
    'deny': ['wrong', 'dont want', 'no']
}

In [8]:
def classify_sentence(sentence, keywords):
    """Assign a dialog-act label to ``sentence`` by keyword lookup.

    Args:
        sentence: Utterance text. It is lowercased before matching.
        keywords: Mapping of label -> iterable of keyword strings. Keywords are
            compared as given, so they should be lowercase.

    Returns:
        The first label (in the mapping's iteration order) with a keyword that
        occurs as a substring of the lowercased sentence, or ``'Unknown'``
        when no keyword matches.
    """
    # Lowercase once rather than once per keyword comparison.
    lowered = sentence.lower()
    for label, words in keywords.items():
        if any(word in lowered for word in words):
            return label
    return 'Unknown'  # default label when no keyword matches

In [9]:
# Replace the stored predictions with the keyword classifier's output,
# one prediction per sentence.
data['prediction'] = [classify_sentence(sentence, keywords) for sentence in data['sentence']]

In [10]:
def calculate_accuracy_filtered(labels, predictions, valid_labels):
    """Accuracy (in percent) over the examples whose true label is in ``valid_labels``.

    Args:
        labels: True labels, one per example.
        predictions: Predicted labels, aligned position-by-position with
            ``labels``.
        valid_labels: Container of labels to evaluate on. Examples whose true
            label is not in it are ignored.

    Returns:
        The percentage of retained examples whose prediction equals the true
        label, or 0 when no example is retained.

    Raises:
        ValueError: If ``labels`` and ``predictions`` differ in length.
    """
    labels = list(labels)
    predictions = list(predictions)

    # Validate the inputs themselves: zip() silently truncates to the shorter
    # sequence, so a length check made after filtering could never fail.
    if len(labels) != len(predictions):
        raise ValueError("Labels and predictions lists must be of the same length.")

    retained = [
        (gold, predicted)
        for gold, predicted in zip(labels, predictions)
        if gold in valid_labels
    ]
    total = len(retained)
    correct = sum(1 for gold, predicted in retained if gold == predicted)
    return (correct / total) * 100 if total > 0 else 0

# Evaluate only on utterances whose true label is one of the five labels the
# keyword table covers. Utterances with any other true label are excluded, so
# this figure is not comparable to the whole-corpus baseline accuracy above.
valid_labels = {'ack', 'affirm', 'bye', 'confirm', 'deny'}
accuracy = calculate_accuracy_filtered(data['label'], data['prediction'], valid_labels)
print(accuracy)

88.59915100060643


In [11]:
def classify_sentence(sentence, keywords=None):
    """Assign a dialog-act label to ``sentence`` by keyword lookup.

    This definition replaces the earlier two-argument ``classify_sentence``.
    ``keywords`` is still accepted (now optional), so both call styles used in
    this notebook keep working after this cell has run:
    ``classify_sentence(s)`` and ``classify_sentence(s, keywords)``.

    Args:
        sentence: Utterance text. It is lowercased before matching.
        keywords: Optional mapping of label -> iterable of keyword strings.
            When omitted, the notebook-level ``keywords`` table is used,
            looked up at call time.

    Returns:
        The first label (in the mapping's iteration order) with a keyword that
        occurs as a substring of the lowercased sentence, or ``'Unknown'``
        when no keyword matches.
    """
    if keywords is None:
        # The parameter shadows the notebook-level table, so fetch the global
        # explicitly; doing it here (not as a default value) picks up edits
        # made to the table after this function was defined.
        keywords = globals()['keywords']

    # Lowercase once rather than once per keyword comparison.
    lowered = sentence.lower()
    for label, words in keywords.items():
        if any(word in lowered for word in words):
            return label
    return 'Unknown'  # Default label if no keywords are matched

In [12]:
# Interactive demo of the keyword classifier; type 'exit' to leave the loop.
while True:
    # Get user input
    sentence = input("Enter a sentence to classify (or type 'exit' to stop): ")

    # Exit condition. Surrounding whitespace is ignored so that " exit" or
    # "exit " stops the loop instead of being classified as an utterance.
    if sentence.strip().lower() == 'exit':
        print("Exiting the classifier.")
        break

    # Classify the sentence
    label = classify_sentence(sentence)
    print(f"Classified as: {label}")

Enter a sentence to classify (or type 'exit' to stop):  exit


Exiting the classifier.


# Feed Forward Neural Network

In [13]:
# Convert all labels to numbers.
# The original label strings are kept under 'label_text' and the encoder is
# always fitted on them. Fitting on data['label'] directly would, on a second
# run of this cell, fit the encoder on the already-encoded integers, and
# le.inverse_transform would then return integers instead of label names.
le = LabelEncoder()
if 'label_text' not in data:
    data['label_text'] = list(data['label'])
data['label'] = le.fit_transform(data['label_text'])

In [14]:
# Sizes shared by the tokenizer, the padding step and the embedding layer.
max_words = 10000  # Vocabulary cap: used as Tokenizer num_words and Embedding input_dim
max_len = 128  # Sequence length: used as pad_sequences maxlen and Embedding input_length

In [15]:
# Tokenize all words
# NOTE(review): the tokenizer is fitted on every sentence before the
# train/test split in the next cell, so the vocabulary also reflects test
# sentences — confirm this is acceptable for the reported test accuracy.
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(data['sentence'])
# Map each sentence to a list of word indices, then pad at the end ('post')
# so every row has length max_len.
sequences = tokenizer.texts_to_sequences(data['sentence'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

In [16]:
# Hold out 20% of the examples as a test set, with a fixed random_state.
# NOTE(review): the split is not stratified although the label counts printed
# earlier are very uneven (several labels occur fewer than 30 times), so rare
# labels may be missing from one side of the split — consider whether a
# stratified split is wanted.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    padded_sequences, data['label'], test_size=0.2, random_state=42
)

In [17]:
def create_model(num_classes=15, embedding_dim=128, hidden_units=256, learning_rate=0.001):
    """Build and compile the feed-forward dialog-act classifier.

    Architecture: Embedding -> Flatten -> Dense(relu) -> Dense(softmax). The
    vocabulary size and input length are read from the notebook-level
    ``max_words`` and ``max_len`` so the model matches the tokenizer and
    padding settings.

    Args:
        num_classes: Width of the softmax output layer. The default of 15
            matches the number of distinct labels counted earlier in this
            notebook; pass ``len(le.classes_)`` to derive it from the fitted
            label encoder instead.
        embedding_dim: Size of each word embedding vector.
        hidden_units: Width of the hidden dense layer.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled ``Sequential`` model that uses sparse categorical
        cross-entropy (integer class labels) and tracks accuracy.
    """
    model = models.Sequential()

    # Embedding Layer
    model.add(layers.Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))

    # Fully connected layers: flatten the (max_len, embedding_dim) embeddings
    # into one vector per sentence before the dense layers.
    model.add(layers.Flatten())
    model.add(layers.Dense(hidden_units, activation='relu'))
    model.add(layers.Dense(num_classes, activation='softmax'))  # one unit per class

    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model

In [18]:
# Instantiate the network and print its layer/parameter summary.
model = create_model()
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 128, 128)          1280000   
                                                                 
 flatten (Flatten)           (None, 16384)             0         
                                                                 
 dense (Dense)               (None, 256)               4194560   
                                                                 
 dense_1 (Dense)             (None, 15)                3855      
                                                                 
Total params: 5478415 (20.90 MB)
Trainable params: 5478415 (20.90 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
# Train for 5 epochs in batches of 16, setting aside 20% of the training data
# for validation.
# NOTE(review): no TensorFlow random seed is set in the visible cells, so the
# trained weights and reported accuracy may differ between runs — confirm
# whether reproducibility is required.
history = model.fit(train_sentences, train_labels, epochs=5, batch_size=16, validation_split=0.2)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [20]:
# Measure loss and accuracy on the held-out test split and report the
# accuracy as a percentage.
test_loss, test_acc = model.evaluate(test_sentences, test_labels)
print(f"Test Accuracy: {test_acc * 100:.2f}%")

Test Accuracy: 98.61%


In [21]:
# Distinct values of data['label'] in ascending order. data['label'] was
# overwritten with the label encoder's output in an earlier cell, so these are
# the encoded values rather than the label names.
unique_labels = sorted(set(data['label']))

In [37]:
def classify_sentence_fnn(sentence):
    """Predict the dialog-act label of one sentence with the trained network.

    Relies on the notebook-level ``tokenizer``, ``max_len``, ``model`` and
    ``le`` objects created in earlier cells.

    Args:
        sentence: Utterance text to classify.

    Returns:
        The label obtained by mapping the highest-probability class index back
        through ``le.inverse_transform``.
    """
    # Tokenize and pad the input sentence the same way the training data was
    # prepared. max_len is used instead of repeating the literal 128 so that
    # inference cannot drift from the training-time sequence length.
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post')

    # Get prediction from the model: one row of class probabilities for the
    # single input sentence.
    prediction = model.predict(padded_sequence)

    # Get the index of the highest probability and map it back to its label.
    predicted_label_index = np.argmax(prediction, axis=1)[0]
    predicted_label = le.inverse_transform([predicted_label_index])[0]

    return predicted_label

In [38]:
import numpy as np

In [39]:
# Interactive demo of the neural classifier; type 'exit' to leave the loop.
while True:
    sentence = input("Enter a sentence to classify (or type 'exit' to stop): ")

    # Surrounding whitespace is ignored so that " exit" or "exit " stops the
    # loop instead of being sent to the model as an utterance.
    if sentence.strip().lower() == 'exit':
        print("Exiting the classifier.")
        break

    # Classify the sentence using the FNN model
    label = classify_sentence_fnn(sentence)

    print(f"Classified as: {label}")

Enter a sentence to classify (or type 'exit' to stop):  where is my suit?


12
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Classified as: request


Enter a sentence to classify (or type 'exit' to stop):  no


7
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Classified as: negate


Enter a sentence to classify (or type 'exit' to stop):  okay


8
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Classified as: null


Enter a sentence to classify (or type 'exit' to stop):  kay


0
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Classified as: ack


Enter a sentence to classify (or type 'exit' to stop):  goodbye


2
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Classified as: bye


Enter a sentence to classify (or type 'exit' to stop):  thankyou


8
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Classified as: null


Enter a sentence to classify (or type 'exit' to stop):  yes


1
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
Classified as: affirm


Enter a sentence to classify (or type 'exit' to stop):  exit


Exiting the classifier.
