In [6]:
data_path = "/content/dialog_acts.dat"
def cargar_dataset(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            # space split
            parts = line.strip().split(' ', 1)
            if len(parts) == 2:
                dialog_act, utterance_content = parts
                data.append((dialog_act, utterance_content))
            else:
                print(f"Incorrect line: {line}")
    return data

data = cargar_dataset(data_path)

In [7]:
def baseline_classifier(dataset):
    predictions = []
    for i in range(len(dataset)):
        predictions.append('inform')  # by default
    return predictions

#  Evaluation
def evaluate(predictions, dataset):
    correct = 0
    total = len(dataset)

    for i, (dialog_act, _) in enumerate(dataset):
        if predictions[i] == dialog_act:
            correct += 1

    accuracy = correct / total if total > 0 else 0
    print(f"Accuracy: {accuracy:.2f}")

In [8]:
predictions = baseline_classifier(data)
evaluate(predictions,data)

Accuracy: 0.40


In [9]:
def filtrar_dataset(dataset, dialog_acts_filtrar):
    dataset_filtrado = []

    for dialog_act, utterance_content in dataset:
        if dialog_act in dialog_acts_filtrar:
            dataset_filtrado.append((dialog_act, utterance_content))

    return dataset_filtrado

In [10]:
acts = ["hello", "inform", "negate", "null", "repeat"]
mydata = filtrar_dataset(data,acts)

In [11]:
# Keywords
def rule_based_classifier(dataset):
    predictions = []

    for _, utterance in dataset:

        utterance_lower = utterance.lower()

        # Rule 'hello'
        if any(keyword in utterance_lower for keyword in ['hi', 'hello', 'halo', 'welcome']):
            predictions.append('hello')

        # Rule 'inform'
        elif any(keyword in utterance_lower for keyword in ['looking', 'restaurant', 'any', 'food', 'part', 'town',
                                                           'cheap', 'expensive', 'mediterranean', 'seafood', 'east',
                                                           'west', 'north', 'south', 'asian', 'oriental', 'scottish',
                                                           'matter', 'european', 'want', 'care', 'austrian', 'center',
                                                           'corsica', 'international', 'priced', 'moderately', 'moderate',
                                                           'central', 'eirtrean', 'spanish', 'venue', 'australian', 'turkish']):
            predictions.append('inform')

        # Rule 'negate'
        elif 'no' in utterance_lower:
            predictions.append('negate')

        # Rule 'null'
        elif any(keyword in utterance_lower for keyword in ['noise', 'sil', 'cough', 'unintelligible', 'tv_noise',
                                                           'hm', 'survey', 'sorry', 'left']):
            predictions.append('null')

        # Rule 'repeat'
        elif any(keyword in utterance_lower for keyword in ['repeat', 'again', 'back']):
            predictions.append('repeat')

        # by default (inform)
        else:
            predictions.append('inform')

    return predictions


In [12]:
pred2 = rule_based_classifier(mydata)

In [13]:
evaluate(pred2,mydata)

Accuracy: 0.89


# ML Model

In [68]:
# imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [69]:
#convert text into Bag of Words representation
def preprocess_data(data):
    # sentence
    utterances = [utterance for _, utterance in data]

    # utterances into a Bag of Words representation
    vectorizer = CountVectorizer(lowercase=True)  # Remove stop_words
    X = vectorizer.fit_transform(utterances)  # Transform sentences into BoW
    return X, vectorizer

# Train the Decision Tree classifier
def train_decision_tree_classifier(X_train, y_train):
    # Adjusting hyperparameters to avoid overfitting and improve accuracy
    clf_tree = DecisionTreeClassifier(
        random_state=42,
        max_depth=20,  # Limits the depth of the tree
        min_samples_split=5,  # Minimum samples required to split an internal node
        criterion='entropy'  # Using entropy as the criterion (information gain)
    )
    clf_tree.fit(X_train, y_train)
    return clf_tree

# Evaluate the classifier's performance
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Decision Tree model accuracy: {accuracy:.2f}")
    print(classification_report(y_test, y_pred))

# Interactive classification
def classify_sentence(model, vectorizer):
    while True:
        input_sentence = input("\nEnter a sentence (or 'exit' to quit): ")
        if input_sentence.lower() == 'exit':
            break
        input_bow = vectorizer.transform([input_sentence])
        prediction = model.predict(input_bow)
        print(f"The predicted dialog act is: {prediction[0]}")


In [70]:
# Extract the labels (dialog acts) from the tuples
labels = [dialog_act for dialog_act, _ in data]

# Step 2: Preprocess the data
X, vectorizer = preprocess_data(data)

# Split the data into training and testing sets (increased test size for better generalization)
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.15, random_state=42)

# Step 3: Train the Decision Tree classifier
clf_tree = train_decision_tree_classifier(X_train, y_train)

# Step 4: Evaluate the model's performance
evaluate_model(clf_tree, X_test, y_test)

Decision Tree model accuracy: 0.95
              precision    recall  f1-score   support

         ack       0.00      0.00      0.00         5
      affirm       0.99      0.94      0.97       180
         bye       0.97      0.89      0.93        35
     confirm       0.78      0.82      0.80        22
        deny       0.00      0.00      0.00         6
       hello       1.00      0.43      0.60        14
      inform       0.91      0.99      0.95      1532
      negate       1.00      1.00      1.00        69
        null       0.98      0.72      0.83       232
      repeat       0.00      0.00      0.00         3
     reqalts       0.97      0.93      0.95       279
     reqmore       0.00      0.00      0.00         1
     request       0.99      0.97      0.98       972
     restart       0.00      0.00      0.00         2
    thankyou       1.00      1.00      1.00       474

    accuracy                           0.95      3826
   macro avg       0.64      0.58      0.60  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Interactive sentence classification
print("\nInteractive test with the Decision Tree model:")
classify_sentence(clf_tree, vectorizer)


Interactive test with the Decision Tree model:

Enter a sentence (or 'exit' to quit): more
The predicted dialog act is: inform

Enter a sentence (or 'exit' to quit): okay start over
The predicted dialog act is: inform

Enter a sentence (or 'exit' to quit): chinese
The predicted dialog act is: inform

Enter a sentence (or 'exit' to quit): cough
The predicted dialog act is: inform

Enter a sentence (or 'exit' to quit): jyf
The predicted dialog act is: inform

Enter a sentence (or 'exit' to quit): noise
The predicted dialog act is: null


In [72]:
# Accuracy - Training set
train_predictions = clf_tree.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)

# Accuracy - Test set
test_predictions = clf_tree.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Training Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Training Accuracy: 0.9522029988465974
Test Accuracy: 0.9545216936748563
