#This Notebook is part of a homework assignment in NLP on an ATIS Dataset
###Different Types of models were implemented to perform the text classification of intenets on the given dataset


1.   Machine learning based models like logistic regression, NaiveBayesClassifier and SVM were implemented
2.   Two Rule based models were implemented to classify the intents, one was implemented on whole dataset and another one on top 8 most occuring intents
3.   Neural network and DNN models were also implemented as part of this homework.

At the end of the notebook, accuracy of all the models were compared.



In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv("https://raw.githubusercontent.com/PacktPublishing/Mastering-spaCy/main/Chapter06/data/atis_intents.csv")

df.columns = ['intent', 'utterance']
df.head()
log = pd.DataFrame(columns=['model','accuracy'])
res,acc = [],[]

In [None]:
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#Logistic Regression model- 1




In [None]:
# Preprocess the data
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(df["utterance"])
y = df["intent"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

res.append('LR1')
acc.append(accuracy)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.92


#NaiveBayes Classifier model - 2

In [None]:
import pandas as pd
import spacy
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


data = pd.read_csv('https://raw.githubusercontent.com/PacktPublishing/Mastering-spaCy/main/Chapter06/data/atis_intents.csv')
data.columns = ['intent', 'utterance']

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

nlp = spacy.load("en_core_web_sm")

# Define the function to preprocess the text
def preprocess(text):
    doc = nlp(text)
    return [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct]

def bag_of_words(text):
    words = preprocess(text)
    return dict([(word, True) for word in words])

train_set = [(bag_of_words(text), intent) for text, intent in zip(train_data['utterance'], train_data['intent'])]
test_set = [(bag_of_words(text), intent) for text, intent in zip(test_data['utterance'], test_data['intent'])]

classifier = nltk.NaiveBayesClassifier.train(train_set)


y_pred = [classifier.classify(features) for features, _ in test_set]

# Compute the accuracy
accuracy = accuracy_score([intent for _, intent in test_set], y_pred)

res.append('NB')
acc.append(accuracy)
print("Accuracy:", accuracy)


Accuracy: 0.47690763052208834


#Machine Learning model SVC - 3

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Load the data
data = pd.read_csv('https://raw.githubusercontent.com/PacktPublishing/Mastering-spaCy/main/Chapter06/data/atis_intents.csv')
data.columns = ['intent','text']


train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()


X_train = vectorizer.fit_transform(train_data['text'])
y_train = train_data['intent']


X_test = vectorizer.transform(test_data['text'])
y_test = test_data['intent']


svm = SVC()

# Define the hyperparameters to search over
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10, 100]
}

grid_search = GridSearchCV(svm, param_grid, cv=5)
grid_search.fit(X_train, y_train)


print('Best hyperparameters:', grid_search.best_params_)


y_pred = grid_search.predict(X_test)
report = classification_report(y_test, y_pred, zero_division=1)
report1 = classification_report(y_test, y_pred, zero_division=1,output_dict=True)
res.append('SVM')
acc.append(report1['accuracy'])

print(report)




The least populated class in y has only 1 members, which is less than n_splits=5.



Best hyperparameters: {'C': 10, 'kernel': 'linear'}
                               precision    recall  f1-score   support

            atis_abbreviation       1.00      0.93      0.96        29
                atis_aircraft       0.86      0.92      0.89        13
                 atis_airfare       0.90      0.90      0.90        84
atis_airfare#atis_flight_time       1.00      0.00      0.00         1
                 atis_airline       0.88      0.92      0.90        25
  atis_airline#atis_flight_no       1.00      0.00      0.00         1
                 atis_airport       1.00      1.00      1.00         3
                atis_capacity       0.00      0.00      0.00         2
                    atis_city       1.00      0.50      0.67         4
                atis_distance       1.00      1.00      1.00         3
                  atis_flight       0.98      0.98      0.98       745
     atis_flight#atis_airfare       0.50      0.33      0.40         3
               atis_flig

#Rule Based model on top 8 inetents -4




In [None]:
import re
import pandas as pd
from sklearn.metrics import classification_report

def classify_intent(text):
    # Define patterns for each intent
    patterns = {
    "atis_flight": r"\b(flight|flight number|flight numbers|depart|departure|arrive|arrival)\b",
    "atis_airfare": r"\b(fare|price|cost)\b",
    "atis_ground_service": r"\b(rental car|car rental|rent car|shuttle|taxi|cab|limousine|limo)\b",
    "atis_airline": r"\b(airline|airlines|flight carrier)\b",
    "atis_abbreviation": r"\b(abbr|abbreviate|abbreviation)\b",
    "atis_aircraft": r"\b(aircraft|plane)\b",
    "atis_flight_time": r"\b(arrival time|departure time|time|duration|length)\b",
    "atis_quantity": r"\b(how many|how much)\b"
      }



    # Check if text matches any of the patterns
    for intent, pattern in patterns.items():
        if re.search(pattern, text, re.IGNORECASE):
            return intent

    # If no pattern is matched, return None
    return "unknown"

# Load the dataset
data = pd.read_csv('https://raw.githubusercontent.com/PacktPublishing/Mastering-spaCy/main/Chapter06/data/atis_intents.csv')
data.columns = ["intent", "text"]

# Filter the dataset based on specific intent labels
intent_labels = ['atis_flight', 'atis_airfare', 'atis_ground_service', 'atis_airline',
               'atis_abbreviation', 'atis_aircraft', 'atis_flight_time', 'atis_quantity']
df = data[data['intent'].isin(intent_labels)]

# Apply the rule-based model to each text in the dataset and store the predictions
predictions = []
for text in df['text']:
    intent = classify_intent(text)
    predictions.append(intent)

# Filter the predictions based on the same intent labels we used to filter the df
filtered_predictions = [p if p in intent_labels else "unknown" for p in predictions]

# Print the classification report

report = classification_report(df['intent'], filtered_predictions, zero_division=1,output_dict=True)
report1 = classification_report(df['intent'], filtered_predictions, zero_division=1)
res.append('RBM-1')
acc.append(report['accuracy'])
print(report1)
print(acc)

                     precision    recall  f1-score   support

  atis_abbreviation       1.00      0.03      0.07       147
      atis_aircraft       0.94      0.58      0.72        81
       atis_airfare       0.69      0.45      0.54       423
       atis_airline       0.39      0.92      0.55       157
        atis_flight       0.92      0.32      0.47      3665
   atis_flight_time       0.29      0.07      0.12        54
atis_ground_service       1.00      0.07      0.13       255
      atis_quantity       0.83      0.57      0.67        51
            unknown       0.00      1.00      0.00         0

           accuracy                           0.33      4833
          macro avg       0.67      0.45      0.36      4833
       weighted avg       0.88      0.33      0.45      4833

[0.9186746987951807, 0.47690763052208834, 0.33167804676184565]


# Rule Based model on complete dataset -5

In [None]:
import pandas as pd
import spacy
from spacy.matcher import Matcher

# Load the dataset and rename columns
df = pd.read_csv("https://raw.githubusercontent.com/PacktPublishing/Mastering-spaCy/main/Chapter06/data/atis_intents.csv")
df.columns = ["intent", "utterance"]

# Load the English language model in spacy
nlp = spacy.load("en_core_web_sm")

# Define the rules for the different intents
rules = {
    "atis_flight": [{"LOWER": "flight"}],
    "atis_flight_time": [{"LOWER": "flight"}, {"LOWER": "time"}],
    "atis_airfare": [{"LOWER": "airfare"}],
    "atis_aircraft": [{"LOWER": "aircraft"}],
    "atis_ground_service": [{"LOWER": "ground"}, {"LOWER": "service"}],
    "atis_airport": [{"LOWER": "airport"}],
    "atis_airline": [{"LOWER": "airline"}],
    "atis_distance": [{"LOWER": "distance"}],
    "atis_abbreviation": [{"LOWER": "abbreviation"}],
    "atis_ground_fare": [{"LOWER": "ground"}, {"LOWER": "fare"}],
    "atis_quantity": [{"LOWER": "quantity"}],
    "atis_city": [{"LOWER": "city"}],
    "atis_flight_no": [{"LOWER": "flight"}, {"LOWER": "number"}],
    "atis_meal": [{"LOWER": "meal"}],
    "atis_restriction": [{"LOWER": "restriction"}],
    "atis_cheapest": [{"LOWER": "cheapest"}],
    "atis_airline#atis_flight_no": [{"LOWER": "airline"}, {"LOWER": "flight"}, {"LOWER": "number"}],
    "atis_airfare#atis_flight_time": [{"LOWER": "airfare"}, {"LOWER": "flight"}, {"LOWER": "time"}],
    "atis_ground_service#atis_ground_fare": [{"LOWER": "ground"}, {"LOWER": "service"}, {"LOWER": "fare"}],
    "atis_flight#atis_airfare": [{"LOWER": "flight"}, {"LOWER": "airfare"}],
    "atis_capacity": [{"LOWER": "capacity"}],
}

# Initialize the matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Add the rules to the matcher
for intent, rule in rules.items():
    matcher.add(intent, [rule])

# Define a function to classify the intents
def classify_intent(text):
    doc = nlp(text.lower())
    matches = matcher(doc)
    if matches:
        match_id, start, end = matches[0]
        return nlp.vocab.strings[match_id]
    else:
        return "unknown"

# Test the classifier on the dataset
correct = 0
total = 0
for i, row in df.iterrows():
    intent = row["intent"]
    utterance = row["utterance"]
    predicted_intent = classify_intent(utterance)
    if predicted_intent == intent:
        correct += 1
    total += 1

accuracy = correct / total
res.append('RBM-2')
acc.append(accuracy)

print("Accuracy:", accuracy)


Accuracy: 0.21780188868796463


# Neural Network - 6

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

# Load the dataset
data = pd.read_csv('https://raw.githubusercontent.com/PacktPublishing/Mastering-spaCy/main/Chapter06/data/atis_intents.csv')

data.columns = ["intent", "text"]


data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the data into training and test sets
train_data = data[:int(0.8 * len(data))]
test_data = data[int(0.8 * len(data)):]

# Tokenize the text data
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_data['text'])
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
test_sequences = tokenizer.texts_to_sequences(test_data['text'])

# Pad the sequences to a fixed length
max_length = max([len(seq) for seq in train_sequences + test_sequences])
train_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=max_length)
test_sequences = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=max_length)

# one-hot encoded vectors
label_encoder = LabelEncoder()
label_encoder.fit(data['intent'])
train_labels = tf.keras.utils.to_categorical(label_encoder.transform(train_data['intent']))
test_labels = tf.keras.utils.to_categorical(label_encoder.transform(test_data['intent']))

# model architecture
inputs = Input(shape=(max_length,))
x = Dense(128, activation='relu')(inputs)
x = Dense(64, activation='relu')(x)
outputs = Dense(len(label_encoder.classes_), activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)

# model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


model.fit(train_sequences, train_labels, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(test_sequences, test_labels)
res.append('NN1')
acc.append(accuracy)
print('Test loss: {:.2f}'.format(loss))
print('Test accuracy: {:.2f}'.format(accuracy))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 2.27
Test accuracy: 0.69


# Neural Network - 7

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


data = pd.read_csv('https://raw.githubusercontent.com/PacktPublishing/Mastering-spaCy/main/Chapter06/data/atis_intents.csv')

data.columns = ["intent", "text"]


train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

train_labels = set(train_data['intent'])
test_labels = set(test_data['intent'])
common_labels = train_labels.intersection(test_labels)

train_data = train_data[train_data['intent'].isin(common_labels)]
test_data = test_data[test_data['intent'].isin(common_labels)]
# Tokenize
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

train_sequences = tokenizer.texts_to_sequences(train_data['text'])
test_sequences = tokenizer.texts_to_sequences(test_data['text'])

# Pading
maxlen = 50
train_sequences = pad_sequences(train_sequences, padding='post', maxlen=maxlen)
test_sequences = pad_sequences(test_sequences, padding='post', maxlen=maxlen)

# one-hot encoded vectors
label_encoder = tf.keras.utils.to_categorical(train_data['intent'].factorize()[0])
num_classes = label_encoder.shape[1]

# model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=50, input_length=maxlen),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])
model.summary()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


model.fit(train_sequences, label_encoder, epochs=20, batch_size=64)

# Evaluate the model on test data
test_label_encoder = tf.keras.utils.to_categorical(test_data['intent'].factorize()[0])
loss, accuracy = model.evaluate(test_sequences, test_label_encoder, verbose=0)
res.append('NN2')
acc.append(accuracy)
print('Test loss: {:.2f}'.format(loss))
print('Test accuracy: {:.2f}'.format(accuracy))


Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 50, 50)            42150     
                                                                 
 dropout_11 (Dropout)        (None, 50, 50)            0         
                                                                 
 conv1d_4 (Conv1D)           (None, 46, 128)           32128     
                                                                 
 global_max_pooling1d_4 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_25 (Dense)            (None, 128)               16512     
                                                                 
 dropout_12 (Dropout)        (None, 128)               0         
                                                     

# Neural Network - 8

In [None]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense


data = pd.read_csv('https://raw.githubusercontent.com/PacktPublishing/Mastering-spaCy/main/Chapter06/data/atis_intents.csv')

data.columns = ["intent", "text"]


train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


train_labels = set(train_data['intent'])
test_labels = set(test_data['intent'])


common_labels = train_labels.intersection(test_labels)

# Check if there are any common labels
if not common_labels:
    print('No common labels between train and test sets')
    exit()

# Filter the train and test sets to only include instances with common labels
train_intents = train_data[train_data['intent'].isin(common_labels)]
test_intents = test_data[test_data['intent'].isin(common_labels)]

# Encode labels
label_encoder = LabelEncoder()
label_encoder.fit(train_intents['intent'])
train_labels = label_encoder.transform(train_intents['intent'])
test_labels = label_encoder.transform(test_intents['intent'])

#one-hot encoded vectors
num_classes = len(common_labels)
train_labels = to_categorical(train_labels, num_classes=num_classes)
test_labels = to_categorical(test_labels, num_classes=num_classes)

# Tokenize
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_intents['text'])
vocab_size = len(tokenizer.word_index) + 1
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_intents['text'])
maxlen = 100
trunc_type='post'
padding_type='post'
embedding_dim = 100
validation_split = 0.1

train_sequences = tokenizer.texts_to_sequences(train_intents['text'])
train_sequences = pad_sequences(train_sequences, maxlen=maxlen, truncating=trunc_type, padding=padding_type)
test_sequences = tokenizer.texts_to_sequences(test_intents['text'])
test_sequences = pad_sequences(test_sequences, maxlen=maxlen, truncating=trunc_type, padding=padding_type)

# Build model
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=maxlen),
    GlobalAveragePooling1D(),
    Dense(num_classes, activation='softmax')
])
model.summary()
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train model
history = model.fit(train_sequences, train_labels, epochs=20, validation_split=validation_split, verbose=1)

# Evaluate model on test data
loss, accuracy = model.evaluate(test_sequences, test_labels, verbose=0)
res.append('NN3')
acc.append(accuracy)
print('Test loss:', loss)
print('Test accuracy:', accuracy)


Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (None, 100, 100)          84300     
                                                                 
 global_average_pooling1d_5   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_27 (Dense)            (None, 16)                1616      
                                                                 
Total params: 85,916
Trainable params: 85,916
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 0.524

# Neural Network - 9

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical



data = pd.read_csv('https://raw.githubusercontent.com/PacktPublishing/Mastering-spaCy/main/Chapter06/data/atis_intents.csv')

data.columns = ["intent", "text"]


train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

train_labels = set(train_data['intent'])
test_labels = set(test_data['intent'])
common_labels = train_labels.intersection(test_labels)

train_df = train_data[train_data['intent'].isin(common_labels)]
test_df = test_data[test_data['intent'].isin(common_labels)]


label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['intent'])
test_labels = label_encoder.transform(test_df['intent'])

num_classes = len(np.unique(train_labels))
train_labels = to_categorical(train_labels, num_classes)
test_labels = to_categorical(test_labels, num_classes)

# Tokenize text
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(train_df['text'])
vocab_size = len(tokenizer.word_index) + 1
train_sequences = tokenizer.texts_to_sequences(train_df['text'])
test_sequences = tokenizer.texts_to_sequences(test_df['text'])

# Pad sequences
maxlen = max([len(x) for x in train_sequences])
train_sequences = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=maxlen, padding='post')
test_sequences = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=maxlen, padding='post')


# model
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, 16, input_length=maxlen),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(train_sequences, train_labels, epochs=10, validation_split=0.2, verbose=1)


# Evaluate model on test data
test_loss, test_acc = model.evaluate(test_sequences, test_labels, verbose=1)
res.append('NN4')
acc.append(test_acc)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.19097554683685303
Test Accuracy: 0.9437186121940613


#Comparision

In [None]:
log

Unnamed: 0,model,accuracy
0,LR1,0.918675
1,NB,0.476908
2,RBM-1,0.331678
3,RBM-2,0.217802
4,NN1,0.694779
5,NN2,0.736683
6,NN3,0.875377
7,NN4,0.943719
