In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM

In [7]:
train = pd.read_csv('datasets/hwu/train.csv')
val = pd.read_csv('datasets/hwu/val.csv')

In [8]:
print("Train shape:", train.shape)
print("Validation shape:", val.shape)
train.head()

Train shape: (8954, 2)
Validation shape: (1076, 2)


Unnamed: 0,text,category
0,what alarms do i have set right now,alarm_query
1,checkout today alarm of meeting,alarm_query
2,report alarm settings,alarm_query
3,see see for me the alarms that you have set to...,alarm_query
4,is there an alarm for ten am,alarm_query


In [22]:
le = LabelEncoder()
train_labels = le.fit_transform(train['category'])
val_labels = le.transform(val['category'])
num_classes = len(set(train_labels))
num_classes

64

### TF-IDF + Logistic Regression

In [10]:
tfidf_lr_pipeline = make_pipeline(
    TfidfVectorizer(max_features=5000),
    LogisticRegression(max_iter=1000)
)

In [11]:
tfidf_lr_pipeline.fit(train['text'], train_labels)

In [12]:
y_pred = tfidf_lr_pipeline.predict(val['text'])
print(classification_report(val_labels, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.95      0.86        19
           1       0.75      0.55      0.63        11
           2       0.74      0.74      0.74        19
           3       0.62      0.62      0.62         8
           4       1.00      0.53      0.70        15
           5       0.79      0.85      0.81        13
           6       0.77      0.53      0.62        19
           7       0.82      0.95      0.88        19
           8       0.88      0.79      0.83        19
           9       0.94      0.89      0.92        19
          10       0.56      0.62      0.59         8
          11       0.82      0.74      0.78        19
          12       1.00      0.88      0.93         8
          13       0.94      0.89      0.92        19
          14       0.94      0.84      0.89        19
          15       0.90      0.95      0.92        19
          16       1.00      0.89      0.94        19
          17       0.90    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Word2Vec + Dense Layer

In [13]:
sentences = [text.split() for text in train['text']]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [15]:
def sentence_to_avg_vector(text, model):
    words = text.split()
    vectors = []

    for w in words:
        if w in model.wv:
            vectors.append(model.wv[w])

    if len(vectors) == 0:
        return np.zeros(model.vector_size)

    return np.mean(vectors, axis=0)

In [17]:
def convert_df_to_vectors(df, text_column, model):
    X = np.array([sentence_to_avg_vector(text, model)
                  for text in df[text_column]])
    return X

In [19]:
X_train_avg = convert_df_to_vectors(train, "text", w2v_model)
X_val_avg   = convert_df_to_vectors(val, "text", w2v_model)

In [40]:
model = Sequential([
    Input((w2v_model.vector_size,)),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

history = model.fit(X_train_avg, train_labels, epochs=20, batch_size=32, validation_data=(X_val_avg, val_labels))

Epoch 1/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0218 - loss: 4.1468 - val_accuracy: 0.0530 - val_loss: 4.0165
Epoch 2/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0582 - loss: 3.8900 - val_accuracy: 0.0883 - val_loss: 3.5793
Epoch 3/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1012 - loss: 3.5280 - val_accuracy: 0.1059 - val_loss: 3.4674
Epoch 4/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.1228 - loss: 3.3686 - val_accuracy: 0.1375 - val_loss: 3.3846
Epoch 5/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.1436 - loss: 3.2780 - val_accuracy: 0.1468 - val_loss: 3.2138
Epoch 6/20
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.1559 - loss: 3.1793 - val_accuracy: 0.1812 - val_loss: 3.1184
Epoch 7/20
[1m280/280[0m 

In [41]:
y_pred = np.argmax(model.predict(X_val_avg), axis=1)
print(classification_report(val_labels, y_pred))

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
              precision    recall  f1-score   support

           0       0.29      0.63      0.40        19
           1       0.62      0.45      0.53        11
           2       0.48      0.84      0.62        19
           3       0.29      0.25      0.27         8
           4       0.12      0.07      0.09        15
           5       0.14      0.08      0.10        13
           6       0.00      0.00      0.00        19
           7       0.34      0.58      0.43        19
           8       0.25      0.21      0.23        19
           9       0.00      0.00      0.00        19
          10       0.00      0.00      0.00         8
          11       0.18      0.58      0.28        19
          12       0.40      0.25      0.31         8
          13       0.07      0.05      0.06        19
          14       0.12      0.05      0.07        19
          15       0.38      0.16      0.22        19
        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
y_pred.shape

(1076, 64)