### Importing the dataset

In [1]:
import pandas as pd
data=pd.read_csv('sofmattress_train.csv')
data.head()

Unnamed: 0,sentence,label
0,You guys provide EMI option?,EMI
1,Do you offer Zero Percent EMI payment options?,EMI
2,0% EMI.,EMI
3,EMI,EMI
4,I want in installment,EMI


### What are the classification labels

In [2]:
data['label'].unique()

array(['EMI', 'COD', 'ORTHO_FEATURES', 'ERGO_FEATURES', 'COMPARISON',
       'WARRANTY', '100_NIGHT_TRIAL_OFFER', 'SIZE_CUSTOMIZATION',
       'WHAT_SIZE_TO_ORDER', 'LEAD_GEN', 'CHECK_PINCODE', 'DISTRIBUTORS',
       'MATTRESS_COST', 'PRODUCT_VARIANTS', 'ABOUT_SOF_MATTRESS',
       'DELAY_IN_DELIVERY', 'ORDER_STATUS', 'RETURN_EXCHANGE',
       'CANCEL_ORDER', 'PILLOWS', 'OFFERS'], dtype=object)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328 entries, 0 to 327
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  328 non-null    object
 1   label     328 non-null    object
dtypes: object(2)
memory usage: 5.2+ KB


#### No null values

### Initial approach was to use a pre-trained transformer like BERT but since there are only ~300 rows, it can lead to overfitting and so going with traditional ML models

### Preprocessing + Tokenization + Stop words removal

In [4]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
nltk.download('stopwords')
nltk.download('punkt')

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

data['cleaned_sentence'] = data['sentence'].apply(preprocess_text)
data[['sentence', 'cleaned_sentence']].head()

[nltk_data] Downloading package stopwords to C:\Users\Shruthi
[nltk_data]     Mohan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Shruthi
[nltk_data]     Mohan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,sentence,cleaned_sentence
0,You guys provide EMI option?,guys provide emi option
1,Do you offer Zero Percent EMI payment options?,offer zero percent emi payment options
2,0% EMI.,emi
3,EMI,emi
4,I want in installment,want installment


### Label Encoding to use nerual network

In [5]:
X = data['cleaned_sentence']
y = data['label']

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

### Splitting to training and testing data

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y)
# 0.2 test split

print("Training set size:", len(X_train))
print("Test set size:", len(X_test))

Training set size: 262
Test set size: 66


### Text Embedding using the BERT (gives the best contextual embeddings)

In [7]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np 

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

bert_model.eval()

def get_bert_embeddings(text_list, model, tokenizer, max_length=128):
    embeddings = []
    
    for text in text_list:
        inputs = tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=max_length
        )
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
        embeddings.append(cls_embedding)
    
    return np.array(embeddings)

#### Generate BERT embeddings for train and test data

In [8]:
# Training data embedding
X_train_embeddings = get_bert_embeddings(X_train.tolist(), bert_model, tokenizer)

#Test data embedding
X_test_embeddings = get_bert_embeddings(X_test.tolist(), bert_model, tokenizer)

### Training using Neural Network

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Convert labels to one-hot encoding for Keras
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

# Adding dropout for regularization
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_embeddings.shape[1],)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(np.unique(y_encoded)), activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               196864    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 21)                2709      
                                                                 
Total params: 232,469
Trainable params: 232,469
Non-trainable params: 0
_________________________________________________________________


In [10]:
history = model.fit(
    X_train_embeddings, y_train_categorical,
    validation_data=(X_test_embeddings, y_test_categorical),
    epochs=20,
    batch_size=32,
    verbose=1
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Evaluation

In [11]:
# Evaluate performance
test_loss, test_accuracy = model.evaluate(X_test_embeddings, y_test_categorical)
print(f"Test Accuracy: {test_accuracy:.2f}")

Test Accuracy: 0.65


### Predicting using test data

In [12]:
y_pred = model.predict(X_test_embeddings)
y_pred_labels = y_pred.argmax(axis=1)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_labels, target_names=label_encoder.classes_))

                       precision    recall  f1-score   support

100_NIGHT_TRIAL_OFFER       0.43      0.75      0.55         4
   ABOUT_SOF_MATTRESS       0.50      0.50      0.50         2
         CANCEL_ORDER       1.00      1.00      1.00         2
        CHECK_PINCODE       1.00      0.50      0.67         2
                  COD       1.00      1.00      1.00         2
           COMPARISON       0.67      1.00      0.80         2
    DELAY_IN_DELIVERY       1.00      0.50      0.67         2
         DISTRIBUTORS       1.00      1.00      1.00         7
                  EMI       0.50      0.20      0.29         5
        ERGO_FEATURES       0.00      0.00      0.00         2
             LEAD_GEN       0.25      0.50      0.33         4
        MATTRESS_COST       0.62      1.00      0.77         5
               OFFERS       1.00      0.50      0.67         2
         ORDER_STATUS       1.00      0.50      0.67         4
       ORTHO_FEATURES       1.00      0.75      0.86  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Calculating the class weights

In [13]:
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_encoded), y=y_encoded)
class_weights_dict = dict(zip(np.unique(y_encoded), class_weights))

### Fitting and Evaluating the model with class weights

In [15]:
model.fit(
    X_train_embeddings,
    y_train_categorical,
    validation_data=(X_test_embeddings, y_test_categorical),
    epochs=10,
    batch_size=32,
    class_weight=class_weights_dict
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x25b1c83cee0>

In [16]:
# Evaluate performance
test_loss, test_accuracy = model.evaluate(X_test_embeddings, y_test_categorical)
print(f"Test Accuracy: {test_accuracy:.2f}")

Test Accuracy: 0.68


### Since there isnt much difference in the accuracy using class weights, we can ignore them

### Hyper Parameter Tuning using GridSearchCV

In [14]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

# Function to build model for tuning
def build_model(optimizer='adam', dropout_rate=0.3, dense_units=256):
    model = Sequential([
        Dense(dense_units, activation='relu', input_shape=(X_train_embeddings.shape[1],)),
        Dropout(dropout_rate),
        Dense(dense_units // 2, activation='relu'),
        Dropout(dropout_rate),
        Dense(len(np.unique(y_encoded)), activation='softmax')
    ])
    
    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model

model_for_tuning = KerasClassifier(build_fn=build_model, epochs=20, batch_size=32, verbose=0)
param_grid = {
    'optimizer': ['adam', 'rmsprop'],
    'dropout_rate': [0.3, 0.5],
    'dense_units': [256, 512],
    'batch_size': [32, 64],
    'epochs': [10, 20]
}

grid_search = GridSearchCV(estimator=model_for_tuning, param_grid=param_grid, cv=3, verbose=1)
grid_search_result = grid_search.fit(X_train_embeddings, y_train_categorical)

  model_for_tuning = KerasClassifier(build_fn=build_model, epochs=20, batch_size=32, verbose=0)


Fitting 3 folds for each of 32 candidates, totalling 96 fits


In [15]:
print(f"Best Hyperparameters: {grid_search_result.best_params_}")
best_model = grid_search_result.best_estimator_
best_model.score(X_test_embeddings, y_test_categorical)

Best Hyperparameters: {'batch_size': 32, 'dense_units': 512, 'dropout_rate': 0.3, 'epochs': 20, 'optimizer': 'adam'}


0.6969696879386902

### Tweaking parameters

In [16]:
param_grid = {
    'dropout_rate': [0.2, 0.3, 0.4],
    'dense_units': [512, 1024],
    'batch_size': [16, 32],
    'epochs': [20, 30]
}

grid_search = GridSearchCV(estimator=model_for_tuning, param_grid=param_grid, cv=3, verbose=1)
grid_search_result = grid_search.fit(X_train_embeddings, y_train_categorical)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [17]:
print(f"Best Hyperparameters: {grid_search_result.best_params_}")
best_model = grid_search_result.best_estimator_
best_model.score(X_test_embeddings, y_test_categorical)

Best Hyperparameters: {'batch_size': 16, 'dense_units': 512, 'dropout_rate': 0.4, 'epochs': 30}


0.6969696879386902

ACCURACY ~ 70%