### Importing the dataset

In [12]:
import pandas as pd
# Read the training CSV into a DataFrame and preview the first five rows.
data = pd.read_csv('sofmattress_train.csv')
data.head(5)

Unnamed: 0,sentence,label
0,You guys provide EMI option?,EMI
1,Do you offer Zero Percent EMI payment options?,EMI
2,0% EMI.,EMI
3,EMI,EMI
4,I want in installment,EMI


### What are the classification labels?

In [13]:
# Distinct values of the 'label' column, in order of first appearance.
pd.unique(data['label'])

array(['EMI', 'COD', 'ORTHO_FEATURES', 'ERGO_FEATURES', 'COMPARISON',
       'WARRANTY', '100_NIGHT_TRIAL_OFFER', 'SIZE_CUSTOMIZATION',
       'WHAT_SIZE_TO_ORDER', 'LEAD_GEN', 'CHECK_PINCODE', 'DISTRIBUTORS',
       'MATTRESS_COST', 'PRODUCT_VARIANTS', 'ABOUT_SOF_MATTRESS',
       'DELAY_IN_DELIVERY', 'ORDER_STATUS', 'RETURN_EXCHANGE',
       'CANCEL_ORDER', 'PILLOWS', 'OFFERS'], dtype=object)

In [14]:
# Summarise column dtypes and non-null counts (used here to check for missing values).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 328 entries, 0 to 327
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  328 non-null    object
 1   label     328 non-null    object
dtypes: object(2)
memory usage: 5.2+ KB


#### No null values

### The initial plan was to use a pre-trained transformer such as BERT with a deep-learning classifier on top, but with only ~300 rows that is likely to overfit, so traditional ML models are used for classification instead

### Preprocessing + Tokenization + Stop words removal

In [15]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
# Fetch the NLTK resources needed by the preprocessing step:
#   - 'stopwords'          : the English stop-word list
#   - 'punkt', 'punkt_tab' : tokenizer data for word_tokenize. Newer NLTK
#     releases look up 'punkt_tab'; 'punkt' is kept for older releases.
# quiet=True keeps the download log (which includes the local nltk_data
# path) out of the saved notebook output. Already-downloaded packages are
# simply reported as up to date, so re-running this cell is cheap.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Normalise a raw sentence into a space-joined string of content words.

    Steps, in order:
      1. lowercase the text;
      2. drop every character that is not a-z or whitespace (digits and
         punctuation are removed, e.g. "0% EMI." becomes " emi");
      3. tokenise with NLTK's ``word_tokenize``;
      4. remove English stop words;
      5. join the remaining tokens with single spaces.

    Args:
        text: The raw sentence. Non-string input (e.g. ``None`` or a NaN
            from a missing cell) is treated as an empty sentence.

    Returns:
        The cleaned sentence as a ``str`` (possibly empty).
    """
    # Missing values would otherwise crash on .lower().
    if not isinstance(text, str):
        return ''

    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    # The stop-word list is loaded once and reused across calls instead of
    # being re-read from the corpus for every row.
    stop_words = _english_stop_words()
    return ' '.join(
        token for token in word_tokenize(letters_only) if token not in stop_words
    )

# Clean every sentence and store the result next to the original text.
data['cleaned_sentence'] = data['sentence'].map(preprocess_text)

# Side-by-side preview of raw vs. cleaned text.
data.loc[:, ['sentence', 'cleaned_sentence']].head()

[nltk_data] Downloading package stopwords to C:\Users\Shruthi
[nltk_data]     Mohan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Shruthi
[nltk_data]     Mohan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,sentence,cleaned_sentence
0,You guys provide EMI option?,guys provide emi option
1,Do you offer Zero Percent EMI payment options?,offer zero percent emi payment options
2,0% EMI.,emi
3,EMI,emi
4,I want in installment,want installment


### Splitting to training and testing data

In [19]:
from sklearn.model_selection import train_test_split

# Features are the cleaned sentences; targets are the intent labels.
X, y = data['cleaned_sentence'], data['label']

# Hold out 20% of the rows for testing. stratify=y keeps the label
# proportions the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Training set size:", len(X_train))
print("Test set size:", len(X_test))

Training set size: 262
Test set size: 66


### Text embedding using BERT (chosen for its contextual embeddings)

In [22]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np 

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Switch the model to evaluation mode before extracting embeddings.
bert_model.eval()

def get_bert_embeddings(text_list, model, tokenizer, max_length=128):
    """Embed each text as the first-position vector of the model's last hidden state.

    Each text is tokenized on its own (truncated/padded to ``max_length``),
    run through ``model`` without gradient tracking, and represented by
    ``last_hidden_state[0, 0, :]`` -- for a BERT tokenizer this is the
    position of the leading [CLS] token.

    Args:
        text_list: Iterable of strings. A single bare string is treated as
            one text rather than being iterated character by character.
        model: Callable that accepts the tokenizer's output as keyword
            arguments and returns an object with a ``last_hidden_state``
            tensor indexed as (batch, position, hidden).
        tokenizer: Callable returning a mapping of tensors when invoked with
            ``return_tensors='pt'``.
        max_length: Token length each text is truncated/padded to.

    Returns:
        ``np.ndarray`` of shape ``(n_texts, hidden_size)``. For empty input
        the shape is ``(0, hidden_size)``, with ``hidden_size`` read from
        ``model.config.hidden_size`` when available (0 otherwise).
    """
    # Guard against a bare string, which would be embedded per character.
    texts = [text_list] if isinstance(text_list, str) else list(text_list)

    if not texts:
        hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Hugging Face models expose `.device`; when present, inputs are moved
    # there so the function also works if the model lives on a GPU.
    device = getattr(model, 'device', None)

    embeddings = []
    for text in texts:
        inputs = tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=max_length,
        )
        if device is not None:
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Explicit indexing (instead of squeeze) always yields a 1-D vector;
        # .cpu() is required before .numpy() for tensors on other devices.
        cls_embedding = outputs.last_hidden_state[0, 0, :]
        embeddings.append(cls_embedding.detach().cpu().numpy())

    return np.stack(embeddings)

#### Generate BERT embeddings for train and test data

In [23]:
# Embed the training split first, then the test split, with the same
# model and tokenizer so both live in the same feature space.
X_train_embeddings, X_test_embeddings = (
    get_bert_embeddings(split.tolist(), bert_model, tokenizer)
    for split in (X_train, X_test)
)

### Training using Random Forest Model

In [24]:
from sklearn.ensemble import RandomForestClassifier
# 100-tree random forest trained on the BERT embeddings; the fixed
# random_state makes the fitted forest reproducible.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
clf.fit(X=X_train_embeddings, y=y_train)

### Predicting using test data

In [26]:
from sklearn.metrics import accuracy_score

# Predict labels for the held-out embeddings and report the fraction
# of test sentences classified correctly.
y_pred = clf.predict(X_test_embeddings)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.55


### Saving the Random Forest classifier, the BERT tokenizer and the BERT model

In [27]:
import pickle

# Persist the fitted classifier. Pickle files can execute code when
# loaded, so only unpickle this file from a trusted source.
with open('bert_rf_model.pkl', mode='wb') as fh:
    fh.write(pickle.dumps(clf))

# Store the tokenizer and encoder so inference can rebuild the exact
# same embedding pipeline the classifier was trained on.
tokenizer.save_pretrained('./bert_tokenizer')
bert_model.save_pretrained('./bert_model')