In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf
import spacy

In [20]:
# Read the labelled sentences (the Text and Category columns are used below)
# and preview the first rows.
df = pd.read_csv(filepath_or_buffer="Training_data.csv")
df.head(n=5)

Unnamed: 0,Text,Category
0,"Hi, How are you?",Conversation
1,Open the Calculator,Automation
2,What is Computer Science?,Query
3,"Hello, How is it going",Conversation
4,Is it going to rain today in Mumabai?,Query


In [21]:
# Class balance: number of rows per category.
df.loc[:, 'Category'].value_counts()

Category
Query           84
Automation      78
Conversation    69
Name: count, dtype: int64

In [22]:
# Encode the string categories as integer class ids for the classifier.
category_to_id = {
    'Conversation': 0,
    'Automation': 1,
    'Query': 2,
}

# Guard against re-running this cell: once the column already holds the ids,
# mapping it again would turn every value into NaN (the ints are not dict keys).
if not df['Category'].isin(list(category_to_id.values())).all():
    encoded = df['Category'].map(category_to_id)

    # Fail fast on labels outside the mapping instead of silently producing NaN
    # targets that only blow up later during the split or training.
    if encoded.isna().any():
        unknown = sorted(df.loc[encoded.isna(), 'Category'].astype(str).unique())
        raise ValueError(f"Unexpected Category values: {unknown}")

    df['Category'] = encoded.astype('int64')

In [23]:
# Preview the frame again after the Category column was encoded.
df.head(n=5)

Unnamed: 0,Text,Category
0,"Hi, How are you?",0
1,Open the Calculator,1
2,What is Computer Science?,2
3,"Hello, How is it going",0
4,Is it going to rain today in Mumabai?,2


In [24]:
# Hold out 20% of the rows for testing; stratify on the label so each split
# keeps the same class proportions, and fix the seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    df['Text'],
    df['Category'],
    test_size=0.2,
    random_state=42,
    stratify=df['Category'],
)

In [None]:
# Tokenize the train and test data with the BERT tokenizer.
# The tokenizer is loaded from the same checkpoint name as the model below.

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_texts(texts, tokenizer, max_len = 128):
    """Tokenize one sentence or a collection of sentences into fixed-length TF tensors.

    Args:
        texts: A single string, or any iterable of strings (list, tuple,
            pandas Series, ...).
        tokenizer: Callable tokenizer that accepts a list of strings plus the
            keyword arguments used below.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the batch, requested as TensorFlow
        tensors via ``return_tensors='tf'``.
    """
    # A bare string is itself iterable: list("hi") would yield ['h', 'i'] and
    # every character would be tokenized as a separate sample. Wrap it instead.
    batch = [texts] if isinstance(texts, str) else list(texts)

    return tokenizer(
        batch,
        max_length = max_len,
        padding = 'max_length',
        truncation = True,
        return_tensors = 'tf'
    )
# Encode both splits with the same tokenizer settings (train first, then test).
train_encodings, test_encodings = (
    tokenize_texts(split_texts, tokenizer) for split_texts in (X_train, X_test)
)

In [None]:
# TensorFlow datasets: pair each encoded example with its integer label,
# then group the pairs into batches of 16.

train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), y_train.values)
)
train_dataset = train_dataset.batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(test_encodings), y_test.values)
)
test_dataset = test_dataset.batch(16)

In [None]:
# Load the pretrained BERT checkpoint with a 3-label sequence-classification head.
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
)

# Compile the model. The loss is configured with from_logits=True, i.e. it is
# fed raw logits rather than probabilities.
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=5e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Train for 3 epochs; the held-out split is passed as validation data so its
# metrics are reported after every epoch.

history = model.fit(train_dataset, validation_data=test_dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [33]:
# Aggregate loss / accuracy on the held-out split.
loss, accuracy = model.evaluate(test_dataset)
print(f"Test Accuracy:{accuracy}")

# Per-class report: take element 0 of the prediction output, pick the
# highest-scoring class per example, and compare with the true labels.
test_outputs = model.predict(test_dataset)
y_pred = tf.argmax(test_outputs[0], axis=-1)
print(classification_report(y_test, y_pred.numpy()))

Test Accuracy:0.978723406791687
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       1.00      1.00      1.00        16
           2       0.94      1.00      0.97        17

    accuracy                           0.98        47
   macro avg       0.98      0.98      0.98        47
weighted avg       0.98      0.98      0.98        47



In [34]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
# so they can be reloaded together.
model.save_pretrained(save_directory="bert_classification_model")
tokenizer.save_pretrained(save_directory="bert_classification_model")

('bert_classification_model/tokenizer_config.json',
 'bert_classification_model/special_tokens_map.json',
 'bert_classification_model/vocab.txt',
 'bert_classification_model/added_tokens.json')

In [39]:
# Reload the saved model and tokenizer for reuse.
# Note: this rebinds the `model` and `tokenizer` names to the copies read from disk.

from transformers import BertTokenizer, TFBertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained("bert_classification_model")
model = TFBertForSequenceClassification.from_pretrained("bert_classification_model")

Some layers from the model checkpoint at bert_classification_model were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at bert_classification_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [None]:
#classifying the user input

def classify_user_input(input_sentence, model, tokenizer, label_mapping, max_len=128):
    """Classify a single sentence and return the name of its predicted category.

    Args:
        input_sentence: The text to classify.
        model: Object exposing ``predict(inputs, verbose=...)`` whose result
            holds the class logits at index 0.
        tokenizer: Callable tokenizer accepting the keyword arguments used below.
        label_mapping: Dict of category name -> integer class id.
        max_len: Length the sequence is padded or truncated to (default 128,
            the value that was previously hard-coded).

    Returns:
        The key of ``label_mapping`` whose value equals the predicted class id.

    Raises:
        KeyError: If the predicted class id is not a value of ``label_mapping``.
    """
    encoding = tokenizer(
        input_sentence,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors="tf"
    )

    # verbose=0 silences the progress bar that predict() would otherwise print
    # on every call, which floods the output of an interactive loop.
    logits = model.predict(dict(encoding), verbose=0)[0]

    # Highest-scoring class of the first (only) row in the batch; np.argmax is
    # used directly on the logits, so no tensor round trip is needed.
    predicted_label = int(np.argmax(logits, axis=-1)[0])

    # Invert name -> id into id -> name to look up the category.
    label_names = {label_id: name for name, label_id in label_mapping.items()}
    return label_names[predicted_label]

In [None]:
# Category name -> class id; must match the encoding used to train the model.
label_mapping = {"Conversation" : 0, "Automation" : 1, "Query" : 2}

# Interactive loop: classify sentences until the user types 'exit'.
while True:
    try:
        user_input = input("Enter your sentence (or type 'exit' to quit):")
    except (EOFError, KeyboardInterrupt):
        # stdin was closed or the kernel was interrupted: leave the loop
        # cleanly instead of ending the cell with a traceback.
        print("Exited")
        break

    # Ignore surrounding whitespace so ' exit ' still quits, and skip blank
    # lines rather than asking the model to classify an empty sentence.
    user_input = user_input.strip()
    if not user_input:
        continue
    if user_input.lower() == 'exit':
        print("Exited")
        break

    category = classify_user_input(user_input, model, tokenizer, label_mapping)
    print(f"The sentence is classified as : {category}")

The sentence is classified as : Conversation
The sentence is classified as : Automation
The sentence is classified as : Query
The sentence is classified as : Conversation
The sentence is classified as : Query
The sentence is classified as : Automation
The sentence is classified as : Automation
The sentence is classified as : Conversation
The sentence is classified as : Conversation
The sentence is classified as : Query
The sentence is classified as : Query
The sentence is classified as : Conversation
The sentence is classified as : Query
The sentence is classified as : Query
Exited
