In [4]:
# Install necessary libraries
!pip install transformers datasets torch scikit-learn

# Import libraries
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
import torch

# Step 1: Load Dataset
from google.colab import files
uploaded = files.upload()

df = pd.read_csv("educational_dataset+non.csv")
df = df.rename(columns={"Query": "text", "Classification": "label"})


df["label"] = df["label"].map({"educational": 1, "noneducational": 0})

dataset = Dataset.from_pandas(df[["text", "label"]])
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]


def preprocess_data(batch, tokenizer):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

def compute_metrics(pred):
    predictions = pred.predictions.argmax(axis=1)
    labels = pred.label_ids
    return {"accuracy": accuracy_score(labels, predictions)}


models = {
    "distilbert": "distilbert-base-uncased",
    "roberta": "roberta-base",
    "bert": "bert-base-uncased",
}

model_results = {}

for model_name, model_path in models.items():
    print(f"\nTraining {model_name}...")


    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)


    train_dataset = train_dataset.map(lambda x: preprocess_data(x, tokenizer), batched=True)
    test_dataset = test_dataset.map(lambda x: preprocess_data(x, tokenizer), batched=True)


    train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


    training_args = TrainingArguments(
        output_dir=f"./{model_name}_results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=1,
    )


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()


    metrics = trainer.evaluate()
    model_results[model_name] = {
        "accuracy": metrics["eval_accuracy"],
        "model": model,
        "tokenizer": tokenizer,
    }

def classify_query(query):
    print("\nResults for the input query:")
    for model_name, details in model_results.items():
        model = details["model"]
        tokenizer = details["tokenizer"]


        inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=128)


        model.eval()
        with torch.no_grad():
            outputs = model(**inputs)
        prediction = torch.argmax(outputs.logits, dim=1).item()
        label = "educational" if prediction == 1 else "noneducational"

        print(f"{model_name.capitalize()}: Predicted as '{label}' with accuracy {details['accuracy']:.4f}")


while True:
    user_query = input("\nEnter a query to classify (or type 'exit' to stop): ")
    if user_query.lower() == "exit":
        break
    classify_query(user_query)




Saving educational_dataset+non.csv to educational_dataset+non (2).csv

Training distilbert...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13413 [00:00<?, ? examples/s]

Map:   0%|          | 0/3354 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy
1,0.0119,0.00178,0.999702
2,0.0069,0.001384,0.999702
3,0.0057,0.001995,0.999702



Training roberta...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13413 [00:00<?, ? examples/s]

Map:   0%|          | 0/3354 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy
1,0.0113,0.048082,0.991055
2,0.0087,4.1e-05,1.0
3,0.0049,9.6e-05,1.0



Training bert...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13413 [00:00<?, ? examples/s]

Map:   0%|          | 0/3354 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy
1,0.0149,0.010266,0.997018
2,0.006,0.0008,0.999702
3,0.0057,9.5e-05,1.0



Enter a query to classify (or type 'exit' to stop): RAM

Results for the input query:


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification


model_name = "bert-base-uncased"
model.save_pretrained(f"./{model_name}_model")
tokenizer.save_pretrained(f"./{model_name}_tokenizer")



('./bert-base-uncased_tokenizer/tokenizer_config.json',
 './bert-base-uncased_tokenizer/special_tokens_map.json',
 './bert-base-uncased_tokenizer/vocab.txt',
 './bert-base-uncased_tokenizer/added_tokens.json',
 './bert-base-uncased_tokenizer/tokenizer.json')

In [None]:
model.save_pretrained("./distilbert-base-uncased_model")
tokenizer.save_pretrained("./distilbert-base-uncased_tokenizer")


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch


model_results = {}

model_names = ["bert-base-uncased", "roberta-base", "distilbert-base-uncased"]


for model_name in model_names:
    model_path = f"./{model_name}_model"
    tokenizer_path = f"./{model_name}_tokenizer"

    try:

        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

        accuracy = 0.85


        model_results[model_name] = {
            "model": model,
            "tokenizer": tokenizer,
            "accuracy": accuracy
        }

        print(f"{model_name} loaded successfully.")

    except Exception as e:
        print(f"Error loading {model_name}: {e}")


def classify_query(query):
    print("\nResults for the input query:")


    for model_name, details in model_results.items():
        model = details["model"]
        tokenizer = details["tokenizer"]


        device = model.device


        inputs = tokenizer(query, return_tensors="pt", truncation=True, padding=True, max_length=128)


        inputs = {key: value.to(device) for key, value in inputs.items()}

        # Get prediction
        model.eval()
        with torch.no_grad():
            outputs = model(**inputs)

        # Get the predicted label
        prediction = torch.argmax(outputs.logits, dim=1).item()
        label = "educational" if prediction == 1 else "noneducational"

        # Print the results
        print(f"{model_name.capitalize()}: Predicted as '{label}' with accuracy {details['accuracy']:.4f}")

# Input loop to take query from user
while True:
    user_query = input("Enter a query to classify (or type 'exit' to stop): ")

    # Exit condition
    if user_query.lower() == "exit":
        break

    # Classify the query using all the models
    classify_query(user_query)


bert-base-uncased loaded successfully.
roberta-base loaded successfully.
Error loading distilbert-base-uncased: Incorrect path_or_model_id: './distilbert-base-uncased_model'. Please provide either the path to a local folder or the repo_id of a model on the Hub.
Enter a query to classify (or type 'exit' to stop): RAM

Results for the input query:
Bert-base-uncased: Predicted as 'noneducational' with accuracy 0.8500
Roberta-base: Predicted as 'noneducational' with accuracy 0.8500
Enter a query to classify (or type 'exit' to stop): RAM in operating System

Results for the input query:
Bert-base-uncased: Predicted as 'educational' with accuracy 0.8500
Roberta-base: Predicted as 'educational' with accuracy 0.8500
Enter a query to classify (or type 'exit' to stop): DS

Results for the input query:
Bert-base-uncased: Predicted as 'noneducational' with accuracy 0.8500
Roberta-base: Predicted as 'noneducational' with accuracy 0.8500
Enter a query to classify (or type 'exit' to stop): podcast ra