In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
[0mCollecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.w

In [None]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    pipeline,
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# Load the labelled queries; the code below reads the "Query" and
# "Classification" columns of this CSV.
file_path = "/content/educational_dataset+non.csv"
data = pd.read_csv(file_path)

# Drop rows with any missing field, then trim whitespace around each query.
data = data.dropna()
data['Query'] = data['Query'].str.strip()

# Encode the string labels as integer class ids.
label_mapping = {"educational": 1, "noneducational": 0}
data["Classification"] = data["Classification"].map(label_mapping)

# `.map` yields NaN for every label that is missing from `label_mapping`
# (a typo, different casing, ...). Because the dropna() above ran before the
# mapping, such rows would otherwise reach training as float/NaN labels, so
# remove them explicitly and keep the label column integral.
unmapped_rows = data["Classification"].isna()
if unmapped_rows.any():
    print(f"Dropping {int(unmapped_rows.sum())} rows with unrecognised labels")
    data = data.loc[~unmapped_rows].copy()
data["Classification"] = data["Classification"].astype(int)


print(f"Class distribution:\n{data['Classification'].value_counts()}")


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)


# Pretrained DistilBERT tokenizer and a two-class sequence-classification model.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)


def preprocess_data(examples):
    """Tokenize a batch of queries and attach their class labels.

    Written for ``Dataset.map(..., batched=True)``. Reads the module-level
    ``tokenizer``, which must be defined before the map call runs.

    Args:
        examples: Batch mapping with a "Query" column (the texts) and a
            "Classification" column (the labels).

    Returns:
        The tokenizer output for the batch with an added "labels" entry.
    """
    # Truncate to 128 tokens but do not pad here: the DataCollatorWithPadding
    # given to the Trainer pads each batch to its own longest sequence, so
    # padding every row to max_length up front only adds wasted computation.
    inputs = tokenizer(examples["Query"], truncation=True, max_length=128)
    inputs["labels"] = examples["Classification"]
    return inputs

# Tokenize both splits with the batched preprocessing function.
train_dataset = train_dataset.map(preprocess_data, batched=True)
test_dataset = test_dataset.map(preprocess_data, batched=True)

# The raw text and the index column carried over from pandas are not needed
# once the token ids exist; expose what remains as torch tensors.
columns_to_drop = ["Query", "__index_level_0__"]
train_dataset = train_dataset.remove_columns(columns_to_drop)
test_dataset = test_dataset.remove_columns(columns_to_drop)
for split in (train_dataset, test_dataset):
    split.set_format("torch")

# Collator that pads the examples of a batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluate and checkpoint once per epoch, keep at most two checkpoints, and
# reload the one with the highest accuracy when training ends.
# NOTE(review): `report_to` is left at its default; the recorded output of this
# cell shows W&B asking for an API key before training starts.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)


def compute_metrics(pred):
    """Score one evaluation pass for the Trainer.

    Args:
        pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1"; the last three
        are computed with ``average="binary"``.
    """
    raw_scores, gold = pred
    # The predicted class is the index of the largest logit in each row.
    guessed = torch.tensor(raw_scores).argmax(dim=-1).numpy()
    prec, rec, f_score, _support = precision_recall_fscore_support(
        gold, guessed, average="binary"
    )
    return {
        "accuracy": accuracy_score(gold, guessed),
        "precision": prec,
        "recall": rec,
        "f1": f_score,
    }


# Fine-tune the classifier; the held-out split doubles as the per-epoch
# evaluation set.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

# Write the model weights and tokenizer files side by side so the directory
# can be loaded again as a single unit.
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

# Reload the saved artefacts as an inference pipeline that reports a score
# for every class rather than only the top one.
classifier = pipeline(
    "text-classification",
    model="./fine_tuned_model",
    tokenizer="./fine_tuned_model",
    return_all_scores=True,
)

# Interactive loop: classify queries until the user types "exit".
while True:
    user_input = input("Enter a query to classify (or type 'exit' to stop): ")
    if user_input.lower() == "exit":
        break
    result = classifier(user_input)
    # result[0] holds one entry per class: index 0 is class id 0
    # (noneducational), index 1 is class id 1 (educational).
    non_educational_score = result[0][0]["score"]
    educational_score = result[0][1]["score"]
    if not educational_score > non_educational_score:
        print(f"The query is classified as: Non-Educational (Score: {non_educational_score:.2f})")
    else:
        print(f"The query is classified as: Educational (Score: {educational_score:.2f})")


Class distribution:
Classification
1    9893
0    6874
Name: count, dtype: int64


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13413 [00:00<?, ? examples/s]

Map:   0%|          | 0/3354 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss


In [None]:
!zip -r /content/fine_tuned_model.zip /content/fine_tuned_model



  adding: content/fine_tuned_model/ (stored 0%)
  adding: content/fine_tuned_model/tokenizer_config.json (deflated 76%)
  adding: content/fine_tuned_model/config.json (deflated 46%)
  adding: content/fine_tuned_model/special_tokens_map.json (deflated 42%)
  adding: content/fine_tuned_model/model.safetensors (deflated 8%)
  adding: content/fine_tuned_model/vocab.txt (deflated 53%)
  adding: content/fine_tuned_model/tokenizer.json (deflated 71%)


In [None]:
# Send the zipped model from the Colab VM to the local machine via the browser.
from google.colab import files

archive_path = '/content/fine_tuned_model.zip'
files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Only look at this one (train, save, and zip the model)

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import shutil


# Load the labelled queries; the code below reads the "Query" and
# "Classification" columns of this CSV.
file_path = "/content/educational_dataset+non.csv"
data = pd.read_csv(file_path)


# Drop rows with any missing field, then trim whitespace around each query.
data = data.dropna()
data['Query'] = data['Query'].str.strip()


# Encode the string labels as integer class ids.
label_mapping = {"educational": 1, "noneducational": 0}
data["Classification"] = data["Classification"].map(label_mapping)

# Labels absent from `label_mapping` come out of `.map` as NaN, and the
# dropna() above has already run, so they would otherwise flow into training
# as float/NaN labels. Remove those rows and keep the column integral.
unmapped_rows = data["Classification"].isna()
if unmapped_rows.any():
    print(f"Dropping {int(unmapped_rows.sum())} rows with unrecognised labels")
    data = data.loc[~unmapped_rows].copy()
data["Classification"] = data["Classification"].astype(int)


print(f"Class distribution:\n{data['Classification'].value_counts()}")


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)


# Pretrained DistilBERT tokenizer and a two-class sequence-classification model.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)


def preprocess_data(examples):
    """Tokenize a batch of queries and attach their class labels.

    Meant to be passed to ``Dataset.map(..., batched=True)``. Uses the
    module-level ``tokenizer``, which has to exist before the map call runs.

    Args:
        examples: Batch mapping with a "Query" column (the texts) and a
            "Classification" column (the labels).

    Returns:
        The tokenizer output for the batch with an added "labels" entry.
    """
    # Only truncate here. Padding is left to the DataCollatorWithPadding used
    # by the Trainer, which pads per batch instead of forcing every row to
    # the full 128-token length.
    inputs = tokenizer(examples["Query"], truncation=True, max_length=128)
    inputs["labels"] = examples["Classification"]
    return inputs

# Run the batched tokenization over both splits.
train_dataset = train_dataset.map(preprocess_data, batched=True)
test_dataset = test_dataset.map(preprocess_data, batched=True)


# Remove the raw text and the pandas index column, then serve the remaining
# columns as torch tensors.
columns_to_drop = ["Query", "__index_level_0__"]
train_dataset = train_dataset.remove_columns(columns_to_drop)
test_dataset = test_dataset.remove_columns(columns_to_drop)
for split in (train_dataset, test_dataset):
    split.set_format("torch")


# Collator that pads the examples of a batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Per-epoch evaluation and checkpointing; at most two checkpoints are kept and
# the most accurate one is reloaded once training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

def compute_metrics(pred):
    """Turn the Trainer's evaluation output into a metrics dict.

    Args:
        pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1"; the last three
        are computed with ``average="binary"``.
    """
    class_logits, true_labels = pred
    # Highest logit per row decides the predicted class id.
    predicted_labels = torch.tensor(class_logits).argmax(dim=-1).numpy()
    scores = {"accuracy": accuracy_score(true_labels, predicted_labels)}
    prf = precision_recall_fscore_support(
        true_labels, predicted_labels, average="binary"
    )
    scores["precision"], scores["recall"], scores["f1"] = prf[0], prf[1], prf[2]
    return scores


# Fine-tune the classifier; the held-out split doubles as the per-epoch
# evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


trainer.train()


# Write the model weights and tokenizer files into the same directory.
model_dir = "./fine_tuned_model"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# `shutil` is already imported at the top of this cell, so it is not imported
# a second time here.
zip_filename = "/content/fine_tuned_model.zip"
# make_archive appends ".zip" itself, so it needs the path without the
# extension. Strip only a trailing ".zip": str.replace would also remove any
# ".zip" that happened to appear earlier in the path.
if zip_filename.endswith(".zip"):
    archive_base = zip_filename[: -len(".zip")]
else:
    archive_base = zip_filename
# Report the path make_archive actually wrote, not the one we intended.
zip_filename = shutil.make_archive(archive_base, 'zip', model_dir)

print(f"Model saved as {zip_filename}")


Class distribution:
Classification
1    9893
0    6874
Name: count, dtype: int64


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13413 [00:00<?, ? examples/s]

Map:   0%|          | 0/3354 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0021,0.003707,0.999404,0.998965,1.0,0.999482
2,0.0001,0.002853,0.999404,0.998965,1.0,0.999482
3,0.0001,0.002583,0.999404,0.998965,1.0,0.999482


Model saved as /content/fine_tuned_model.zip


In [None]:
# Trigger a browser download of the model archive stored on the Colab VM.
from google.colab import files

archive_path = '/content/fine_tuned_model.zip'
files.download(archive_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
    DataCollatorWithPadding,
)
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd
import torch


# Load the labelled queries; the code below reads the "Query" and
# "Classification" columns of this CSV.
file_path = "/content/educational_dataset+non.csv"

data = pd.read_csv(file_path)


# Drop rows with any missing field, trim the query text and encode the string
# labels as integer class ids.
data = data.dropna()
data['Query'] = data['Query'].str.strip()
label_mapping = {"educational": 1, "noneducational": 0}
data["Classification"] = data["Classification"].map(label_mapping)

# Labels absent from `label_mapping` come out of `.map` as NaN, and the
# dropna() above has already run, so they would otherwise flow into training
# as float/NaN labels. Remove those rows and keep the column integral.
unmapped_rows = data["Classification"].isna()
if unmapped_rows.any():
    print(f"Dropping {int(unmapped_rows.sum())} rows with unrecognised labels")
    data = data.loc[~unmapped_rows].copy()
data["Classification"] = data["Classification"].astype(int)


# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)


# Checkpoints to fine-tune and compare, trained one after another.
model_names = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base"
]


def preprocess_data(examples):
    """Tokenize a batch of queries and attach their class labels.

    Meant to be passed to ``Dataset.map(..., batched=True)``. Uses whatever
    the module-level ``tokenizer`` is bound to when the map call runs, so the
    training loop must assign it before mapping.

    Args:
        examples: Batch mapping with a "Query" column (the texts) and a
            "Classification" column (the labels).

    Returns:
        The tokenizer output for the batch with an added "labels" entry.
    """
    # Only truncate here. Padding is left to the DataCollatorWithPadding used
    # by the Trainer, which pads per batch instead of forcing every row to
    # the full 128-token length.
    inputs = tokenizer(examples["Query"], truncation=True, max_length=128)
    inputs["labels"] = examples["Classification"]
    return inputs


def compute_metrics(pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Dict with "accuracy", "precision", "recall" and "f1"; the last three
        are computed with ``average="binary"``.
    """
    model_logits, reference = pred
    # Highest logit per row decides the predicted class id.
    chosen = torch.tensor(model_logits).argmax(dim=-1).numpy()
    precision, recall, f1 = precision_recall_fscore_support(
        reference, chosen, average="binary"
    )[:3]
    return {
        "accuracy": accuracy_score(reference, chosen),
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


# One entry per trained model: its name, evaluation metrics and text report.
results = []


for model_name in model_names:
    print(f"\nTraining and Evaluating Model: {model_name}")

    # `preprocess_data` reads the module-level `tokenizer`, so it has to be
    # rebound to this model's tokenizer before the datasets are mapped.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Tokenize into per-model variables and leave `train_dataset` /
    # `test_dataset` untouched. Overwriting them would remove the "Query"
    # column after the first model, so the next iteration's map call would
    # fail and no later model could be tokenized with its own vocabulary.
    tokenized_train = train_dataset.map(preprocess_data, batched=True)
    tokenized_test = test_dataset.map(preprocess_data, batched=True)

    # Drop the raw text and the pandas index column; expose the rest as tensors.
    tokenized_train = tokenized_train.remove_columns(["Query", "__index_level_0__"])
    tokenized_test = tokenized_test.remove_columns(["Query", "__index_level_0__"])
    tokenized_train.set_format("torch")
    tokenized_test.set_format("torch")

    # Collator that pads the examples of a batch to a common length.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Per-model output and log directories keep checkpoints from different
    # models apart. Evaluation and checkpointing happen once per epoch and
    # the most accurate checkpoint is reloaded at the end.
    # NOTE(review): `report_to` is left at its default; the recorded output of
    # this cell shows W&B asking for an API key before training starts.
    training_args = TrainingArguments(
        output_dir=f"./results/{model_name}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        learning_rate=5e-5,
        logging_dir=f"./logs/{model_name}",
        logging_steps=10,
        save_total_limit=2,
        metric_for_best_model="accuracy",
        greater_is_better=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()

    # Aggregate metrics on the held-out split.
    metrics = trainer.evaluate()
    print(f"Evaluation metrics for {model_name}: {metrics}")

    # Per-class report built from the raw predictions on the same split.
    predictions = trainer.predict(tokenized_test)
    logits = predictions.predictions
    preds = torch.argmax(torch.tensor(logits), dim=-1).numpy()
    labels = predictions.label_ids

    # target_names follow class-id order: 0 = noneducational, 1 = educational.
    report = classification_report(labels, preds, target_names=["noneducational", "educational"])
    print(report)

    results.append({
        "model_name": model_name,
        "metrics": metrics,
        "classification_report": report
    })

    # Interactive check of the freshly trained model. The next model is not
    # trained until the user types "exit".
    print(f"\nTesting the model: {model_name}")
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)

    while True:
        user_input = input("\nEnter a query to classify (or type 'exit' to stop): ")
        if user_input.lower() == "exit":
            break
        result = classifier(user_input)
        # result[0] holds one entry per class: index 0 is class id 0
        # (noneducational), index 1 is class id 1 (educational).
        educational_score = result[0][1]["score"]
        non_educational_score = result[0][0]["score"]

        if educational_score > non_educational_score:
            print(f"The query is classified as: Educational (Score: {educational_score:.2f})")
        else:
            print(f"The query is classified as: Non-Educational (Score: {non_educational_score:.2f})")



Training and Evaluating Model: distilbert-base-uncased


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/13413 [00:00<?, ? examples/s]

Map:   0%|          | 0/3354 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0008,0.003232,0.999404,0.998965,1.0,0.999482
