In [1]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers datasets torch scikit-learn




In [3]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)




In [4]:
data = {
    "text": [
        "yellow leaves with curling",
        "brown spots on tomato leaf",
        "white powdery substance on leaf",
        "leaf edges turning black",
        "healthy green leaf"
    ],
    "label": [
        "Tomato Yellow Leaf Curl Virus",
        "Tomato Early Blight",
        "Powdery Mildew",
        "Leaf Blight",
        "Healthy"
    ]
}

df = pd.DataFrame(data)
df


Unnamed: 0,text,label
0,yellow leaves with curling,Tomato Yellow Leaf Curl Virus
1,brown spots on tomato leaf,Tomato Early Blight
2,white powdery substance on leaf,Powdery Mildew
3,leaf edges turning black,Leaf Blight
4,healthy green leaf,Healthy


In [5]:
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["label"])

num_labels = len(label_encoder.classes_)
print("Classes:", label_encoder.classes_)


Classes: ['Healthy' 'Leaf Blight' 'Powdery Mildew' 'Tomato Early Blight'
 'Tomato Yellow Leaf Curl Virus']


In [6]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"].tolist(),
    df["label_encoded"].tolist(),
    test_size=0.2,
    random_state=42
)


In [7]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
train_encodings = tokenizer(
    train_texts,
    truncation=True,
    padding=True,
    max_length=64
)

val_encodings = tokenizer(
    val_texts,
    truncation=True,
    padding=True,
    max_length=64
)


In [9]:
class SymptomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SymptomDataset(train_encodings, train_labels)
val_dataset = SymptomDataset(val_encodings, val_labels)


In [10]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
!pip install -U transformers




In [12]:
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)


In [13]:
import transformers
print(transformers.__version__)


4.57.3


In [14]:
import transformers
print("Transformers version:", transformers.__version__)

from transformers import Trainer
from transformers import TrainingArguments
from transformers import BertTokenizer
from transformers import BertForSequenceClassification


Transformers version: 4.57.3


In [15]:
print(TrainingArguments)


<class 'transformers.training_args.TrainingArguments'>


In [18]:
training_args = TrainingArguments(
    output_dir="./bert_results",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    do_train=True,
    do_eval=True,
    report_to="none"
)


In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

trainer.train()


  trainer = Trainer(


Step,Training Loss


TrainOutput(global_step=5, training_loss=1.4894232749938965, metrics={'train_runtime': 40.6438, 'train_samples_per_second': 0.492, 'train_steps_per_second': 0.123, 'total_flos': 82224419520.0, 'train_loss': 1.4894232749938965, 'epoch': 5.0})

In [20]:
SAVE_DIR = "/content/drive/MyDrive/PlantDocBot/bert_symptom_model"

model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

print("BERT model and tokenizer saved")


BERT model and tokenizer saved


In [21]:
def predict_symptom(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=1).item()
    return label_encoder.inverse_transform([pred])[0]

print(predict_symptom("yellow leaves curling and weak plant"))


Tomato Yellow Leaf Curl Virus
