In [1]:
# Install / upgrade dependencies (run this cell FIRST)
!pip install -q "transformers[torch]" "accelerate>=0.26.0" datasets scikit-learn pandas

# If you still see a torch import error, uncomment and run the appropriate torch install for your platform:
# For Linux/CUDA (example): !pip install -q torch --index-url https://download.pytorch.org/whl/cu118
# For CPU-only: !pip install -q torch



In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

# Sanity check: confirms torch imported successfully and shows the installed build.
print("torch available:", torch.__version__)


  from .autonotebook import tqdm as notebook_tqdm


torch available: 2.8.0+cpu


In [5]:
# Categories the classifier is trained on; list order determines the integer ids.
allowed_labels = ["streetlight", "garbage", "potholes"]

# Two-way lookup between category names and the integer ids used for training.
label2id = dict(zip(allowed_labels, range(len(allowed_labels))))
id2label = dict(enumerate(allowed_labels))

# Read the CSV (expects 'text' and 'label' columns), keep only rows whose label
# is one of the allowed categories, and renumber the index from zero.
df = pd.read_csv("../data/textdata.csv")
df = df.loc[df["label"].isin(allowed_labels)].reset_index(drop=True)

# Keep the text label and add its integer id alongside as 'label_id'.
df = df.assign(label_id=df["label"].map(label2id))

print("Label mapping:", label2id)
print("Loaded dataset shape:", df.shape)
df.head()

Label mapping: {'streetlight': 0, 'garbage': 1, 'potholes': 2}
Loaded dataset shape: (1267, 3)


Unnamed: 0,text,label,label_id
0,The streetlight at the main intersection is no...,streetlight,0
1,There's a massive pile of garbage near the com...,garbage,1
2,A deep pothole on Station Road is causing traf...,potholes,2
3,Hamari gali ki light pichle 10 din se kharab hai.,streetlight,0
4,Market me har taraf gandagi faili hui hai.,garbage,1


In [11]:
# New frame without the integer 'label_id' column; `df` itself is left unchanged.
df_no_label_id = df.drop(labels=["label_id"], axis="columns")

# Predict label for a given text using the trained model
def predict_label(text):
    """Return the category name predicted for a single piece of text.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``id2label``;
    the cells that create them must have been run before calling this.

    Args:
        text: One issue description as a string.

    Returns:
        The entry of ``id2label`` for the highest-scoring class.
    """
    # Switch off dropout for inference; the model may still be in training
    # mode if it has just been through Trainer.train().
    model.eval()
    # Truncate to the 128-token length used when tokenizing the training data,
    # so an over-long input cannot exceed the model's maximum sequence length.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Take the argmax over the class dimension of the first (only) row rather
    # than over the flattened logits tensor.
    predicted_class_id = int(logits[0].argmax(dim=-1))
    return id2label[predicted_class_id]

# Example usage: the sentence matches none of the three allowed categories,
# yet predict_label always returns one of them.
# NOTE(review): `tokenizer` and `model` are first created in cells further down
# this notebook, so this call fails on a fresh "Restart & Run All" — confirm
# the intended cell order.
example_text = "My internet is not working properly"
predicted_label = predict_label(example_text)
print(f"Predicted label: {predicted_label}")

Predicted label: garbage


In [12]:
def _to_hf_dataset(frame):
    """Convert a split of ``df`` into a Hugging Face Dataset with 'text' and 'label' columns.

    The integer 'label_id' column is renamed to 'label', the column name the
    Hugging Face Trainer expects for targets. ``preserve_index=False`` stops the
    shuffled pandas index from being stored as an extra ``__index_level_0__``
    column in the dataset.
    """
    columns = frame[["text", "label_id"]].rename(columns={"label_id": "label"})
    return Dataset.from_pandas(columns, preserve_index=False)


# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = _to_hf_dataset(train_df)
test_dataset = _to_hf_dataset(test_df)

In [13]:
# Checkpoint to fine-tune, together with its matching tokenizer.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the "text" entries of a batch, padded/truncated to exactly 128 tokens."""
    tokenizer_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(batch["text"], **tokenizer_options)


# Apply `tokenize` to both splits in batched mode (train first, then test).
train_dataset, test_dataset = (
    split.map(tokenize, batched=True) for split in (train_dataset, test_dataset)
)


Map: 100%|██████████| 1013/1013 [00:00<00:00, 9520.89 examples/s]
Map: 100%|██████████| 254/254 [00:00<00:00, 6918.01 examples/s]


In [8]:
# Sequence-classification model with one output per category; both label maps
# are passed through so they are available on the model's config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(id2label),
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Training configuration for a short, CPU-friendly fine-tuning run.
training_args = TrainingArguments(
    output_dir="../model/results",
    per_device_train_batch_size=4,  # small batches for a quick CPU run
    per_device_eval_batch_size=4,
    num_train_epochs=1,             # single pass over the training split
    weight_decay=0.01,
    logging_dir="../model/logs",
    logging_steps=50,               # log the training loss every 50 steps
    # The string "none" turns off wandb/tensorboard reporting. Python None is not
    # equivalent: depending on the transformers version it falls back to "all".
    report_to="none",
    push_to_hub=False,
    dataloader_pin_memory=False,    # Disable pin memory for CPU
    save_steps=500,                 # checkpoint interval, in steps
    eval_steps=500,                 # only takes effect with eval_strategy="steps", which is not set here
)

In [16]:
# Bundle the model, training arguments and tokenized splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,  # `processing_class` replaces the former `tokenizer` argument
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning; as the cell's last expression, the returned TrainOutput is displayed.
trainer.train()

Step,Training Loss
50,0.0745
100,0.0002
150,0.0001
200,0.0049
250,0.0001


TrainOutput(global_step=254, training_loss=0.01570136011447519, metrics={'train_runtime': 311.765, 'train_samples_per_second': 3.249, 'train_steps_per_second': 0.815, 'total_flos': 33547966979328.0, 'train_loss': 0.01570136011447519, 'epoch': 1.0})

In [17]:
# Write the model first and then the tokenizer into the same directory,
# so the pair can be reloaded together from that path.
for artifact in (model, tokenizer):
    artifact.save_pretrained("../model/saved_model")

print("✅ Model saved in ../model/saved_model")


✅ Model saved in ../model/saved_model


In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Reload the saved artifacts from disk; this rebinds the notebook-level
# `model` and `tokenizer` names to the reloaded objects.
model_path = "../model/saved_model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Function to predict category
def classify_issue(text):
    """Classify one issue description and report the model's confidence.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: A single issue description as a string.

    Returns:
        A dict with the keys "text" (the input, unchanged), "predicted_category"
        (the name from ``model.config.id2label``) and "confidence" (the softmax
        probability of the predicted class, as a float).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Work on row 0 explicitly so the class index and the confidence are read
    # from the same row; a flat argmax over the whole tensor is only correct
    # when there is exactly one row.
    probs = torch.nn.functional.softmax(logits, dim=-1)[0]
    predicted_class_id = int(probs.argmax(dim=-1))
    return {
        "text": text,
        "predicted_category": model.config.id2label[predicted_class_id],
        "confidence": probs[predicted_class_id].item(),
    }

# 🔹 Test the classifier on a handful of short English and transliterated phrases
examples = [
    "The streetlight near my home is broken",
    "There is garbage dumped on the roadside",
    "The road has a big pothole causing accidents",
    "road me gadhha hai",
    "streetlight khrab hai",
    "kachra faila hua hai",
    "tuta hua road hai",
    "gadha hai",
    "kachra faila hai road pr",
]

# Print one result dict per example, in list order.
for text in examples:
    prediction = classify_issue(text)
    print(prediction)


{'text': 'The streetlight near my home is broken', 'predicted_category': 'streetlight', 'confidence': 0.9999548196792603}
{'text': 'There is garbage dumped on the roadside', 'predicted_category': 'garbage', 'confidence': 0.9999219179153442}
{'text': 'The road has a big pothole causing accidents', 'predicted_category': 'potholes', 'confidence': 0.9999740123748779}
{'text': 'road me gadhha hai', 'predicted_category': 'potholes', 'confidence': 0.9999723434448242}
{'text': 'streetlight khrab hai', 'predicted_category': 'streetlight', 'confidence': 0.9999518394470215}
{'text': 'kachra faila hua hai', 'predicted_category': 'garbage', 'confidence': 0.9999144077301025}
{'text': 'tuta hua road hai', 'predicted_category': 'potholes', 'confidence': 0.9999680519104004}
{'text': 'gadha hai', 'predicted_category': 'potholes', 'confidence': 0.9994914531707764}
{'text': 'kachra faila hai road pr', 'predicted_category': 'garbage', 'confidence': 0.9998955726623535}
{'text': 'tuta hua road hai', 'predict