In [1]:
# from transformers import DistilBertTokenizer
# from transformers import TFDistilBertForSequenceClassification
# from transformers import TextClassificationPipeline

# import pandas as pd
# import json
# import gc

# from sklearn.model_selection import train_test_split

# import re
# import nltk
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# stopw = stopwords.words('english')

# import seaborn as sns
# import matplotlib.pyplot as plt
# from plotly.offline import iplot

# from tqdm import tqdm

In [48]:
import pandas as pd
from datasets import Dataset, DatasetDict

In [49]:
df = pd.read_csv("college_queries.csv")


if df["labels"].dtype == 'object':
    unique_labels = sorted(df["labels"].unique())
    label2id = {label: i for i, label in enumerate(unique_labels)}
    id2label = {i: label for label, i in label2id.items()}
    df["labels"] = df["labels"].map(label2id)
else:
    # If already numeric
    label2id = None
    id2label = None

In [50]:
# Train / validation split
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(df, test_size=0.15, random_state=42)

dataset = DatasetDict({
    "train": Dataset.from_pandas(train_df.reset_index(drop=True)),
    "validation": Dataset.from_pandas(valid_df.reset_index(drop=True)),
})



In [51]:

# 3. Tokenizer
from transformers import AutoTokenizer
model_ckpt = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

def tokenize(batch):
    return tokenizer(batch["Queries"], padding=True, truncation=True)

tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["Queries"])
tokenized_dataset.set_format("torch")

Map:   0%|          | 0/157 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

In [52]:
# 4. Data Loaders
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    tokenized_dataset["train"],
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator
)

valid_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=32,
    shuffle=False,
    collate_fn=data_collator
)


In [53]:
# 5. Load Model
from transformers import AutoModelForSequenceClassification
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_labels = df["labels"].nunique()

model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [54]:
# 6. Setup Optimizer and Loss
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [55]:
# 7. Training Loop

from tqdm.auto import tqdm

num_epochs = 5

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0
    
    train_progress = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
    for batch in train_progress:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Calculate accuracy - FIXED: "Labels" -> "labels"
        predictions = torch.argmax(outputs.logits, dim=-1)
        train_correct += (predictions == batch["labels"]).sum().item()
        train_total += batch["labels"].size(0)
        train_loss += loss.item()
        
        train_progress.set_postfix({
            'loss': f'{loss.item():.4f}',
            'acc': f'{train_correct/train_total:.4f}'
        })
    
    avg_train_loss = train_loss / len(train_dataloader)
    train_accuracy = train_correct / train_total
    
    # Validation
    model.eval()
    valid_loss = 0
    valid_correct = 0
    valid_total = 0
    
    with torch.no_grad():
        valid_progress = tqdm(valid_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Valid]")
        for batch in valid_progress:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            
            # Calculate accuracy - FIXED: "Labels" -> "labels"
            predictions = torch.argmax(outputs.logits, dim=-1)
            valid_correct += (predictions == batch["labels"]).sum().item()
            valid_total += batch["labels"].size(0)
            valid_loss += loss.item()
            
            valid_progress.set_postfix({
                'loss': f'{loss.item():.4f}',
                'acc': f'{valid_correct/valid_total:.4f}'
            })
    
    avg_valid_loss = valid_loss / len(valid_dataloader)
    valid_accuracy = valid_correct / valid_total
    
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_accuracy:.4f}")
    print(f"Valid Loss: {avg_valid_loss:.4f} | Valid Acc: {valid_accuracy:.4f}\n")

Epoch 1/5 [Train]:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5 [Valid]:   0%|          | 0/1 [00:00<?, ?it/s]


Epoch 1/5
Train Loss: 2.1205 | Train Acc: 0.2293
Valid Loss: 1.9643 | Valid Acc: 0.3571



Epoch 2/5 [Train]:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 2/5 [Valid]:   0%|          | 0/1 [00:00<?, ?it/s]


Epoch 2/5
Train Loss: 1.7916 | Train Acc: 0.4331
Valid Loss: 1.8358 | Valid Acc: 0.3571



Epoch 3/5 [Train]:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 3/5 [Valid]:   0%|          | 0/1 [00:00<?, ?it/s]


Epoch 3/5
Train Loss: 1.5692 | Train Acc: 0.4331
Valid Loss: 1.6370 | Valid Acc: 0.3571



Epoch 4/5 [Train]:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 4/5 [Valid]:   0%|          | 0/1 [00:00<?, ?it/s]


Epoch 4/5
Train Loss: 1.3597 | Train Acc: 0.4586
Valid Loss: 1.4301 | Valid Acc: 0.4286



Epoch 5/5 [Train]:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 5/5 [Valid]:   0%|          | 0/1 [00:00<?, ?it/s]


Epoch 5/5
Train Loss: 1.1053 | Train Acc: 0.6433
Valid Loss: 1.1749 | Valid Acc: 0.7500



In [56]:
# 8. Save Model (Optional)
# ---------------------------------------------------------
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

('./trained_model\\tokenizer_config.json',
 './trained_model\\special_tokens_map.json',
 './trained_model\\vocab.txt',
 './trained_model\\added_tokens.json',
 './trained_model\\tokenizer.json')

In [57]:
# 9. Test Prediction
# ---------------------------------------------------------
def predict(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        label_id = torch.argmax(probs, dim=-1).item()
    
    return id2label[label_id]

In [58]:
# Test predictions
print("Testing predictions:")
print(f"'Where is my attendance?' -> {predict('Where is my attendance?')}")
print(f"'How do I check teacher details?' -> {predict('How do I check teacher details?')}")

Testing predictions:
'Where is my attendance?' -> Attendance
'How do I check teacher details?' -> College Info
