In [2]:
import pandas as pd
# Load the labelled replies. The workbook must provide a free-text `reply`
# column and a `label` column; the cells below rely on both names.
# Reading .xlsx files requires the optional `openpyxl` dependency.
df = pd.read_excel("reply_classification_dataset.xlsx")

# Fail fast with a clear message instead of a KeyError several cells later.
missing_cols = {"reply", "label"} - set(df.columns)
if missing_cols:
    raise KeyError(f"Dataset is missing required column(s): {sorted(missing_cols)}")

# Quick preview. Only the last expression of a cell is rendered automatically,
# so the head has to be displayed explicitly.
display(df.head())

df.shape

(2129, 2)

In [1]:
# Environment report: the PyTorch build and the CUDA devices it can see.
import torch

cuda_ok = torch.cuda.is_available()
print('PyTorch version:', torch.__version__)
print('CUDA available:', cuda_ok)
print('CUDA device count:', torch.cuda.device_count())
# The device name is only queried when CUDA is usable.
if cuda_ok:
    print('CUDA device name:', torch.cuda.get_device_name(0))

PyTorch version: 2.9.0+cpu
CUDA available: False
CUDA device count: 0


In [4]:
# Drop rows that cannot be used for supervised training: a missing reply has
# nothing to classify, and a missing label cannot be mapped to a class id.
df = df.dropna(subset=['reply', 'label']).reset_index(drop=True)

df.shape

(2129, 2)

In [5]:
import re
def clean_text(s):
    """Normalise the whitespace of a single piece of text.

    Leading and trailing whitespace is removed, and every internal run of
    whitespace characters is collapsed into one space.

    Args:
        s: The string to clean.

    Returns:
        The cleaned string.
    """
    trimmed = s.strip()
    return re.sub(r'\s+', ' ', trimmed)
# Coerce every reply to ``str`` first, then normalise its whitespace.
reply_text = df['reply'].astype(str)
df['reply'] = reply_text.map(clean_text)


In [6]:
# Canonicalise labels: surrounding whitespace is stripped and case is folded,
# so that variants such as "Positive " and "positive" collapse into one class.
df['label'] = df['label'].str.strip().str.lower()


In [7]:

label2id = {'negative': 0, 'neutral': 1, 'positive': 2}

# Series.map leaves NaN for any value that is not a key of the mapping, which
# would silently inject missing class ids (and turn the column into floats).
# Surface such labels immediately instead.
label_ids = df['label'].map(label2id)
unmapped = df.loc[label_ids.isna(), 'label']
if not unmapped.empty:
    raise ValueError(
        f"Unexpected label value(s): {sorted(unmapped.astype(str).unique())}; "
        f"expected one of {sorted(label2id)}"
    )
df['label_id'] = label_ids.astype('int64')

# Class balance check (fractions per label).
df['label'].value_counts(normalize=True)


label
positive    0.33349
negative    0.33349
neutral     0.33302
Name: proportion, dtype: float64

In [8]:
# Sanity check: show the first five rows of the prepared frame.
df.head(n=5)


Unnamed: 0,reply,label,label_id
0,Can we discuss pricing??,neutral,1
1,"Im excited to explore this further, plz send c...",positive,2
2,We not looking for new solutions.,negative,0
3,Could u clarify features included?,neutral,1
4,"lets,, schedule a meeting to dive deeper",positive,2


In [9]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation. The split is stratified on the
# class id and uses a fixed random_state.
train, val = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label_id'],
)


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Baseline model: word uni-/bi-gram TF-IDF features (capped at 20k terms)
# feeding a logistic regression.
vect = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
)
# 'balanced' class weighting is meant for imbalanced label distributions.
clf = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
)

# Chain both steps and fit on the training split.
pipe = make_pipeline(vect, clf)
pipe.fit(train['reply'], train['label_id'])


0,1,2
,steps,"[('tfidfvectorizer', ...), ('logisticregression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [11]:
# Score the baseline on the held-out validation split.
pred = pipe.predict(val['reply'])
acc = accuracy_score(val['label_id'], pred)
f1 = f1_score(val['label_id'], pred, average='macro')
print(acc, f1)

# Pass the class ids and their names explicitly, ordered by id. This keeps the
# report rows aligned with the right names regardless of the dict's insertion
# order, and still works when a class is absent from the validation data.
report_names = sorted(label2id, key=label2id.get)
report_ids = [label2id[name] for name in report_names]
print(classification_report(val['label_id'], pred, labels=report_ids, target_names=report_names))


0.9976525821596244 0.997652553055194
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       142
     neutral       1.00      0.99      1.00       142
    positive       0.99      1.00      1.00       142

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426



In [12]:
import os

import joblib

# Persist the fitted baseline pipeline. joblib.dump does not create missing
# parent directories, so make sure the target folder exists first.
os.makedirs("models", exist_ok=True)
joblib.dump(pipe, "models/baseline_model.pkl")


['models/baseline_model.pkl']

In [13]:
# Transformer fine-tune

from datasets import Dataset
def _to_hf_dataset(frame):
    """Convert a split DataFrame into a Hugging Face ``Dataset``.

    Only the ``reply`` text and the integer class id are kept; the id column
    is renamed from ``label_id`` to ``label``.

    The frames produced by ``train_test_split`` carry a shuffled, non-default
    index, which ``Dataset.from_pandas`` would otherwise store as an extra
    ``__index_level_0__`` column, so the index is dropped explicitly.
    """
    selected = frame[['reply', 'label_id']].rename(columns={'label_id': 'label'})
    return Dataset.from_pandas(selected, preserve_index=False)

train_ds = _to_hf_dataset(train)
val_ds = _to_hf_dataset(val)


In [14]:
from transformers import AutoTokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize a batch of replies, truncating each one to at most 128 tokens.

    No padding is applied here. Padding every example to 128 tokens up front
    makes each short reply as expensive as the longest possible one. Because
    the Trainer is given the tokenizer, its default collator pads each batch
    to that batch's longest sequence instead.

    Args:
        batch: A mapping with a ``'reply'`` entry holding the texts.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch['reply'], truncation=True, max_length=128)

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)
# Expose only the model inputs and the target as torch tensors.
columns = ['input_ids','attention_mask','label']
train_ds.set_format(type='torch', columns=columns)
val_ds.set_format(type='torch', columns=columns)


Map:   0%|          | 0/1703 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Store the class names in the model config so that the saved checkpoint
# reports "negative" / "neutral" / "positive" rather than generic LABEL_<n>
# names. The number of output classes is derived from the same mapping.
id2label = {idx: name for name, idx in label2id.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

import torch

# Mixed precision is only requested when a CUDA device is actually present;
# asking for it unconditionally is wrong on CPU-only machines.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="hf_model",
    eval_strategy="epoch",
    save_strategy="epoch",           # save and eval strategies must match for load_best_model_at_end
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",      # the "f1" key returned by compute_metrics
    greater_is_better=True,
    fp16=use_fp16,                    # mixed precision only when CUDA is available
    gradient_checkpointing=True,      # reduce memory usage at cost of speed
)

import numpy as np
from evaluate import load
# Load the two metric implementations that compute_metrics relies on.
accuracy_metric, f1_metric = (load(metric_id) for metric_id in ("accuracy", "f1"))

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predicted, references=gold)
    macro_f1 = f1_metric.compute(predictions=predicted, references=gold, average="macro")
    return {"accuracy": accuracy["accuracy"], "f1": macro_f1["f1"]}

import inspect

# Newer transformers releases renamed Trainer's `tokenizer` argument to
# `processing_class` and emit a FutureWarning for the old spelling. Pick
# whichever keyword the installed version accepts so the cell runs cleanly
# on both.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)
# Fine-tune, then write the model and its tokenizer to the same directory.
hf_model_dir = "models/hf_model"
trainer.train()
trainer.save_model(hf_model_dir)
tokenizer.save_pretrained(hf_model_dir)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 