In [1]:
# Run in a Colab cell (prepend `!`)
!pip install -q transformers datasets evaluate scikit-learn lightgbm joblib

In [2]:
from google.colab import files
import pandas as pd

uploaded = files.upload()  # click to upload reply_classification_dataset.csv
df = pd.read_csv(next(iter(uploaded.keys())))

Saving reply_classification_dataset.csv to reply_classification_dataset (1).csv


In [3]:
df.head(10)
df.shape
df.info()
# Check class distribution
print(df['label'].value_counts())   # change 'label' to your label column name
# Check missing values & duplicates
print(df.isnull().sum())
print("duplicates:", df.duplicated(subset=['reply']).sum())  # change 'reply' to text col

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2129 entries, 0 to 2128
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   reply   2129 non-null   object
 1   label   2129 non-null   object
dtypes: object(2)
memory usage: 33.4+ KB
label
neutral     704
positive    446
NEGATIVE    267
POSITIVE    263
Negative    254
negative    189
Neutral       3
NEUTRAL       2
Positive      1
Name: count, dtype: int64
reply    0
label    0
dtype: int64
duplicates: 1808


In [4]:
import re
import numpy as np
import pandas as pd

TEXT_COL = 'reply'   # replace with your text column
LABEL_COL = 'label'  # replace if different

def clean_text(s):
    if pd.isna(s): return ""
    s = str(s).lower()
    s = re.sub(r'http\S+|www\.\S+', ' ', s)
    s = re.sub(r'\S+@\S+', ' ', s)
    s = re.sub(r'\d+', ' ', s)
    s = re.sub(r'[^a-z\s]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

df['text_clean'] = df[TEXT_COL].apply(clean_text)
df[LABEL_COL] = df[LABEL_COL].astype(str).str.lower().str.strip()
df = df[df['text_clean'].str.len() > 0]  # drop empty

In [5]:
from sklearn.model_selection import train_test_split
labels = sorted(df[LABEL_COL].unique())
label2id = {l:i for i,l in enumerate(labels)}
id2label = {i:l for l,i in label2id.items()}
df['label_id'] = df[LABEL_COL].map(label2id)

# Stratified split: train (80%), test (20%) then split train->train+val if desired
X = df['text_clean'].values
y = df['label_id'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Optional: get a validation set from train
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=42
)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
import joblib

tf = TfidfVectorizer(max_features=10000, ngram_range=(1,2), min_df=2)
Xtr_tfidf = tf.fit_transform(X_train)
Xval_tfidf = tf.transform(X_val)
Xtest_tfidf = tf.transform(X_test)

lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
lr.fit(Xtr_tfidf, y_train)

# Evaluate on validation and test
for name, X_, y_ in [('val', Xval_tfidf, y_val), ('test', Xtest_tfidf, y_test)]:
    preds = lr.predict(X_)
    print(f"=== {name} ===")
    print("Accuracy:", accuracy_score(y_, preds))
    print("Macro F1:", f1_score(y_, preds, average='macro'))
    print(classification_report(y_, preds, target_names=[id2label[i] for i in sorted(id2label)]))

# Save artifacts
joblib.dump(tf, 'tfidf_vectorizer.joblib')
joblib.dump(lr, 'logistic_regression_tfidf.joblib')


=== val ===
Accuracy: 0.9941520467836257
Macro F1: 0.9941515967679878
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00        57
     neutral       1.00      0.98      0.99        57
    positive       0.98      1.00      0.99        57

    accuracy                           0.99       171
   macro avg       0.99      0.99      0.99       171
weighted avg       0.99      0.99      0.99       171

=== test ===
Accuracy: 0.9976525821596244
Macro F1: 0.997652553055194
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       142
     neutral       1.00      0.99      1.00       142
    positive       0.99      1.00      1.00       142

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426



['logistic_regression_tfidf.joblib']

In [7]:
pip install -U lightgbm



In [8]:
!pip install -U lightgbm
import lightgbm as lgb
print(lgb.__version__)

4.6.0


In [9]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import accuracy_score, f1_score

# Create datasets
dtrain = lgb.Dataset(Xtr_tfidf, label=y_train)
dval = lgb.Dataset(Xval_tfidf, label=y_val, reference=dtrain)

# Parameters
params = {
    'objective': 'multiclass',
    'num_class': len(labels),
    'metric': 'multi_logloss',
    'verbosity': -1,
    'learning_rate': 0.05
}

# Train with callbacks instead of early_stopping_rounds
bst = lgb.train(
    params,
    dtrain,
    valid_sets=[dval],
    num_boost_round=1000,
    callbacks=[
        early_stopping(stopping_rounds=30),
        log_evaluation(period=50)  # optional, logs every 50 rounds
    ]
)

# Predictions
y_pred_prob = bst.predict(Xtest_tfidf)
y_pred = y_pred_prob.argmax(axis=1)

# Metrics
print("LGB test acc:", accuracy_score(y_test, y_pred))
print("LGB test macro F1:", f1_score(y_test, y_pred, average='macro'))

# Save model
bst.save_model('lightgbm_tfidf.txt')


Training until validation scores don't improve for 30 rounds
[50]	valid_0's multi_logloss: 0.0603089
[100]	valid_0's multi_logloss: 0.0161705
[150]	valid_0's multi_logloss: 0.014838
Early stopping, best iteration is:
[138]	valid_0's multi_logloss: 0.0142475
LGB test acc: 0.9859154929577465
LGB test macro F1: 0.9859153474355941


<lightgbm.basic.Booster at 0x7d7396bbb290>

In [10]:
!pip install evaluate



In [11]:
import evaluate

# Load metric
metric = evaluate.load("accuracy")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return metric.compute(predictions=preds, references=labels)

In [13]:
#!pip install --upgrade transformers

In [14]:
evaluation_strategy='epoch'
save_strategy='epoch'
load_best_model_at_end=True
metric_for_best_model='f1_macro'

In [15]:
#pip install --upgrade transformers datasets evaluate

In [16]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# -------------------------------
# 2️⃣ Prepare model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# -------------------------------
# 3️⃣ Prepare Hugging Face datasets
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
val_df   = pd.DataFrame({'text': X_val,   'label': y_val})
test_df  = pd.DataFrame({'text': X_test,  'label': y_test})

train_ds = Dataset.from_pandas(train_df)
val_ds   = Dataset.from_pandas(val_df)
test_ds  = Dataset.from_pandas(test_df)

# Tokenization function
def tokenize_fn(example):
    return tokenizer(example['text'], truncation=True, padding='max_length', max_length=128)

# Apply tokenization
train_ds = train_ds.map(tokenize_fn, batched=True)
val_ds   = val_ds.map(tokenize_fn, batched=True)
test_ds  = test_ds.map(tokenize_fn, batched=True)

# Set format for PyTorch
columns = ['input_ids', 'attention_mask', 'label']
train_ds.set_format(type='torch', columns=columns)
val_ds.set_format(type='torch', columns=columns)
test_ds.set_format(type='torch', columns=columns)

# -------------------------------
# 4️⃣ Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(labels))

# -------------------------------
# 5️⃣ Metrics using evaluate
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_metric.compute(predictions=preds, references=labels)['accuracy']
    f1 = f1_metric.compute(predictions=preds, references=labels, average='macro')['f1']
    return {'accuracy': acc, 'f1_macro': f1}

# -------------------------------
# 6️⃣ Training arguments
training_args = TrainingArguments(
    output_dir='./distilbert_finetuned',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=5,
    logging_steps=50,
    save_total_limit=2,
    do_train=True,   # enable training
    do_eval=True     # enable evaluation
)

# -------------------------------
# 7️⃣ Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# -------------------------------
# 8️⃣ Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# -------------------------------
# 9️⃣ Train the model
trainer.train()

# -------------------------------
# 🔟 Save model and tokenizer
trainer.save_model('./distilbert_finetuned')
tokenizer.save_pretrained('./distilbert_finetuned')

# -------------------------------
# 1️⃣1️⃣ Evaluate on test set
test_metrics = trainer.evaluate(test_ds)
print("Test set metrics:", test_metrics)


Map:   0%|          | 0/1532 [00:00<?, ? examples/s]

Map:   0%|          | 0/171 [00:00<?, ? examples/s]

Map:   0%|          | 0/426 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrakeshkumarr251203[0m ([33mrakeshkumarr251203-company[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.7233
100,0.1245
150,0.0151
200,0.0065
250,0.0042
300,0.003
350,0.0302
400,0.0184
450,0.0018
500,0.0015


Test set metrics: {'eval_loss': 0.0009341947152279317, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_runtime': 1.5829, 'eval_samples_per_second': 269.133, 'eval_steps_per_second': 17.058, 'epoch': 5.0}


In [17]:
# Baseline LR on test
lr_preds = lr.predict(Xtest_tfidf)
from sklearn.metrics import classification_report
print("LR report:")
print(classification_report(y_test, lr_preds, target_names=[id2label[i] for i in sorted(id2label)]))

# Transformer test predictions using Trainer.predict
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

model_tf = AutoModelForSequenceClassification.from_pretrained('./distilbert_finetuned')
tokenizer = AutoTokenizer.from_pretrained('./distilbert_finetuned')

def predict_transformer(texts):
    enc = tokenizer(texts, truncation=True, padding=True, return_tensors='pt')
    if torch.cuda.is_available():
        model_tf.cuda()
        enc = {k:v.cuda() for k,v in enc.items()}
    with torch.no_grad():
        out = model_tf(**enc)
        probs = torch.nn.functional.softmax(out.logits, dim=-1).cpu().numpy()
    preds = probs.argmax(axis=1)
    return preds, probs

preds_tf, probs_tf = predict_transformer(list(X_test))
print("Transformer report:")
print(classification_report(y_test, preds_tf, target_names=[id2label[i] for i in sorted(id2label)]))


LR report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       142
     neutral       1.00      0.99      1.00       142
    positive       0.99      1.00      1.00       142

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426

Transformer report:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00       142
     neutral       1.00      1.00      1.00       142
    positive       1.00      1.00      1.00       142

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426



In [18]:
from google.colab import drive

# Mount to a different empty folder
drive.mount('/content/my_drive')

Mounted at /content/my_drive


In [19]:
import os
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from google.colab import drive

# 1️⃣ Mount Google Drive
drive.mount('/content/drive')

# 2️⃣ Define paths
save_folder = '/content/drive/MyDrive/svaraai/'
tfidf_path = os.path.join(save_folder, 'tfidf_vectorizer.joblib')
lr_path    = os.path.join(save_folder, 'logistic_regression_tfidf.joblib')
distilbert_src = './distilbert_finetuned'
distilbert_dst = os.path.join(save_folder, 'distilbert_finetuned')

# 3️⃣ Create folder if it doesn't exist
os.makedirs(save_folder, exist_ok=True)

# 4️⃣ Example: create TF-IDF + Logistic Regression (replace with your trained models)
tf = TfidfVectorizer()
lr = LogisticRegression()

# If you have training data, fit them first:
# tfidf_matrix = tf.fit_transform(X_train)
# lr.fit(tfidf_matrix, y_train)

# 5️⃣ Save the TF-IDF vectorizer and Logistic Regression
joblib.dump(tf, tfidf_path)
joblib.dump(lr, lr_path)

# 6️⃣ Copy the distilbert_finetuned folder to Google Drive
!cp -r "{distilbert_src}" "{distilbert_dst}"

print("✅ Models and transformer saved successfully!")


Mounted at /content/drive
✅ Models and transformer saved successfully!


In [20]:
import random, numpy as np, torch
random.seed(42); np.random.seed(42); torch.manual_seed(42)
if torch.cuda.is_available(): torch.cuda.manual_seed_all(42)