In [2]:
# ✅ Step 1: Install required libraries
!pip install -U transformers datasets scikit-learn

# ✅ Step 2: Import Libraries
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset
import os

# ✅ Disable wandb logging
# Set before the Trainer is created further down. Note: transformers reports
# this variable as deprecated (removal planned for v5) and points to the
# `report_to` training argument as the replacement.
os.environ.update({"WANDB_DISABLED": "true"})

# ✅ Step 3: Load dataset
# Keep only the two columns used below, drop rows where either is missing,
# and keep only rows whose sentiment is one of the three handled classes.
df = pd.read_csv("test.csv", encoding="latin1")
df = df[['text', 'sentiment']].dropna()
# .copy() detaches the filtered frame from the one it was sliced from, so
# adding the 'label' column in Step 4 writes to a real frame instead of
# triggering pandas' SettingWithCopyWarning.
df = df[df['sentiment'].isin(['positive', 'negative', 'neutral'])].copy()

# ✅ Step 4: Encode labels
# Class names are sorted first so the name -> id assignment is deterministic
# regardless of row order in the CSV.
label2id = {label: idx for idx, label in enumerate(sorted(df['sentiment'].unique()))}
id2label = {idx: label for label, idx in label2id.items()}
df['label'] = df['sentiment'].map(label2id)

# ✅ Step 5: Train-test split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42,
)

# ✅ Step 6: Tokenization
# Only truncate here. Padding is intentionally NOT applied at this stage:
# padding=True would pad every example to the longest text in the whole
# split, while the DataCollatorWithPadding passed to the Trainer already pads
# each batch to that batch's own longest sequence.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True)
test_encodings = tokenizer(test_texts, truncation=True)

# ✅ Step 7: Convert to Dataset
# Each dataset carries the token ids, the attention mask and the integer
# class label; rows may have different lengths until the collator pads them.
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'label': train_labels,
})
test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'label': test_labels,
})

# ✅ Step 8: Load model
# The classification head is sized from the label mapping, and both mappings
# are stored in the model config so predictions can be read back as names.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# ✅ Step 9: Training arguments (safe, minimal)
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=10,
    # Supported way to switch off wandb (and any other logging integration);
    # the WANDB_DISABLED environment variable is deprecated. The training
    # loss is still printed every `logging_steps` steps.
    # NOTE(review): with no integration enabled, nothing should be written to
    # `logging_dir`; use report_to="tensorboard" if those logs are wanted.
    report_to="none",
)

# ✅ Step 10: Trainer setup
# The collator pads each batch on the fly to its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
    # replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
)

# ✅ Step 11: Train the model
trainer.train()

# ✅ Step 12: Evaluate
# predict() returns raw logits; the arg-max over the last axis is the
# predicted class id for each test example.
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)

print("\n✅ Accuracy:", accuracy_score(test_labels, pred_labels))

# Pass the label ids explicitly, aligned with their names. Without `labels`,
# classification_report infers the classes from the data and raises if a
# class is missing from both the test split and the predictions, because the
# number of names no longer matches.
report = classification_report(
    test_labels,
    pred_labels,
    labels=list(label2id.values()),
    target_names=list(label2id.keys()),
)
print("\n📊 Classification Report:\n", report)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(
  return forward_call(*args, **kwargs)


Step,Training Loss
10,1.1215
20,1.0574
30,1.0353
40,0.9811
50,0.9004
60,0.8444
70,0.795
80,0.7609
90,0.7491
100,0.6925


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)



✅ Accuracy: 0.7694483734087695

📊 Classification Report:
               precision    recall  f1-score   support

    negative       0.77      0.81      0.79       207
     neutral       0.75      0.73      0.74       286
    positive       0.79      0.78      0.79       214

    accuracy                           0.77       707
   macro avg       0.77      0.77      0.77       707
weighted avg       0.77      0.77      0.77       707

