In [None]:
# ✅ STEP 1: Install Hugging Face Transformers
!pip install -q transformers

# ✅ STEP 2: Imports
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

import os
# Switch off Weights & Biases reporting for this run. The variable is set
# here, before any Trainer is created further down.
# NOTE(review): presumably read by the transformers W&B integration when the
# Trainer is built — confirm against the installed transformers version.
os.environ["WANDB_DISABLED"] = "true"


# ✅ STEP 3: Load Data
# The training frame is narrowed to the tweet text and its label right after
# reading; the test frame keeps every column.
train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv").loc[:, ['text', 'target']]
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# ✅ STEP 4: Tokenization & Dataset Prep
# Tokenizer for the pretrained checkpoint; TweetDataset below reads this
# module-level name.
tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

class TweetDataset(Dataset):
    """Torch dataset of tokenized texts, optionally paired with labels.

    All texts are tokenized once in the constructor using the module-level
    ``tokenizer`` (called with ``truncation=True`` and ``padding=True``).

    Args:
        texts: Sequence of raw strings to tokenize.
        labels: Optional sequence of labels aligned with ``texts``. Leave as
            ``None`` for unlabeled data (e.g. the test set).
        max_length: Maximum sequence length passed to the tokenizer.

    Raises:
        ValueError: If ``labels`` is given and its length differs from
            ``texts``.
    """

    def __init__(self, texts, labels=None, *, max_length=128):
        # Fail fast: a mismatch would otherwise only surface as an IndexError
        # part-way through an epoch, or go unnoticed if labels is longer.
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.encodings = tokenizer(
            texts, truncation=True, padding=True, max_length=max_length
        )
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``.

        One entry per tokenizer output field, plus ``'labels'`` when the
        dataset was built with labels.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.encodings['input_ids'])

# ✅ STEP 5: Train / Validation Split
# A fixed seed makes the split (and therefore the validation numbers)
# reproducible between runs; stratifying on the label keeps the class
# balance the same in the train and validation partitions.
SPLIT_SEED = 42
all_texts = train_df['text'].tolist()
all_labels = train_df['target'].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=all_labels,
)

train_dataset = TweetDataset(train_texts, train_labels)
val_dataset = TweetDataset(val_texts, val_labels)

# ✅ STEP 6: Load Pretrained BERT Model
# Sequence-classification model built from the pretrained checkpoint,
# configured for two output labels.
model_checkpoint = "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

# ✅ STEP 7: Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir="./logs",
)


def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one Trainer evaluation pass.

    Args:
        eval_pred: The (predictions, label_ids) pair the Trainer passes to
            its metrics callback; predictions are per-class scores.

    Returns:
        Dict with ``"accuracy"`` and ``"f1"`` (binary F1, positive class 1).
    """
    # Imported locally so this block does not rely on a new file-level import.
    from sklearn.metrics import accuracy_score, f1_score

    logits, labels = eval_pred
    predicted = logits.argmax(axis=-1)
    return {
        "accuracy": float(accuracy_score(labels, predicted)),
        "f1": float(f1_score(labels, predicted)),
    }


# ✅ STEP 8: Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# ✅ STEP 9: Train
trainer.train()

# No evaluation schedule is configured in the arguments above, so run one
# explicit pass over the validation split; otherwise val_dataset is never used.
val_metrics = trainer.evaluate()
print(val_metrics)

# ✅ STEP 10: Predict on Test Set
# The test set has no labels, so the dataset is built from texts only.
test_dataset = TweetDataset(test_df['text'].tolist())
prediction_output = trainer.predict(test_dataset)
preds = prediction_output.predictions
# Predicted class = index of the largest score in each row.
final_preds = torch.tensor(preds).argmax(dim=1)

# ✅ STEP 11: Submission
# Start from the sample submission file and overwrite its target column.
submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
submission = submission.assign(target=final_preds.numpy())
submission.to_csv("submission.csv", index=False)