In [None]:
#---------------------------------------------
# NLP Exploration: Sentence Classification with DistilBERT
#---------------------------------------------

# Install required packages (uncomment if needed)
# !pip install transformers datasets torch scikit-learn

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ----------------- DATA -----------------
# Toy corpus kept as (sentence, category) pairs so each text sits next to its label.
labeled_examples = [
    ("I love reading books on data science.", "Tech"),
    ("The weather today is sunny and warm.", "Non-Tech"),
    ("Python is a great programming language for AI.", "Tech"),
    ("I need to buy groceries after work.", "Non-Tech"),
    ("The new movie was fantastic!", "Non-Tech"),
    ("Machine learning models require careful tuning.", "Tech"),
    ("My car broke down on the way home.", "Non-Tech"),
    ("Data visualization helps understand trends.", "Tech"),
    ("I enjoy hiking during the weekends.", "Non-Tech"),
    ("Artificial intelligence is transforming industries.", "Tech"),
]

# Parallel lists consumed by the rest of the script.
sentences = [text for text, _ in labeled_examples]
labels = [category for _, category in labeled_examples]

# Map the string categories onto integer class ids.
le = LabelEncoder()
y = le.fit_transform(labels)

# Hold out 30% of the examples, stratified on the encoded labels, with a fixed seed.
split_options = {"test_size": 0.3, "random_state": 42, "stratify": y}
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    sentences, y, **split_options
)

# ----------------- TOKENIZATION -----------------
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(batch):
    """Tokenize the ``text`` entry of a batch into fixed-width model inputs.

    Every example is truncated or padded to exactly 64 tokens. Padding to a
    fixed length (instead of to the longest sentence in whichever batch is
    passed in) keeps all rows the same width regardless of how the data is
    chunked, so rows can always be stacked together later.

    Args:
        batch: Mapping with a ``'text'`` entry holding the sentences to encode.

    Returns:
        The tokenizer output for the given sentences.
    """
    return tokenizer(batch['text'], padding="max_length", truncation=True, max_length=64)

# Create Hugging Face Datasets
# Each split carries a "text" column and a "label" column.
train_columns = {"text": train_sentences, "label": train_labels}
test_columns = {"text": test_sentences, "label": test_labels}
train_dataset = Dataset.from_dict(train_columns)
test_dataset = Dataset.from_dict(test_columns)

# Bundle both splits and run the tokenizer over them in batched mode.
dataset = DatasetDict({"train": train_dataset, "test": test_dataset}).map(tokenize, batched=True)

# Expose the model inputs and the label in 'torch' format.
model_columns = ['input_ids', 'attention_mask', 'label']
dataset.set_format(type='torch', columns=model_columns)

# ----------------- MODEL -----------------
# The classification head gets one output per class known to the label encoder.
num_classes = len(le.classes_)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_classes)

# ----------------- METRICS -----------------
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Prediction object exposing ``label_ids`` (the true class ids) and
            ``predictions`` (per-class scores); the arg-max over the last axis
            of the scores is used as the predicted class.

    Returns:
        dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``.
    """
    true_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)
    # On a very small evaluation set a class may never be predicted, which
    # leaves its precision undefined. Score it as 0 explicitly rather than
    # relying on the warn-and-substitute default.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_ids, predicted_ids, average='weighted', zero_division=0
    )
    acc = accuracy_score(true_ids, predicted_ids)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# ----------------- TRAINING -----------------
import inspect

# Newer transformers releases name this option `eval_strategy` and no longer
# accept the older `evaluation_strategy` spelling; older releases only know
# the latter. Use whichever keyword the installed TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=5,
    log_level="error",
    **{_eval_strategy_key: "epoch"},  # run evaluation once per epoch
)

# Fine-tune on the train split and score on the test split with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics
)

trainer.train()

# ----------------- PREDICTION -----------------
new_sentences = [
    "I am learning neural networks.",
    "It's raining heavily today."
]

# Make sure dropout is off for inference, whatever mode training left the model in.
model.eval()

new_encodings = tokenizer(new_sentences, padding=True, truncation=True, return_tensors='pt', max_length=64)
# Training may have moved the model to an accelerator; the inputs must live on
# the same device as the model or the forward pass raises a device-mismatch error.
new_encodings = new_encodings.to(model.device)

with torch.no_grad():
    outputs = model(**new_encodings)
    # Bring the class ids back to the CPU so they can be handed to NumPy/scikit-learn.
    predictions = torch.argmax(outputs.logits, dim=-1).cpu()

# Decode all class ids back to their string labels in one call.
predicted_labels = le.inverse_transform(predictions.numpy())
for sent, label in zip(new_sentences, predicted_labels):
    print(f"Sentence: '{sent}' -> Predicted Label: {label}")
