# File for issue classification

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import pandas as pd

In [None]:
def load_data(file_path):
    """
    Load a CSV file and wrap its usable rows in a Hugging Face Dataset.

    Args:
        file_path: Location of a CSV file (anything pandas.read_csv accepts)
            that contains 'text' and 'label' columns.

    Returns:
        A datasets.Dataset built from the rows in which both 'text' and
        'label' are present. The pandas index is not carried over.

    Raises:
        KeyError: If the 'text' or 'label' column is missing from the file
            (raised by DataFrame.dropna).
    """
    data = pd.read_csv(file_path)  # Replace with the appropriate reader if not CSV
    data = data.dropna(subset=['text', 'label'])  # Drop rows with missing text or label
    # Dropping rows leaves gaps in the index; without preserve_index=False,
    # from_pandas would store that index as an extra '__index_level_0__'
    # column, so the schema would depend on whether any row was dropped.
    return Dataset.from_pandas(data, preserve_index=False)

In [None]:
def tokenize_data(dataset, tokenizer):
    """
    Run the tokenizer over the 'text' field of the dataset, batch by batch.

    Args:
        dataset: Object exposing a ``map(function, batched=...)`` method whose
            batches provide a 'text' entry.
        tokenizer: Callable applied to each batch of texts; it is invoked with
            padding="max_length" and truncation=True.

    Returns:
        Whatever ``dataset.map`` returns for the batched tokenization.
    """
    def _encode_batch(batch):
        # One call per batch: the tokenizer receives the whole list of texts.
        texts = batch['text']
        return tokenizer(texts, padding="max_length", truncation=True)

    encoded = dataset.map(_encode_batch, batched=True)
    return encoded

In [None]:
def load_model_and_tokenizer(model_name, num_labels=None):
    """
    Loads the tokenizer and sequence-classification model from Hugging Face.

    Args:
        model_name: Model identifier or path passed to ``from_pretrained``.
        num_labels: Number of output classes for the classification head.
            When omitted, a module-level ``num_labels`` variable is used, which
            keeps existing one-argument calls working.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        NameError: If ``num_labels`` is neither passed nor defined at module
            level.
    """
    if num_labels is None:
        # Backward compatibility: this function used to read a global
        # `num_labels` implicitly. Resolve it up front so a missing value is
        # reported before anything is loaded.
        try:
            num_labels = globals()["num_labels"]
        except KeyError:
            raise NameError(
                "num_labels is not defined: pass num_labels=... or define a "
                "module-level num_labels before calling load_model_and_tokenizer"
            ) from None

    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    return tokenizer, model

In [None]:
def train_model(tokenized_datasets, model, tokenizer, output_dir):
    """
    Fine-tune the model with the Hugging Face Trainer and return the trainer.

    Args:
        tokenized_datasets: Mapping with "train" and "test" entries; "train"
            is used for training and "test" for evaluation.
        model: Model instance handed to the Trainer.
        tokenizer: Tokenizer handed to the Trainer.
        output_dir: Directory for training outputs; logs go to
            ``<output_dir>/logs``.

    Returns:
        The Trainer instance after ``train()`` has completed.
    """
    # Optimisation settings.
    optimisation = dict(
        learning_rate=2e-5,
        weight_decay=0.01,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
    )
    # Evaluate and checkpoint once per epoch so the best checkpoint can be
    # restored when training ends.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # — confirm against the installed version.
    schedule = dict(
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )
    log_dir = f"{output_dir}/logs"

    run_config = TrainingArguments(
        output_dir=output_dir,
        logging_dir=log_dir,
        logging_steps=10,
        **optimisation,
        **schedule,
    )

    train_split = tokenized_datasets["train"]
    eval_split = tokenized_datasets["test"]
    fine_tuner = Trainer(
        model=model,
        args=run_config,
        train_dataset=train_split,
        eval_dataset=eval_split,
        tokenizer=tokenizer,
    )

    fine_tuner.train()
    return fine_tuner

In [None]:
# End-to-end driver: load the CSV, split it, tokenize, fine-tune, and save.
# NOTE(review): dataset_path, model_name and output_dir are read below but are
# not defined in any visible cell; load_model_and_tokenizer additionally reads
# a global num_labels. All of them must be set before this cell is run.

# Step 1: Load dataset
dataset = load_data(dataset_path)
# Hold out 20% of the rows; the result exposes "train" and "test" splits,
# which train_model uses for training and evaluation respectively.
dataset = dataset.train_test_split(test_size=0.2)

# Step 2: Load tokenizer and model
tokenizer, model = load_model_and_tokenizer(model_name)

# Step 3: Tokenize dataset
tokenized_datasets = tokenize_data(dataset, tokenizer)

# Step 4: Train the model
trainer = train_model(tokenized_datasets, model, tokenizer, output_dir)

# Save the fine-tuned model
trainer.save_model(output_dir)
print("Model training complete and saved.")