In [1]:
!pip install sagemaker transformers datasets boto3 --upgrade



In [2]:
!pip install torch tiktoken --upgrade



In [3]:
!pip install transformers tokenizers sentencepiece --upgrade



In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "microsoft/deberta-v3-base"

# Force using the slow tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Load model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

print("Model and tokenizer loaded successfully!")


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer loaded successfully!


In [5]:
import sagemaker

# Get SageMaker's default S3 bucket
session = sagemaker.Session()
s3_bucket = session.default_bucket()  # SageMaker auto-created bucket

print(f"Your S3 Bucket: {s3_bucket}")




sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


Your S3 Bucket: sagemaker-us-east-1-198739141498


In [6]:
import boto3

s3_client = boto3.client("s3")
s3_path = f"preprocessed_data_level1.csv"  # Adjust if your path is different

# List files in S3
response = s3_client.list_objects_v2(Bucket=s3_bucket, Prefix=s3_path)

# Print found files
for obj in response.get("Contents", []):
    print("Found:", obj["Key"])


Found: preprocessed_data_level1.csv


In [7]:
import boto3

s3_script_path = f"scripts/train.py"
s3_client.upload_file("train.py", s3_bucket, s3_script_path)

print(f"Training script uploaded to s3://{s3_bucket}/{s3_script_path}")


Training script uploaded to s3://sagemaker-us-east-1-198739141498/scripts/train.py


In [8]:
!pip install transformers datasets torch



In [9]:
import boto3

s3_client = boto3.client("s3")

# List all files in your bucket
response = s3_client.list_objects_v2(Bucket=s3_bucket)

if "Contents" in response:
    for obj in response["Contents"]:
        print("Found file:", obj["Key"])
else:
    print("No files found in the bucket.")


Found file: preprocessed_data_level1.csv
Found file: scripts/train.py


In [10]:
local_file_path = "train.csv"  # This will save the file in your current directory

s3_client.download_file(s3_bucket, "preprocessed_data_level1.csv", local_file_path)

print(f"Downloaded train.csv from s3://{s3_bucket}/classification/train.csv")

Downloaded train.csv from s3://sagemaker-us-east-1-198739141498/classification/train.csv


In [11]:
!pip install --upgrade accelerate transformers[torch] torch



In [17]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Load dataset
dataset = load_dataset("csv", data_files={"train": "train.csv"})["train"]

# Split into 80% train, 20% validation
dataset = dataset.train_test_split(test_size=0.2)

train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Define label mappings
label_list = ["Strom", "Sonstiges", "Gas"]
num_labels = len(label_list)
label_to_id = {label: i for i, label in enumerate(label_list)}

# Function to replace string labels with numerical IDs
def encode_labels(example):
    if "labels" not in example or example["labels"] is None:
        return example  # Skip rows with missing labels
    example["labels"] = label_to_id.get(example["labels"], -1)  # Default -1 if label is unknown
    return example

# Apply the function to both train and test sets
train_dataset = train_dataset.map(encode_labels)
eval_dataset = eval_dataset.map(encode_labels)

# Load model & tokenizer
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)


def remove_none_labels(example):
    return example["labels"] is not None  # Keep only non-None labels

print(len(train_dataset))
train_dataset = train_dataset.filter(remove_none_labels)
eval_dataset = eval_dataset.filter(remove_none_labels)
print(len(train_dataset))


# Tokenize datasets separately to preserve structure
def tokenize_function(examples):
    result = tokenizer(
        examples["text"],
        padding="max_length",   # Ensures all inputs are same length
        truncation=True,        # Truncate long texts
        max_length=197,         # Ensure fixed input size
    )
    
    # ✅ Preserve the labels in the output dataset
    if "labels" in examples:
        result["labels"] = examples["labels"]

    return result

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./model_output",
    evaluation_strategy="epoch",
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
)

# Custom loss function to handle missing labels
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")  # Extract labels
        outputs = model(**inputs)  # Forward pass
        logits = outputs.logits  # Get logits

        # Compute loss manually
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss


# Train model locally
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # ✅ Corrected
    eval_dataset=eval_dataset,    # ✅ Corrected
)

trainer.train()

# Save model
trainer.save_model("./trained_model")
tokenizer.save_pretrained("./trained_model")

print("Training completed!")


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


576
574


Epoch,Training Loss,Validation Loss
1,No log,0.941954
2,No log,0.91025
3,No log,0.86158


Training completed!


In [18]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Define model path (same as where you saved it)
model_path = "./trained_model"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): Dropout(p=0.1, inplace=False)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Layer

In [19]:
def predict(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    
    # Move to GPU if available
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # Run inference
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Convert logits to probabilities
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=-1).squeeze().tolist()
    
    # Get predicted label
    predicted_label = torch.argmax(logits, dim=-1).item()
    
    return predicted_label, probabilities

In [20]:
text_example = "Von: falk schuster < Email_ersetzt > Gesendet: Mittwoch,  Datum_ersetzt  07:55 An:  Email_ersetzt  Cc: Poststelle < Email_ersetzt > Betreff: Re: Neues Dokument im Postfach: Neue Strompreise ab  Datum_ersetzt , Zählernr.: 1ITR Nummer_ersetzt , Portal ID:  Nummer_ersetzt  Priorität: Hoch  Der angebotene Preis mit über 36 cent je kwh ist ein trauriges Angebot der wemag und ist weit entfernt von der aktuellen Situation am Strommarkt. Man kann nur hoffen, das die ORG hier eingreift und diese Preise reguliert werden. [cid: Email_ersetzt ] [cid: Email_ersetzt ] [cid: Email_ersetzt ]  Am  Datum_ersetzt  um 03:07 schrieb  Email_ersetzt <mailto: Email_ersetzt >: Lieber ORG-Kunde,  in Ihrem Postfach befindet sich ein neues Dokument. Das Dokument enthält eine Mitteilung zur Preisänderung zum  Datum_ersetzt . Melden Sie sich gleich mit Ihrer E-Mail-Adresse und Passwort unter  Link_ersetzt  an, um Ihre Mitteilung abzurufen.  Haben Sie Ihr Passwort vergessen? Dann fordern Sie unter  Link_ersetzt / ganz einfach ein neues Passwort an.  Sie möchten Ihren Abschlag anpassen, uns Ihren Zählerstand mitteilen oder Ihre Bankverbindung ändern? Oder Sie möchten einen weiteren Strom- oder Gasvertrag anlegen? Das können Sie selbst ganz schnell mit nur wenigen Klicks in Ihrem Kundenportal über die Selfservice-Funktionen erledigen. Auch ein Umzug ist im Kundenportal kein Problem. Probieren Sie es aus!   Freundliche Grüße  Ihr Service-Team der ORG  Webseite:  Link_ersetzt > E-Mail:  Email_ersetzt <mailto: Email_ersetzt > Hausadresse:  Straße_ersetzt ,  Nummer_ersetzt  LOC Impressum:  Link_ersetzt."
predicted_label, probabilities = predict(text_example)

print("Predicted Label:", predicted_label)
print("Confidence Scores:", probabilities)

Predicted Label: 0
Confidence Scores: [0.7387288808822632, 0.21501390635967255, 0.046257227659225464]


In [23]:
import boto3
import os

# Define S3 bucket & model path
s3_bucket_name = "sagemaker-us-east-1-198739141498"
s3_folder = "deberta-finetuned-model/"  # S3 folder (adjust as needed)
local_model_path = "./trained_model/"  # Local model folder

# Create S3 client
s3 = boto3.client("s3")

# Upload all files in the directory
for root, dirs, files in os.walk(local_model_path):
    for file in files:
        local_file_path = os.path.join(root, file)
        s3_key = s3_folder + file  # S3 key (path in S3)
        
        # Upload file to S3
        s3.upload_file(local_file_path, s3_bucket_name, s3_key)
        print(f"Uploaded {file} to s3://{s3_bucket_name}/{s3_key}")

print("✅ Upload completed!")

Uploaded added_tokens.json to s3://sagemaker-us-east-1-198739141498/deberta-finetuned-model/added_tokens.json
Uploaded special_tokens_map.json to s3://sagemaker-us-east-1-198739141498/deberta-finetuned-model/special_tokens_map.json
Uploaded spm.model to s3://sagemaker-us-east-1-198739141498/deberta-finetuned-model/spm.model
Uploaded tokenizer_config.json to s3://sagemaker-us-east-1-198739141498/deberta-finetuned-model/tokenizer_config.json
Uploaded training_args.bin to s3://sagemaker-us-east-1-198739141498/deberta-finetuned-model/training_args.bin
Uploaded config.json to s3://sagemaker-us-east-1-198739141498/deberta-finetuned-model/config.json
Uploaded model.safetensors to s3://sagemaker-us-east-1-198739141498/deberta-finetuned-model/model.safetensors
Uploaded tokenizer.json to s3://sagemaker-us-east-1-198739141498/deberta-finetuned-model/tokenizer.json
✅ Upload completed!
