In [1]:
import pandas as pd
import re
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AdamW
from torch.utils.data import Dataset
from transformers import EarlyStoppingCallback

# Load the data
df = pd.read_csv("scrapped_data.csv")

# Strip whitespace from 'text' column and drop empty rows
df['text'] = df['text'].str.strip()
df = df[df['text'] != ""]

# Preprocess text function
def preprocess_text(text):
    if isinstance(text, list):
        text = ' '.join(text)
    processed_text = re.sub('[^a-zA-Z]', ' ', text)
    processed_text = processed_text.lower().strip()
    return processed_text

# Apply preprocessing
df['text'] = df['text'].apply(preprocess_text)

# VADER Sentiment Analysis (for label generation)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Create sentiment labels
def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 'positive'
    elif scores['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

df['opinion'] = df['text'].apply(get_sentiment)

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['opinion'], test_size=0.2)

# Load ClinicalBERT model and tokenizer
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Tokenize data with maximum length
max_length = 512
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=max_length)

# Convert labels to numerical format
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
train_labels = [label_map[label] for label in train_labels]
val_labels = [label_map[label] for label in val_labels]

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

# Define Trainer arguments
training_args = TrainingArguments(
    output_dir='./output_dir',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="steps",
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# Early stopping callback
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],
    optimizers=(AdamW(model.parameters(), lr=2e-5), None)
)

# Train the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)

# Save the trained model and tokenizer
model.save_pretrained('./output_dir')
tokenizer.save_pretrained('./output_dir')

# Save model state
torch.save(model.state_dict(), './output_dir/fine_tuned_model.pth')
print("Trained model saved at:", './output_dir')



config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
500,0.4103,0.676465


Evaluation results: {'eval_loss': 0.6764651536941528, 'eval_runtime': 13.6129, 'eval_samples_per_second': 27.547, 'eval_steps_per_second': 3.453, 'epoch': 3.0}
Trained model saved at: ./output_dir


In [14]:
import pandas as pd
import re
import torch
import math
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AdamW
from torch.utils.data import Dataset
from transformers import EarlyStoppingCallback
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import RandomOverSampler

# Load the data
df = pd.read_csv("scrapped_data.csv")

# Strip whitespace from 'text' column and drop empty rows
df['text'] = df['text'].str.strip()
# Drop rows where 'text' column is empty
df.drop(df[df['text'] == ""].index, inplace=True)

def preprocess_text(text):
    if isinstance(text, float) and math.isnan(text):
        return ""  # Return empty string for NaN values
    if isinstance(text, list):
        # Convert the list to a string
        text = ' '.join(text)
    # Remove non-alphabetic characters using regular expression
    processed_text = re.sub('[^a-zA-Z]', ' ', str(text))  # Convert to string before applying regex
    return processed_text

# Apply the preprocessing function to the 'text' column
df['text'] = df['text'].apply(preprocess_text)
#df.head()

# Lowercasing
df['text'] = df['text'].str.lower()
#df.head()

# VADER Sentiment Analysis (for label generation)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Create sentiment labels
def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 'positive'
    elif scores['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

df['opinion'] = df['text'].apply(get_sentiment)

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['opinion'], test_size=0.2, stratify=df['opinion'])

# Convert labels to numerical format
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
train_labels = [label_map[label] for label in train_labels]
val_labels = [label_map[label] for label in val_labels]

# Oversample the minority class in the training set
ros = RandomOverSampler(random_state=42)
train_texts_resampled, train_labels_resampled = ros.fit_resample(pd.DataFrame({'text': train_texts}), pd.DataFrame({'label': train_labels}))

# Convert back to lists
train_texts_resampled = train_texts_resampled['text'].tolist()
train_labels_resampled = train_labels_resampled['label'].tolist()

# Load ClinicalBERT model and tokenizer
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Tokenize data with maximum length
max_length = 512
train_encodings = tokenizer(train_texts_resampled, truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=max_length)

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = CustomDataset(train_encodings, train_labels_resampled)
val_dataset = CustomDataset(val_encodings, val_labels)

# Compute class weights
class_weights = compute_class_weight('balanced', classes=[0, 1, 2], y=train_labels_resampled)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Define Trainer arguments
training_args = TrainingArguments(
    output_dir='./output_dir',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    evaluation_strategy="steps",
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# Define custom Trainer class to handle class weights
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Early stopping callback
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Initialize Trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],
    optimizers=(AdamW(model.parameters(), lr=2e-5), None)
)

# Train the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)

# Save the trained model and tokenizer
model.save_pretrained('./output_dir')
tokenizer.save_pretrained('./output_dir')

# Save model state
torch.save(model.state_dict(), './output_dir/fine_tuned_model.pth')
print("Trained model saved at:", './output_dir')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
500,0.4083,0.846928
1000,0.1189,1.164355
1500,0.0291,1.354107
2000,0.0339,1.440637


Evaluation results: {'eval_loss': 0.8469275832176208, 'eval_runtime': 18.1105, 'eval_samples_per_second': 26.78, 'eval_steps_per_second': 3.368, 'epoch': 4.866180048661801}
Trained model saved at: ./output_dir


In [1]:
import pandas as pd
import re
import torch
import math
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AdamW
from torch.utils.data import Dataset
from transformers import EarlyStoppingCallback
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import RandomOverSampler

# Load the data
df = pd.read_csv("scrapped_data.csv")

# Strip whitespace from 'text' column and drop empty rows
df['text'] = df['text'].str.strip()
# Drop rows where 'text' column is empty
df.drop(df[df['text'] == ""].index, inplace=True)

def preprocess_text(text):
    if isinstance(text, float) and math.isnan(text):
        return ""  # Return empty string for NaN values
    if isinstance(text, list):
        # Convert the list to a string
        text = ' '.join(text)
    # Remove non-alphabetic characters using regular expression
    processed_text = re.sub('[^a-zA-Z]', ' ', str(text))  # Convert to string before applying regex
    return processed_text

# Apply the preprocessing function to the 'text' column
df['text'] = df['text'].apply(preprocess_text)
#df.head()

# Lowercasing
df['text'] = df['text'].str.lower()
#df.head()

# VADER Sentiment Analysis (for label generation)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Create sentiment labels
def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 'positive'
    elif scores['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

df['opinion'] = df['text'].apply(get_sentiment)

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['opinion'], test_size=0.2, stratify=df['opinion'])

# Convert labels to numerical format
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
train_labels = [label_map[label] for label in train_labels]
val_labels = [label_map[label] for label in val_labels]

# Oversample the minority class in the training set
ros = RandomOverSampler(random_state=42)
train_texts_resampled, train_labels_resampled = ros.fit_resample(pd.DataFrame({'text': train_texts}), pd.DataFrame({'label': train_labels}))

# Convert back to lists
train_texts_resampled = train_texts_resampled['text'].tolist()
train_labels_resampled = train_labels_resampled['label'].tolist()

# Load ClinicalBERT model and tokenizer
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Tokenize data with maximum length
max_length = 512
train_encodings = tokenizer(train_texts_resampled, truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=max_length)

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = CustomDataset(train_encodings, train_labels_resampled)
val_dataset = CustomDataset(val_encodings, val_labels)

# Compute class weights
class_weights = compute_class_weight('balanced', classes=[0, 1, 2], y=train_labels_resampled)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Define Trainer arguments
training_args = TrainingArguments(
    output_dir='./output_dir',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    evaluation_strategy="steps",
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    learning_rate=1e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# Define custom Trainer class to handle class weights
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Early stopping callback
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Initialize Trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],
    optimizers=(AdamW(model.parameters(), lr=1e-5), None)
)

# Train the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)

# Save the trained model and tokenizer
model.save_pretrained('./output_dir')
tokenizer.save_pretrained('./output_dir')

# Save model state
torch.save(model.state_dict(), './output_dir/fine_tuned_model.pth')
print("Trained model saved at:", './output_dir')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
500,0.6197,0.715538


Evaluation results: {'eval_loss': 0.7155382037162781, 'eval_runtime': 17.6042, 'eval_samples_per_second': 27.55, 'eval_steps_per_second': 3.465, 'epoch': 2.0}
Trained model saved at: ./output_dir


In [1]:
import pandas as pd
import re
import torch
import math
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AdamW
from torch.utils.data import Dataset
from transformers import EarlyStoppingCallback
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import RandomOverSampler

# Load the data
df = pd.read_csv("scrapped_data.csv")

# Strip whitespace from 'text' column and drop empty rows
df['text'] = df['text'].str.strip()
# Drop rows where 'text' column is empty
df.drop(df[df['text'] == ""].index, inplace=True)

def preprocess_text(text):
    if isinstance(text, float) and math.isnan(text):
        return ""  # Return empty string for NaN values
    if isinstance(text, list):
        # Convert the list to a string
        text = ' '.join(text)
    # Remove non-alphabetic characters using regular expression
    processed_text = re.sub('[^a-zA-Z]', ' ', str(text))  # Convert to string before applying regex
    return processed_text

# Apply the preprocessing function to the 'text' column
df['text'] = df['text'].apply(preprocess_text)
#df.head()

# Lowercasing
df['text'] = df['text'].str.lower()
#df.head()

# VADER Sentiment Analysis (for label generation)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Create sentiment labels
def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 'positive'
    elif scores['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

df['opinion'] = df['text'].apply(get_sentiment)

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['opinion'], test_size=0.2, stratify=df['opinion'])

# Convert labels to numerical format
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
train_labels = [label_map[label] for label in train_labels]
val_labels = [label_map[label] for label in val_labels]

# Oversample the minority class in the training set
ros = RandomOverSampler(random_state=42)
train_texts_resampled, train_labels_resampled = ros.fit_resample(pd.DataFrame({'text': train_texts}), pd.DataFrame({'label': train_labels}))

# Convert back to lists
train_texts_resampled = train_texts_resampled['text'].tolist()
train_labels_resampled = train_labels_resampled['label'].tolist()

# Load ClinicalBERT model and tokenizer
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Tokenize data with maximum length
max_length = 512
train_encodings = tokenizer(train_texts_resampled, truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=max_length)

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = CustomDataset(train_encodings, train_labels_resampled)
val_dataset = CustomDataset(val_encodings, val_labels)

# Compute class weights
class_weights = compute_class_weight('balanced', classes=[0, 1, 2], y=train_labels_resampled)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Define Trainer arguments
training_args = TrainingArguments(
    output_dir='./output_dir',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    evaluation_strategy="steps",
    save_steps=50,
    eval_steps=50,
    logging_steps=100,
    learning_rate=1e-6,
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# Define custom Trainer class to handle class weights
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Early stopping callback
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Initialize Trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],
    optimizers=(AdamW(model.parameters(), lr=1e-5), None)
)

# Train the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)

# Save the trained model and tokenizer
model.save_pretrained('./output_dir')
tokenizer.save_pretrained('./output_dir')

# Save model state
torch.save(model.state_dict(), './output_dir/fine_tuned_model.pth')
print("Trained model saved at:", './output_dir')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
50,No log,1.029385
100,1.057000,1.00264
150,1.057000,0.889357
200,0.904700,0.921962
250,0.904700,0.876397
300,0.817200,0.827315
350,0.817200,0.887509
400,0.765200,0.790888
450,0.765200,0.823006
500,0.658600,0.802351


Evaluation results: {'eval_loss': 0.7908878922462463, 'eval_runtime': 17.8232, 'eval_samples_per_second': 27.212, 'eval_steps_per_second': 3.422, 'epoch': 1.338199513381995}
Trained model saved at: ./output_dir


In [2]:
import pandas as pd
import re
import torch
import math
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, AdamW
from torch.utils.data import Dataset
from transformers import EarlyStoppingCallback
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import RandomOverSampler

# Load the data
df = pd.read_csv("scrapped_data.csv")

# Strip whitespace from 'text' column and drop empty rows
df['text'] = df['text'].str.strip()
# Drop rows where 'text' column is empty
df.drop(df[df['text'] == ""].index, inplace=True)

def preprocess_text(text):
    if isinstance(text, float) and math.isnan(text):
        return ""  # Return empty string for NaN values
    if isinstance(text, list):
        # Convert the list to a string
        text = ' '.join(text)
    # Remove non-alphabetic characters using regular expression
    processed_text = re.sub('[^a-zA-Z]', ' ', str(text))  # Convert to string before applying regex
    return processed_text

# Apply the preprocessing function to the 'text' column
df['text'] = df['text'].apply(preprocess_text)
#df.head()

# Lowercasing
df['text'] = df['text'].str.lower()
#df.head()

# VADER Sentiment Analysis (for label generation)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Create sentiment labels
def get_sentiment(text):
    scores = analyzer.polarity_scores(text)
    if scores['compound'] >= 0.05:
        return 'positive'
    elif scores['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'

df['opinion'] = df['text'].apply(get_sentiment)

# Split data into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['opinion'], test_size=0.2, stratify=df['opinion'])

# Convert labels to numerical format
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
train_labels = [label_map[label] for label in train_labels]
val_labels = [label_map[label] for label in val_labels]

# Oversample the minority class in the training set
ros = RandomOverSampler(random_state=42)
train_texts_resampled, train_labels_resampled = ros.fit_resample(pd.DataFrame({'text': train_texts}), pd.DataFrame({'label': train_labels}))

# Convert back to lists
train_texts_resampled = train_texts_resampled['text'].tolist()
train_labels_resampled = train_labels_resampled['label'].tolist()

# Load ClinicalBERT model and tokenizer
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Tokenize data with maximum length
max_length = 512
train_encodings = tokenizer(train_texts_resampled, truncation=True, padding=True, max_length=max_length)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True, max_length=max_length)

class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = CustomDataset(train_encodings, train_labels_resampled)
val_dataset = CustomDataset(val_encodings, val_labels)

# Compute class weights
class_weights = compute_class_weight('balanced', classes=[0, 1, 2], y=train_labels_resampled)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Define Trainer arguments
training_args = TrainingArguments(
    output_dir='./output_dir',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    evaluation_strategy="steps",
    save_steps=50,
    eval_steps=50,
    logging_steps=100,
    learning_rate=1e-5,
    weight_decay=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

# Define custom Trainer class to handle class weights
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Early stopping callback
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Initialize Trainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],
    optimizers=(AdamW(model.parameters(), lr=1e-5), None)
)

# Train the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()
print("Evaluation results:", eval_result)

# Save the trained model and tokenizer
model.save_pretrained('./output_dir')
tokenizer.save_pretrained('./output_dir')

# Save model state
torch.save(model.state_dict(), './output_dir/fine_tuned_model.pth')
print("Trained model saved at:", './output_dir')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
50,No log,0.94237
100,0.966500,0.990129
150,0.966500,0.979566
200,0.867100,0.971668


Evaluation results: {'eval_loss': 0.9423704743385315, 'eval_runtime': 17.6453, 'eval_samples_per_second': 27.486, 'eval_steps_per_second': 3.457, 'epoch': 0.48661800486618007}
Trained model saved at: ./output_dir
