In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
import transformers
import accelerate

In [2]:
# Global Variables
MODEL_NAME = "bert-base-multilingual-cased"
MAX_LENGTH = 128
TRAIN_TEST_SPLIT_RATIO = 0.1
RANDOM_STATE = 42

In [3]:
def load_and_preprocess_data(filepath, drop_na=True):
    """Loads data, checks for missing values, and optionally drops missing rows."""
    data = pd.read_csv(filepath, sep='\t')
    print(f"Dataset Statistics for {filepath}:")
    print(data['label'].value_counts(normalize=True))
    print(f"Missing Value Check for {filepath}:")
    print(data.isnull().sum())
    if drop_na:
        data = data.dropna()
    return data

In [4]:
def stratified_split(data, test_size=TRAIN_TEST_SPLIT_RATIO, random_state=RANDOM_STATE):
    """Splits the dataset into training and test sets with stratification."""
    train, test = train_test_split(
        data,
        test_size=test_size,
        stratify=data['label'],
        random_state=random_state
    )
    print("Class Distribution in Training Set:")
    print(train['label'].value_counts(normalize=True))
    print("Class Distribution in Test Set:")
    print(test['label'].value_counts(normalize=True))
    return train, test


In [5]:
def save_datasets(train, test, train_filepath, test_filepath):
    """Saves the training and test sets to CSV files."""
    train.to_csv(train_filepath, index=False)
    test.to_csv(test_filepath, index=False)
    print(f"Saved training set to {train_filepath}")
    print(f"Saved test set to {test_filepath}")


In [6]:
def prepare_datasets_for_training(train_data, test_data, tokenizer):
    """Tokenizes the data and prepares it for training."""
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=MAX_LENGTH)

    train_dataset = Dataset.from_pandas(train_data).map(tokenize_function, batched=True)
    test_dataset = Dataset.from_pandas(test_data).map(tokenize_function, batched=True)

    train_dataset = train_dataset.remove_columns(["id", "speaker", "sex", "text", "text_en"])
    test_dataset = test_dataset.remove_columns(["id", "speaker", "sex", "text", "text_en"])

    train_dataset = train_dataset.rename_column("label", "labels")
    test_dataset = test_dataset.rename_column("label", "labels")

    train_dataset.set_format("torch")
    test_dataset.set_format("torch")

    return train_dataset, test_dataset


In [7]:
def train_model(train_dataset, test_dataset, tokenizer):
    """Trains a BERT model and evaluates it."""
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

    training_args = TrainingArguments(
        output_dir="./results_task1",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=2,
        load_best_model_at_end=True,
        logging_dir='./logs',
        logging_steps=50
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer
    )

    trainer.train()
    results = trainer.evaluate()
    print("Evaluation Results:", results)
    return model, trainer


In [8]:
def evaluate_model(trainer, test_dataset):
    """Calculates accuracy and F1 score of the model."""
    predictions = trainer.predict(test_dataset)
    preds = predictions.predictions.argmax(axis=-1)
    labels = predictions.label_ids

    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    return accuracy, f1


In [9]:
# Load dataset
orientation_data = load_and_preprocess_data("orientation-tr-train.tsv")

Dataset Statistics for orientation-tr-train.tsv:
label
1    0.581856
0    0.418144
Name: proportion, dtype: float64
Missing Value Check for orientation-tr-train.tsv:
id         0
speaker    0
sex        0
text       0
text_en    0
label      0
dtype: int64


In [10]:
# Split dataset
orientation_train, orientation_test = stratified_split(orientation_data)


Class Distribution in Training Set:
label
1    0.581865
0    0.418135
Name: proportion, dtype: float64
Class Distribution in Test Set:
label
1    0.581784
0    0.418216
Name: proportion, dtype: float64


In [11]:
# Save processed datasets
save_datasets(orientation_train, orientation_test, "orientation_train_processed.csv", "orientation_test_processed.csv")


Saved training set to orientation_train_processed.csv
Saved test set to orientation_test_processed.csv


In [12]:
# Tokenizer setup
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [13]:
# Prepare datasets for training
orientation_train_dataset, orientation_test_dataset = prepare_datasets_for_training(orientation_train, orientation_test, tokenizer)


Map:   0%|          | 0/14524 [00:00<?, ? examples/s]

Map:   0%|          | 0/1614 [00:00<?, ? examples/s]

In [14]:
# Train and evaluate the model
model, trainer = train_model(orientation_train_dataset, orientation_test_dataset, tokenizer)
evaluate_model(trainer, orientation_test_dataset)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/2724 [00:00<?, ?it/s]

{'loss': 0.6773, 'grad_norm': 1.002029299736023, 'learning_rate': 1.9632892804698974e-05, 'epoch': 0.06}
{'loss': 0.6604, 'grad_norm': 5.598144054412842, 'learning_rate': 1.9265785609397945e-05, 'epoch': 0.11}
{'loss': 0.6692, 'grad_norm': 3.279832601547241, 'learning_rate': 1.8898678414096917e-05, 'epoch': 0.17}
{'loss': 0.6804, 'grad_norm': 2.027076482772827, 'learning_rate': 1.853157121879589e-05, 'epoch': 0.22}
{'loss': 0.6462, 'grad_norm': 3.1041321754455566, 'learning_rate': 1.816446402349486e-05, 'epoch': 0.28}
{'loss': 0.6324, 'grad_norm': 3.934321165084839, 'learning_rate': 1.7797356828193833e-05, 'epoch': 0.33}
{'loss': 0.5761, 'grad_norm': 7.792826175689697, 'learning_rate': 1.7430249632892805e-05, 'epoch': 0.39}
{'loss': 0.6047, 'grad_norm': 3.529942274093628, 'learning_rate': 1.7063142437591777e-05, 'epoch': 0.44}
{'loss': 0.5381, 'grad_norm': 8.477499008178711, 'learning_rate': 1.6696035242290752e-05, 'epoch': 0.5}
{'loss': 0.5527, 'grad_norm': 8.729421615600586, 'learnin

  0%|          | 0/101 [00:00<?, ?it/s]

{'eval_loss': 0.5300309062004089, 'eval_runtime': 13.5065, 'eval_samples_per_second': 119.498, 'eval_steps_per_second': 7.478, 'epoch': 1.0}
{'loss': 0.4917, 'grad_norm': 7.45496129989624, 'learning_rate': 1.302496328928047e-05, 'epoch': 1.05}
{'loss': 0.4985, 'grad_norm': 6.349391460418701, 'learning_rate': 1.2657856093979443e-05, 'epoch': 1.1}
{'loss': 0.4859, 'grad_norm': 7.028117656707764, 'learning_rate': 1.2290748898678415e-05, 'epoch': 1.16}
{'loss': 0.4648, 'grad_norm': 6.146389961242676, 'learning_rate': 1.1923641703377387e-05, 'epoch': 1.21}
{'loss': 0.4128, 'grad_norm': 8.85926342010498, 'learning_rate': 1.155653450807636e-05, 'epoch': 1.27}
{'loss': 0.4867, 'grad_norm': 8.730701446533203, 'learning_rate': 1.1189427312775332e-05, 'epoch': 1.32}
{'loss': 0.4321, 'grad_norm': 11.654152870178223, 'learning_rate': 1.0822320117474303e-05, 'epoch': 1.38}
{'loss': 0.4368, 'grad_norm': 7.543253421783447, 'learning_rate': 1.0455212922173275e-05, 'epoch': 1.43}
{'loss': 0.4004, 'grad_

  0%|          | 0/101 [00:00<?, ?it/s]

{'eval_loss': 0.47719788551330566, 'eval_runtime': 13.5871, 'eval_samples_per_second': 118.789, 'eval_steps_per_second': 7.434, 'epoch': 2.0}
{'loss': 0.3583, 'grad_norm': 8.75990104675293, 'learning_rate': 6.417033773861968e-06, 'epoch': 2.04}
{'loss': 0.3399, 'grad_norm': 11.200645446777344, 'learning_rate': 6.049926578560941e-06, 'epoch': 2.09}
{'loss': 0.3754, 'grad_norm': 10.269316673278809, 'learning_rate': 5.682819383259912e-06, 'epoch': 2.15}
{'loss': 0.3442, 'grad_norm': 6.831853866577148, 'learning_rate': 5.3157121879588845e-06, 'epoch': 2.2}
{'loss': 0.3268, 'grad_norm': 6.2501702308654785, 'learning_rate': 4.9486049926578564e-06, 'epoch': 2.26}
{'loss': 0.3294, 'grad_norm': 20.18618392944336, 'learning_rate': 4.581497797356828e-06, 'epoch': 2.31}
{'loss': 0.3316, 'grad_norm': 2.5683176517486572, 'learning_rate': 4.2143906020558e-06, 'epoch': 2.37}
{'loss': 0.3499, 'grad_norm': 4.6725897789001465, 'learning_rate': 3.847283406754773e-06, 'epoch': 2.42}
{'loss': 0.3415, 'grad_

  0%|          | 0/101 [00:00<?, ?it/s]

{'eval_loss': 0.48130175471305847, 'eval_runtime': 13.583, 'eval_samples_per_second': 118.825, 'eval_steps_per_second': 7.436, 'epoch': 3.0}
{'train_runtime': 2099.6034, 'train_samples_per_second': 20.752, 'train_steps_per_second': 1.297, 'train_loss': 0.4575502054981898, 'epoch': 3.0}


  0%|          | 0/101 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.47719788551330566, 'eval_runtime': 13.7051, 'eval_samples_per_second': 117.766, 'eval_steps_per_second': 7.369, 'epoch': 3.0}


  0%|          | 0/101 [00:00<?, ?it/s]

Accuracy: 0.7862
F1 Score: 0.7851


(0.7862453531598513, 0.7851214744524638)

In [15]:
# Save the fine-tuned model
model.save_pretrained("./task1_multilingual_bert")
tokenizer.save_pretrained("./task1_multilingual_bert")
print("Model and tokenizer saved.")

Model and tokenizer saved.


In [16]:
# Load required libraries
import pandas as pd
from transformers import pipeline
from datasets import Dataset

# Load the test dataset
orientation_test = pd.read_csv("orientation_test_processed.csv")

# Initialize the causal inference pipeline
causal_pipeline = pipeline(
    "text-generation",
    model="bigscience/bloomz-1b1",
    device=0  # Use GPU for inference
)

Device set to use cuda:0


In [17]:
# Limit input text to the first 500 characters to reduce memory usage
def limit_text_length(dataset, column_name="text"):
    return dataset.map(lambda x: {column_name: x[column_name][:500]})

# Generate prompts and perform inference in batches
def generate_texts(batch, column_name="text", prompt_template="Classify the text"):
    prompts = [prompt_template.format(text=text) for text in batch[column_name]]
    outputs = causal_pipeline(prompts, max_new_tokens=10)
    return {"predictions": [output[0]["generated_text"].strip() for output in outputs]}

# Process predictions into binary labels
def process_predictions(predictions):
    binary_results = []
    for pred in predictions:
        if "0" in pred:
            binary_results.append(0)
        elif "1" in pred:
            binary_results.append(1)
        else:
            binary_results.append(None)  # Model uncertain
    return binary_results


In [18]:
# Convert the dataset to HuggingFace Dataset format
turkish_test_dataset = Dataset.from_pandas(orientation_test)

# Limit text length for Turkish column
turkish_test_dataset = limit_text_length(turkish_test_dataset, column_name="text")

# Define the Turkish prompt
turkish_prompt = "Aşağıdaki meclis konuşmasına dayanarak, konuşmacının partisinin sol (0) ya da sağ (1) eğilimli olduğunu belirtiniz:\n\n{text}\n\nCevap:"

# Perform inference
results_turkish = turkish_test_dataset.map(
    lambda batch: generate_texts(batch, column_name="text", prompt_template=turkish_prompt),
    batched=True,
    batch_size=8
)

# Process binary predictions for Turkish texts
binary_predictions_turkish = process_predictions(results_turkish["predictions"])
results_turkish = results_turkish.add_column("binary_predictions", binary_predictions_turkish)

# Calculate accuracy for Turkish predictions
correct_predictions_turkish = [
    binary == label for binary, label in zip(results_turkish["binary_predictions"], results_turkish["label"])
]
accuracy_turkish = sum(correct_predictions_turkish) / len(correct_predictions_turkish)

# Save results to CSV
results_turkish_df = results_turkish.to_pandas()
results_turkish_df.to_csv("task1_bloomz_turkish_results_binary.csv", index=False)

print(f"Zero-Shot Inference Accuracy (Turkish): {accuracy_turkish:.2%}")


Map:   0%|          | 0/1614 [00:00<?, ? examples/s]

Map:   0%|          | 0/1614 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Zero-Shot Inference Accuracy (Turkish): 41.82%


In [19]:
# Convert the dataset to HuggingFace Dataset format
english_test_dataset = Dataset.from_pandas(orientation_test)

# Limit text length for English column
english_test_dataset = limit_text_length(english_test_dataset, column_name="text_en")

# Define the English prompt
english_prompt = "Based on the following parliamentary speech, classify whether the speaker's party leans left (0) or right (1):\n\n{text}\n\nAnswer:"

# Perform inference
results_english = english_test_dataset.map(
    lambda batch: generate_texts(batch, column_name="text_en", prompt_template=english_prompt),
    batched=True,
    batch_size=8
)

# Process binary predictions for English texts
binary_predictions_english = process_predictions(results_english["predictions"])
results_english = results_english.add_column("binary_predictions", binary_predictions_english)

# Calculate accuracy for English predictions
correct_predictions_english = [
    binary == label for binary, label in zip(results_english["binary_predictions"], results_english["label"])
]
accuracy_english = sum(correct_predictions_english) / len(correct_predictions_english)

# Save results to CSV
results_english_df = results_english.to_pandas()
results_english_df.to_csv("task1_bloomz_english_results_binary.csv", index=False)

print(f"Zero-Shot Inference Accuracy (English): {accuracy_english:.2%}")


Map:   0%|          | 0/1614 [00:00<?, ? examples/s]

Map:   0%|          | 0/1614 [00:00<?, ? examples/s]

Zero-Shot Inference Accuracy (English): 41.82%
