In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
import transformers
import accelerate

In [4]:
# Global Variables
MODEL_NAME = "bert-base-multilingual-cased"
MAX_LENGTH = 128
TRAIN_TEST_SPLIT_RATIO = 0.1
RANDOM_STATE = 42

In [3]:
def load_and_preprocess_data(filepath, drop_na=True):
    """Loads data, checks for missing values, and optionally drops missing rows."""
    data = pd.read_csv(filepath, sep='\t')
    print(f"Dataset Statistics for {filepath}:")
    print(data['label'].value_counts(normalize=True))
    print(f"Missing Value Check for {filepath}:")
    print(data.isnull().sum())
    if drop_na:
        data = data.dropna()
    return data

In [5]:
def stratified_split(data, test_size=TRAIN_TEST_SPLIT_RATIO, random_state=RANDOM_STATE):
    """Splits the dataset into training and test sets with stratification."""
    train, test = train_test_split(
        data,
        test_size=test_size,
        stratify=data['label'],
        random_state=random_state
    )
    print("Class Distribution in Training Set:")
    print(train['label'].value_counts(normalize=True))
    print("Class Distribution in Test Set:")
    print(test['label'].value_counts(normalize=True))
    return train, test


In [6]:
def save_datasets(train, test, train_filepath, test_filepath):
    """Saves the training and test sets to CSV files."""
    train.to_csv(train_filepath, index=False)
    test.to_csv(test_filepath, index=False)
    print(f"Saved training set to {train_filepath}")
    print(f"Saved test set to {test_filepath}")


In [7]:
def prepare_datasets_for_training(train_data, test_data, tokenizer):
    """Tokenizes the data and prepares it for training."""
    def tokenize_function(examples):
        return tokenizer(examples["text_en"], padding="max_length", truncation=True, max_length=MAX_LENGTH)

    train_dataset = Dataset.from_pandas(train_data).map(tokenize_function, batched=True)
    test_dataset = Dataset.from_pandas(test_data).map(tokenize_function, batched=True)

    train_dataset = train_dataset.remove_columns(["id", "speaker", "sex", "text", "text_en"])
    test_dataset = test_dataset.remove_columns(["id", "speaker", "sex", "text", "text_en"])

    train_dataset = train_dataset.rename_column("label", "labels")
    test_dataset = test_dataset.rename_column("label", "labels")

    train_dataset.set_format("torch")
    test_dataset.set_format("torch")

    return train_dataset, test_dataset


In [8]:
def train_model(train_dataset, test_dataset, tokenizer):
    """Trains a BERT model and evaluates it."""
    model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

    training_args = TrainingArguments(
        output_dir="./results_task2",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=2,
        load_best_model_at_end=True,
        logging_dir='./logs',
        logging_steps=50
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer
    )

    trainer.train()
    results = trainer.evaluate()
    print("Evaluation Results:", results)
    return model, trainer

In [9]:
def evaluate_model(trainer, test_dataset):
    """Calculates accuracy and F1 score of the model."""
    predictions = trainer.predict(test_dataset)
    preds = predictions.predictions.argmax(axis=-1)
    labels = predictions.label_ids

    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")

    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    return accuracy, f1

In [10]:
# Load dataset
power_data = load_and_preprocess_data("power-tr-train.tsv")

Dataset Statistics for power-tr-train.tsv:
label
1    0.513806
0    0.486194
Name: proportion, dtype: float64
Missing Value Check for power-tr-train.tsv:
id         0
speaker    0
sex        0
text       0
text_en    0
label      0
dtype: int64


In [11]:
# Split dataset
power_train, power_test = stratified_split(power_data)


Class Distribution in Training Set:
label
1    0.513774
0    0.486226
Name: proportion, dtype: float64
Class Distribution in Test Set:
label
1    0.514089
0    0.485911
Name: proportion, dtype: float64


In [12]:
# Save processed datasets
save_datasets(power_train, power_test, "power_train_processed.csv", "power_test_processed.csv")


Saved training set to power_train_processed.csv
Saved test set to power_test_processed.csv


In [13]:
# Tokenizer setup
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [14]:
# Prepare datasets for training
power_train_dataset, power_test_dataset = prepare_datasets_for_training(power_train, power_test, tokenizer)


Map:   0%|          | 0/15645 [00:00<?, ? examples/s]

Map:   0%|          | 0/1739 [00:00<?, ? examples/s]

In [15]:
# Train and evaluate the model
model, trainer = train_model(power_train_dataset, power_test_dataset, tokenizer)
evaluate_model(trainer, power_test_dataset)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


  0%|          | 0/2934 [00:00<?, ?it/s]

{'loss': 0.6978, 'grad_norm': 1.3391704559326172, 'learning_rate': 1.9659168370824816e-05, 'epoch': 0.05}
{'loss': 0.6521, 'grad_norm': 4.183454990386963, 'learning_rate': 1.9318336741649627e-05, 'epoch': 0.1}
{'loss': 0.6196, 'grad_norm': 3.2177321910858154, 'learning_rate': 1.8977505112474438e-05, 'epoch': 0.15}
{'loss': 0.5913, 'grad_norm': 5.108887195587158, 'learning_rate': 1.8636673483299253e-05, 'epoch': 0.2}
{'loss': 0.5671, 'grad_norm': 10.113988876342773, 'learning_rate': 1.8295841854124064e-05, 'epoch': 0.26}
{'loss': 0.5312, 'grad_norm': 7.7452616691589355, 'learning_rate': 1.795501022494888e-05, 'epoch': 0.31}
{'loss': 0.5617, 'grad_norm': 6.042710304260254, 'learning_rate': 1.761417859577369e-05, 'epoch': 0.36}
{'loss': 0.4899, 'grad_norm': 5.283132076263428, 'learning_rate': 1.72733469665985e-05, 'epoch': 0.41}
{'loss': 0.5376, 'grad_norm': 7.643433094024658, 'learning_rate': 1.6932515337423315e-05, 'epoch': 0.46}
{'loss': 0.4834, 'grad_norm': 8.389946937561035, 'learnin

  0%|          | 0/109 [00:00<?, ?it/s]

{'eval_loss': 0.4340490400791168, 'eval_runtime': 14.8085, 'eval_samples_per_second': 117.433, 'eval_steps_per_second': 7.361, 'epoch': 1.0}
{'loss': 0.4283, 'grad_norm': 6.718875408172607, 'learning_rate': 1.318336741649625e-05, 'epoch': 1.02}
{'loss': 0.3821, 'grad_norm': 15.446608543395996, 'learning_rate': 1.2842535787321065e-05, 'epoch': 1.07}
{'loss': 0.3745, 'grad_norm': 4.825876235961914, 'learning_rate': 1.2501704158145878e-05, 'epoch': 1.12}
{'loss': 0.3721, 'grad_norm': 11.287981986999512, 'learning_rate': 1.2160872528970689e-05, 'epoch': 1.18}
{'loss': 0.3524, 'grad_norm': 10.542778015136719, 'learning_rate': 1.1820040899795502e-05, 'epoch': 1.23}
{'loss': 0.3744, 'grad_norm': 10.010162353515625, 'learning_rate': 1.1479209270620315e-05, 'epoch': 1.28}
{'loss': 0.3746, 'grad_norm': 6.4129228591918945, 'learning_rate': 1.1138377641445126e-05, 'epoch': 1.33}
{'loss': 0.3448, 'grad_norm': 7.605463981628418, 'learning_rate': 1.079754601226994e-05, 'epoch': 1.38}
{'loss': 0.3424,

  0%|          | 0/109 [00:00<?, ?it/s]

{'eval_loss': 0.4357866942882538, 'eval_runtime': 14.7162, 'eval_samples_per_second': 118.169, 'eval_steps_per_second': 7.407, 'epoch': 2.0}
{'loss': 0.2945, 'grad_norm': 6.184322834014893, 'learning_rate': 6.366734832992503e-06, 'epoch': 2.04}
{'loss': 0.2377, 'grad_norm': 6.779289722442627, 'learning_rate': 6.025903203817315e-06, 'epoch': 2.1}
{'loss': 0.2449, 'grad_norm': 8.739617347717285, 'learning_rate': 5.685071574642127e-06, 'epoch': 2.15}
{'loss': 0.2649, 'grad_norm': 14.639701843261719, 'learning_rate': 5.3442399454669404e-06, 'epoch': 2.2}
{'loss': 0.2726, 'grad_norm': 7.447010040283203, 'learning_rate': 5.003408316291752e-06, 'epoch': 2.25}
{'loss': 0.2405, 'grad_norm': 10.573034286499023, 'learning_rate': 4.662576687116564e-06, 'epoch': 2.3}
{'loss': 0.2668, 'grad_norm': 9.638249397277832, 'learning_rate': 4.321745057941377e-06, 'epoch': 2.35}
{'loss': 0.239, 'grad_norm': 8.02473258972168, 'learning_rate': 3.98091342876619e-06, 'epoch': 2.4}
{'loss': 0.2478, 'grad_norm': 5

  0%|          | 0/109 [00:00<?, ?it/s]

{'eval_loss': 0.5161563754081726, 'eval_runtime': 14.8866, 'eval_samples_per_second': 116.816, 'eval_steps_per_second': 7.322, 'epoch': 3.0}
{'train_runtime': 8151.3009, 'train_samples_per_second': 5.758, 'train_steps_per_second': 0.36, 'train_loss': 0.3748231405454504, 'epoch': 3.0}


  0%|          | 0/109 [00:00<?, ?it/s]

Evaluation Results: {'eval_loss': 0.4340490400791168, 'eval_runtime': 15.1618, 'eval_samples_per_second': 114.696, 'eval_steps_per_second': 7.189, 'epoch': 3.0}


  0%|          | 0/109 [00:00<?, ?it/s]

Accuracy: 0.7918
F1 Score: 0.7913


(0.7918343875790684, 0.7912730486144101)

In [16]:
# Save the fine-tuned model
model.save_pretrained("./task2_multilingual_bert")
tokenizer.save_pretrained("./task2_multilingual_bert")
print("Model and tokenizer saved.")

Model and tokenizer saved.


In [17]:
# Load required libraries
import pandas as pd
from transformers import pipeline
from datasets import Dataset

# Load the test dataset for Task 2
power_test = pd.read_csv("power_test_processed.csv")

# Initialize the causal inference pipeline
causal_pipeline = pipeline(
    "text-generation",
    model="bigscience/bloomz-1b1",
    device=0  # Use GPU for inference
)

Device set to use cuda:0


In [18]:
# Limit input text to the first 500 characters to reduce memory usage
def limit_text_length(dataset, column_name="text_en"):
    return dataset.map(lambda x: {column_name: x[column_name][:500]})

# Generate prompts and perform inference in batches
def generate_texts(batch, column_name="text_en", prompt_template="Classify the text"):
    prompts = [prompt_template.format(text=text) for text in batch[column_name]]
    outputs = causal_pipeline(prompts, max_new_tokens=10)
    return {"predictions": [output[0]["generated_text"].strip() for output in outputs]}

# Process predictions into binary labels
def process_predictions(predictions):
    binary_results = []
    for pred in predictions:
        if "0" in pred:
            binary_results.append(0)
        elif "1" in pred:
            binary_results.append(1)
        else:
            binary_results.append(None)  # Model uncertain
    return binary_results

In [19]:
# Convert the dataset to HuggingFace Dataset format
turkish_test_dataset = Dataset.from_pandas(power_test)

# Limit text length for Turkish column
turkish_test_dataset = limit_text_length(turkish_test_dataset, column_name="text")

# Define the Turkish prompt
turkish_prompt = "Aşağıdaki meclis konuşmasına dayanarak, konuşmacının partisinin hükümette (0) ya da muhalefette (1) olduğunu belirtiniz:\n\n{text}\n\nCevap:"

# Perform inference
results_turkish = turkish_test_dataset.map(
    lambda batch: generate_texts(batch, column_name="text", prompt_template=turkish_prompt),
    batched=True,
    batch_size=8
)

# Process binary predictions for Turkish texts
binary_predictions_turkish = process_predictions(results_turkish["predictions"])
results_turkish = results_turkish.add_column("binary_predictions", binary_predictions_turkish)

# Calculate accuracy for Turkish predictions
correct_predictions_turkish = [
    binary == label for binary, label in zip(results_turkish["binary_predictions"], results_turkish["label"])
]
accuracy_turkish = sum(correct_predictions_turkish) / len(correct_predictions_turkish)

# Save results to CSV
results_turkish_df = results_turkish.to_pandas()
results_turkish_df.to_csv("task2_bloomz_turkish_results_binary.csv", index=False)

print(f"Zero-Shot Inference Accuracy (Turkish - Task 2): {accuracy_turkish:.2%}")


Map:   0%|          | 0/1739 [00:00<?, ? examples/s]

Map:   0%|          | 0/1739 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [20]:
# Convert the dataset to HuggingFace Dataset format
english_test_dataset = Dataset.from_pandas(power_test)

# Limit text length for English column
english_test_dataset = limit_text_length(english_test_dataset, column_name="text_en")

# Define the English prompt
english_prompt = "Based on the following parliamentary speech, classify whether the speaker's party is governing (0) or opposition (1):\n\n{text}\n\nAnswer:"

# Perform inference
results_english = english_test_dataset.map(
    lambda batch: generate_texts(batch, column_name="text_en", prompt_template=english_prompt),
    batched=True,
    batch_size=8
)

# Process binary predictions for English texts
binary_predictions_english = process_predictions(results_english["predictions"])
results_english = results_english.add_column("binary_predictions", binary_predictions_english)

# Calculate accuracy for English predictions
correct_predictions_english = [
    binary == label for binary, label in zip(results_english["binary_predictions"], results_english["label"])
]
accuracy_english = sum(correct_predictions_english) / len(correct_predictions_english)

# Save results to CSV
results_english_df = results_english.to_pandas()
results_english_df.to_csv("task2_bloomz_english_results_binary.csv", index=False)

print(f"Zero-Shot Inference Accuracy (English - Task 2): {accuracy_english:.2%}")


Map:   0%|          | 0/1739 [00:00<?, ? examples/s]



Map:   0%|          | 0/1739 [00:00<?, ? examples/s]

Map:   0%|          | 0/1614 [00:00<?, ? examples/s]

Zero-Shot Inference Accuracy (English): 41.82%
