### Multilingual Causal Language Model Inference

In [1]:
# Install required libraries
!pip install transformers datasets -q

In [2]:
# Import necessary libraries
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sklearn.metrics import classification_report

In [3]:
# Mount Google Drive (needed for the default dataset paths below, which live under /content/drive; uncomment when running on Colab)
#from google.colab import drive
#drive.mount('/content/drive')

In [4]:
# Training files (tab-separated) for the two classification tasks.
# Both are expected on Google Drive, mounted at /content/drive.
power_dataset_path = '/content/drive/MyDrive/Colab Notebooks/463/TrDataset/power-tr-train.tsv'
ideology_dataset_path = '/content/drive/MyDrive/Colab Notebooks/463/TrDataset/orientation-tr-train.tsv'

In [5]:
# Read both tab-separated training files into pandas DataFrames
tsv_options = {"sep": '\t'}
ideology_df = pd.read_csv(ideology_dataset_path, **tsv_options)
power_df = pd.read_csv(power_dataset_path, **tsv_options)

In [6]:
# Report the number of null entries per column for each DataFrame
for heading, frame in (
    ("Missing values in ideology dataset:", ideology_df),
    ("\nMissing values in power dataset:", power_df),
):
    print(heading)
    print(frame.isnull().sum())

Missing values in ideology dataset:
id         0
speaker    0
sex        0
text       0
text_en    0
label      0
dtype: int64

Missing values in power dataset:
id         0
speaker    0
sex        0
text       0
text_en    0
label      0
dtype: int64


In [7]:
# Wrap each DataFrame in a Hugging Face Dataset (same columns, same row order)
ideology_dataset, power_dataset = (
    Dataset.from_pandas(frame) for frame in (ideology_df, power_df)
)

In [8]:
# Peek at the data: the first ideology record and the first rows of the power frame
print("Sample data from ideology dataset:", ideology_dataset[0], sep="\n")
print("\nSample data from power dataset:", power_df.head(), sep="\n")

Sample data from ideology dataset:
{'id': 'tr00000', 'speaker': 'ca2031caa4032c51980160359953d507', 'sex': 'M', 'text': 'Yeni yasama döneminin ülkemiz için, milletimiz için ve hepimiz için hayırlı ve uğurlu olmasını, başarılarla dolu yeni bir dönem olmasını temenni ediyorum. <p> Bir siyaset adamı, bir fâni için son derece şerefli bir görev olan yüce Meclisin Başkanlığına beni seçmiş olmanızdan dolayı sizlere şükranlarımı sunuyorum. Bu benim için çok büyük bir mazhariyettir ve o derece de mesuliyetli bir görevdir. Bütün gücümle güveninize ve teveccühünüze layık olmaya çalışacağım. Allah beni size ve milletimize mahcup etmesin. <p> Sayın Başkan, saygıdeğer milletvekilleri; hepimiz biliyor ve inanıyoruz ki bu Meclis büyük bir Meclistir çünkü büyük milletimizin Meclisidir. Bu Meclis gazi Meclistir ve bu sıfatı fazlasıyla hak eden dünyadaki tek Meclistir. Başta Gazi Mustafa Kemal Atatürk olmak üzere hepsini rahmetle ve şükranla andığımız ilk Mecliste görev yapanlar bir taraftan istiklal müc

In [9]:
# Both datasets are expected to expose the same columns; print them to confirm
for hf_dataset in (ideology_dataset, power_dataset):
    print(hf_dataset.column_names)

['id', 'speaker', 'sex', 'text', 'text_en', 'label']
['id', 'speaker', 'sex', 'text', 'text_en', 'label']


In [10]:
# Load the instruction-tuned Llama 3.1 8B checkpoint as a text-generation pipeline.
# Weights are loaded in bfloat16 and the pipeline is placed on the CUDA device.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
model_load_options = {"torch_dtype": torch.bfloat16}
pipe = pipeline(
    task="text-generation",
    model=model_id,
    device="cuda",
    model_kwargs=model_load_options,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda


In [11]:
def prompt(system_prompt,user_prompt):
    """Build a two-message chat: the system instruction followed by the user content.

    Returns a list of {"role", "content"} dicts, system message first.
    """
    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {"role": "user", "content": user_prompt}
    return [system_turn, user_turn]

In [12]:
# Chat for a single example: ideology instruction + English translation of the first speech
ideology_system_prompt = "You will be given a parliamentary speech. Determine the speaker's party ideology. Respond with one integer only, either '0' if the party leans left, or '1' if the party leans right."
first_speech_en = ideology_dataset[0]["text_en"]
messages = prompt(ideology_system_prompt, first_speech_en)

In [13]:
# One chat per example for the first ten ideology speeches (English translation)
ideology_system_prompt = "You will be given a parliamentary speech. Determine the speaker's party ideology. Respond with one integer only, either '0' if the party leans left, or '1' if the party leans right."
messages = [
    prompt(ideology_system_prompt, ideology_dataset[i]["text_en"])
    for i in range(10)
]

In [14]:
# Classify the first ten prepared chats, one pipeline call per chat.
# do_sample=False requests greedy decoding instead of falling back to the model's
# default generation settings, matching the other pipeline calls in this notebook.
# pad_token_id is set explicitly to avoid the "Setting `pad_token_id`" notice.
outputs = [
    pipe(
        chat,
        max_new_tokens=1,
        pad_token_id=pipe.tokenizer.eos_token_id,
        do_sample=False,
    )
    for chat in messages[:10]
]

In [15]:

# Greedy, single-token generation for the first prepared chat
first_chat = messages[0]
output = pipe(first_chat, do_sample=False, max_new_tokens=1)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [16]:

# Model answer (content of the last chat message) followed by the gold label of the same example
predicted_answer = output[0]["generated_text"][-1]['content']
gold_label = ideology_dataset[0]["label"]
print(predicted_answer)
print(gold_label)

1
1


### Create a Smaller, Class-Balanced Dataset for Easier Evaluation

In [26]:
from datasets import concatenate_datasets
from collections import Counter

# Target number of examples to keep for each label
samples_per_class = 500

# Split by label and shuffle each split with a fixed seed
class_0_shuffled = ideology_dataset.filter(lambda x: x["label"] == 0).shuffle(seed=42)
class_1_shuffled = ideology_dataset.filter(lambda x: x["label"] == 1).shuffle(seed=42)

# Never request more rows than the smaller class holds: selecting indices past the
# end would fail, and capping both classes equally keeps the subset balanced.
ideology_rows_per_class = min(samples_per_class, len(class_0_shuffled), len(class_1_shuffled))
if ideology_rows_per_class < samples_per_class:
    print(
        f"Ideology dataset: only {ideology_rows_per_class} examples per class available "
        f"(requested {samples_per_class})."
    )
class_0 = class_0_shuffled.select(range(ideology_rows_per_class))
class_1 = class_1_shuffled.select(range(ideology_rows_per_class))

# Combine the two classes and shuffle so the labels are interleaved
balanced_ideology_dataset = concatenate_datasets([class_0, class_1]).shuffle(seed=42)

# Count label distribution
label_counts = Counter(balanced_ideology_dataset["label"])
print("Balanced Dataset Label Distribution:", label_counts)


Filter:   0%|          | 0/16138 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16138 [00:00<?, ? examples/s]

Balanced Dataset Label Distribution: Counter({1: 500, 0: 500})


In [27]:
# Target number of examples to keep for each label
samples_per_class = 500

# Split by label and shuffle each split with a fixed seed
class_0_shuffled = power_dataset.filter(lambda x: x["label"] == 0).shuffle(seed=42)
class_1_shuffled = power_dataset.filter(lambda x: x["label"] == 1).shuffle(seed=42)

# Never request more rows than the smaller class holds: selecting indices past the
# end would fail, and capping both classes equally keeps the subset balanced.
power_rows_per_class = min(samples_per_class, len(class_0_shuffled), len(class_1_shuffled))
if power_rows_per_class < samples_per_class:
    print(
        f"Power dataset: only {power_rows_per_class} examples per class available "
        f"(requested {samples_per_class})."
    )
class_0 = class_0_shuffled.select(range(power_rows_per_class))
class_1 = class_1_shuffled.select(range(power_rows_per_class))

# Combine the two classes and shuffle so the labels are interleaved
balanced_power_dataset = concatenate_datasets([class_0, class_1]).shuffle(seed=42)

# Count label distribution
label_counts = Counter(balanced_power_dataset["label"])
print("Balanced Dataset Label Distribution:", label_counts)

Filter:   0%|          | 0/17384 [00:00<?, ? examples/s]

Filter:   0%|          | 0/17384 [00:00<?, ? examples/s]

Balanced Dataset Label Distribution: Counter({1: 500, 0: 500})


### Evaluation

In [19]:
from tqdm.auto import tqdm

def _parse_binary_label(generated_text):
    """Map the model's reply to a class id: 0 or 1, or -1 for anything else."""
    try:
        value = int(generated_text.strip())
    except ValueError:
        return -1
    # A reply that parses as an integer but is not a valid class (e.g. "7" or "10")
    # is treated like any other invalid answer instead of becoming a spurious class.
    return value if value in (0, 1) else -1


def perform_inference(pipe, dataset, task, text_column):
    """
    Classify every example of `dataset` with the chat pipeline, showing a progress bar.

    Args:
        pipe: Text-generation pipeline. It is called once per example with a
            system + user chat and must expose `pipe.tokenizer.eos_token_id`.
        dataset: Sized, iterable collection of examples. Each example must provide
            `text_column`, and `dataset["label"]` must yield the gold labels.
        task: "ideology" (0 = left, 1 = right) or "power" (0 = governing,
            1 = opposing); selects the system prompt.
        text_column: Name of the column whose text is sent as the user message.

    Returns:
        A `(predictions, true_labels)` pair of lists in dataset order. A prediction
        is 0 or 1, or -1 when the reply was not one of those two integers.

    Raises:
        ValueError: If `task` is neither "ideology" nor "power".
    """
    if task == "ideology":
        system_prompt = (
            "You will be given a parliamentary speech. Determine whether the speaker's party leans left or right ideology."
            " Respond with only one integer: either '0' if the party leans left, or '1' if the party leans right. Do not include any other text."
        )
    elif task == "power":
        system_prompt = (
            "You will be given a parliamentary speech. Determine whether the speaker's party is in power or opposition. "
            "Respond with one integer only, either '0' if the party is governing, or '1' if the party is opposing. Do not include any other text."
        )
    else:
        raise ValueError("Invalid task provided.")

    # Materialise the labels as a plain list so callers get the same type
    # regardless of what kind of column object the dataset returns.
    true_labels = list(dataset["label"])
    predictions = []

    # Loop-invariant generation settings: a single new token, greedy decoding.
    generation_kwargs = {
        "max_new_tokens": 1,
        "pad_token_id": pipe.tokenizer.eos_token_id,
        "do_sample": False,
    }

    # Use tqdm for progress tracking
    for example in tqdm(dataset, desc=f"Processing {task} for {text_column}", total=len(dataset)):
        messages = prompt(system_prompt, example[text_column])
        output = pipe(messages, **generation_kwargs)
        # The pipeline returns the whole chat; the last message is the model's reply.
        generated_text = output[0]["generated_text"][-1]['content']
        predictions.append(_parse_binary_label(generated_text))

    return predictions, true_labels


In [28]:
# Perform inference for all tasks and text types
results = {}

#for task, dataset in [("ideology", ideology_dataset.select(range(10))), ("power", power_dataset.select(range(10)))]:
for task, dataset in [("ideology", balanced_ideology_dataset), ("power", balanced_power_dataset)]:
    for text_type in ["text", "text_en"]:
        key = f"{task}_{text_type}"
        predictions, true_labels = perform_inference(pipe, dataset, task, text_type)
        results[key] = {"predictions": predictions, "true_labels": true_labels}


Processing ideology for text:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing ideology for text_en:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing power for text:   0%|          | 0/1000 [00:00<?, ?it/s]

Processing power for text_en:   0%|          | 0/1000 [00:00<?, ?it/s]

In [21]:
# Show the collected predictions and gold labels, keyed by "<task>_<text column>"
results

{'ideology_text': {'predictions': [0,
   0,
   1,
   0,
   0,
   0,
   1,
   0,
   0,
   1,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   1,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   1,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0],
  'true_labels': [1,
   0,
   1,
   0,
   0,
   0,
   1,
   0,
   1,
   1,
   0,
   0,
   0,
   1,
   1,
   1,
   1,
   1,
   0,
   1,
   1,
   0,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   1,
   0,
   0,
   1,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   1,
   0,
   1,
   0

In [29]:
# Print a classification report (per-class precision / recall / F1) for each run
for key in results:
    result = results[key]
    gold, predicted = result["true_labels"], result["predictions"]
    print(f"\nEvaluation for {key}:")
    print(classification_report(gold, predicted))



Evaluation for ideology_text:
              precision    recall  f1-score   support

           0       0.56      0.90      0.69       500
           1       0.75      0.30      0.43       500

    accuracy                           0.60      1000
   macro avg       0.66      0.60      0.56      1000
weighted avg       0.66      0.60      0.56      1000


Evaluation for ideology_text_en:
              precision    recall  f1-score   support

           0       0.67      0.74      0.70       500
           1       0.71      0.63      0.67       500

    accuracy                           0.69      1000
   macro avg       0.69      0.69      0.68      1000
weighted avg       0.69      0.69      0.68      1000


Evaluation for power_text:
              precision    recall  f1-score   support

           0       0.87      0.62      0.72       500
           1       0.70      0.90      0.79       500

    accuracy                           0.76      1000
   macro avg       0.78      0.76  