In [35]:
pip install transformers datasets torch



In [36]:
import os
import pandas as pd
import torch
from transformers import BartTokenizer, BartForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Disable wandb
os.environ["WANDB_DISABLED"] = "true"



In [37]:
os.environ['CUDA_LAUNCH_BLOCKING']="1"
os.environ['TORCH_USE_CUDA_DSA'] = "1"

In [38]:
# Load the dataset
df = pd.read_csv("/content/df_file.csv")  # Replace with your file path

In [39]:
df.head()

Unnamed: 0,Text,Label
0,Budget to set scene for election\n \n Gordon B...,0
1,Army chiefs in regiments decision\n \n Militar...,0
2,Howard denies split over ID cards\n \n Michael...,0
3,Observers to monitor UK election\n \n Minister...,0
4,Kilroy names election seat target\n \n Ex-chat...,0


In [40]:
# Create the mapping dictionary
label_mapping = {
    0: "Politics",
    1: "Sport",
    2: "Technology",
    3: "Entertainment",
    4: "Business"
}

# Rename the labels using the mapping
df['Label'] = df['Label'].map(label_mapping)

df.head()

Unnamed: 0,Text,Label
0,Budget to set scene for election\n \n Gordon B...,Politics
1,Army chiefs in regiments decision\n \n Militar...,Politics
2,Howard denies split over ID cards\n \n Michael...,Politics
3,Observers to monitor UK election\n \n Minister...,Politics
4,Kilroy names election seat target\n \n Ex-chat...,Politics


In [41]:

df["Label"].values

array(['Politics', 'Politics', 'Politics', ..., 'Business', 'Business',
       'Business'], dtype=object)

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    2225 non-null   object
 1   Label   2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


In [43]:
# Split dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['Text'], df['Label'], test_size=0.2, random_state=42
)

In [44]:
# Tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
train_encodings = tokenizer(list(train_texts), truncation=True, padding="max_length", max_length=1024)
val_encodings = tokenizer(list(val_texts), truncation=True, padding="max_length", max_length=1024)

In [45]:
# Encode labels
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
val_labels = label_encoder.transform(val_labels)

In [46]:
# Dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item


In [47]:
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

In [48]:
# Model
model = BartForSequenceClassification.from_pretrained(
    "facebook/bart-base",
    num_labels=len(label_encoder.classes_),
    ignore_mismatched_sizes=True
)


Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=5,  # Use 1 epoch for debugging
    weight_decay=0.01,
    logging_dir='./logs',
    fp16=False  # Disable mixed precision for CPU
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

# Train
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.071499
2,No log,0.074401
3,No log,0.074363
4,0.874200,0.072924


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}


TrainOutput(global_step=555, training_loss=0.7943000131899172, metrics={'train_runtime': 2979.8211, 'train_samples_per_second': 2.987, 'train_steps_per_second': 0.186, 'total_flos': 5417448814411776.0, 'train_loss': 0.7943000131899172, 'epoch': 4.961797752808989})

In [50]:
model.save_pretrained('./fine_tuned_bart')
tokenizer.save_pretrained('./fine_tuned_bart')


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}


('./fine_tuned_bart/tokenizer_config.json',
 './fine_tuned_bart/special_tokens_map.json',
 './fine_tuned_bart/vocab.json',
 './fine_tuned_bart/merges.txt',
 './fine_tuned_bart/added_tokens.json')

In [51]:
from transformers import pipeline

# Load fine-tuned model
zero_shot_pipeline = pipeline("zero-shot-classification", model="./fine_tuned_bart")

# Example prediction
result = zero_shot_pipeline("the president will win!", candidate_labels=label_encoder.classes_)
print(result)


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1', '2': 'LABEL_2', '3': 'LABEL_3', '4': 'LABEL_4'}. The number of labels wil be overwritten to 5.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1', '2': 'LABEL_2', '3': 'LABEL_3', '4': 'LABEL_4'}. The number of labels wil be overwritten to 5.
Device set to use cuda:0
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'sequence': 'the president will win!', 'labels': ['Technology', 'Business', 'Entertainment', 'Politics', 'Sport'], 'scores': [0.5973303914070129, 0.12241818755865097, 0.12136576324701309, 0.08899760246276855, 0.0698881521821022]}


In [52]:
# Example prediction
result = zero_shot_pipeline("AI will win the fight against disease", candidate_labels=label_encoder.classes_)
print(result)


{'sequence': 'AI will win the fight against disease', 'labels': ['Technology', 'Entertainment', 'Business', 'Sport', 'Politics'], 'scores': [0.29949334263801575, 0.2257085144519806, 0.19850040972232819, 0.15404586493968964, 0.12225181609392166]}


In [53]:
text=' A US television network will screen a celebrity TV special to benefit the tsunami relief effort in South Asia.  NBC will encourage viewer donations during an hour-long show featuring musical performances on 15 January. Actress Sandra Bullock has donated $1m (Ã‚Â£525,000) to The American Red Cross and actor Leonardo DiCaprio pledged a "sizable" aid contribution to Unicef. Meanwhile 70 Hong Kong music and movie stars re-recorded We Are the World in Mandarin and Cantonese to raise funds. '

In [54]:
# Example prediction
result = zero_shot_pipeline(text, candidate_labels=label_encoder.classes_)
print(result)


{'sequence': ' A US television network will screen a celebrity TV special to benefit the tsunami relief effort in South Asia.  NBC will encourage viewer donations during an hour-long show featuring musical performances on 15 January. Actress Sandra Bullock has donated $1m (Ã‚Â£525,000) to The American Red Cross and actor Leonardo DiCaprio pledged a "sizable" aid contribution to Unicef. Meanwhile 70 Hong Kong music and movie stars re-recorded We Are the World in Mandarin and Cantonese to raise funds. ', 'labels': ['Technology', 'Politics', 'Business', 'Sport', 'Entertainment'], 'scores': [0.22928592562675476, 0.19950509071350098, 0.19780324399471283, 0.19117580354213715, 0.18223001062870026]}


In [55]:
# Evaluate the model on the validation dataset
evaluation_results = trainer.evaluate()

# Print evaluation metrics
print("Evaluation Results:")
for key, value in evaluation_results.items():
    print(f"{key}: {value:.4f}")

Evaluation Results:
eval_loss: 0.0729
eval_runtime: 39.6055
eval_samples_per_second: 11.2360
eval_steps_per_second: 5.6310
epoch: 4.9618


In [56]:

val_dataframe = df.sample(n=100, random_state=42)  # random_state for reproducibility

val_dataframe.head()

Unnamed: 0,Text,Label
414,David Blunkett in quotes\n \n David Blunkett -...,Politics
420,Benitez issues warning to Gerrard\n \n Liverpo...,Sport
1644,Brookside creator's Channel 4 bid\n \n The cre...,Entertainment
416,Brown visits slum on Africa trip\n \n Chancell...,Politics
1232,Gritty return for Prince of Persia\n \n Still ...,Technology


In [57]:

# Combine all labels to fit the label encoder
all_labels = pd.concat([df['Label'], val_dataframe['Label']]).unique()
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

In [58]:
# Evaluate the model on the validation set
correct_predictions = 0

for _, row in val_dataframe.iterrows():
    text = row['Text']
    true_label = row['Label']

    # Ensure the true label is correctly encoded as a string
    if true_label not in label_encoder.classes_:
        print(f"Unseen label in validation data: {true_label}")
        continue  # Skip rows with unseen labels

    # Predict using the zero-shot pipeline
    result = zero_shot_pipeline(text, candidate_labels=label_encoder.classes_)
    predicted_label = result['labels'][0]  # Get the top predicted label

    # Check if the prediction is correct
    if predicted_label == true_label:
        correct_predictions += 1

# Calculate accuracy
total_predictions = len(val_dataframe)
accuracy = correct_predictions / total_predictions
print(f"Accuracy: {accuracy:.4f}")
print(f"Correct Predictions: {correct_predictions}/{total_predictions}")

Accuracy: 0.4000
Correct Predictions: 40/100
