In [12]:
import pandas as pd
df=pd.read_csv('final_data.csv')
df.rename(columns={'columns2':'text','0':'label'},inplace=True)
df=df[['label','text']]

In [13]:
df.columns

Index(['label', 'text'], dtype='object')

In [14]:
from sklearn.model_selection import train_test_split

# (optional) filter out any null values before creating the test, validation and training set
#df = df[df['column_name'].notnull()]

# Split dataset into training and temp (for validation and testing) - set at 15% (7.5% each)
train_df, temp_df = train_test_split(df, test_size=0.15, random_state=42)

# Split temp into validation and testing
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [15]:
from datasets import Dataset, DatasetDict

# setup your sets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [16]:
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# look at the set
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 153
    })
    validation: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 13
    })
    test: Dataset({
        features: ['label', 'text', '__index_level_0__'],
        num_rows: 14
    })
})

In [17]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

label_encoder.fit(dataset_dict['train']['label'])

def encode_labels(example):
    return {'encoded_label': label_encoder.transform([example['label']])[0]}

for split in dataset_dict:
    print(split)
    dataset_dict[split] = dataset_dict[split].map(encode_labels, batched=False)

train


Map: 100%|██████████| 153/153 [00:00<00:00, 6379.77 examples/s]


validation


Map: 100%|██████████| 13/13 [00:00<00:00, 2992.31 examples/s]


test


Map: 100%|██████████| 14/14 [00:00<00:00, 2715.39 examples/s]


In [18]:
model_name = "albert/albert-base-v2"
your_path = 'HuggingFace_Model'

In [19]:
from transformers import AutoConfig

unique_labels = sorted(list(set(dataset_dict['train']['label'])))
id2label = {i: label for i, label in enumerate(unique_labels)}
label2id = {label: i for i, label in enumerate(unique_labels)}

config = AutoConfig.from_pretrained(model_name)
config.id2label = id2label
config.label2id = label2id

# Verify the correct labels
print("ID to Label Mapping:", config.id2label)
print("Label to ID Mapping:", config.label2id)



ID to Label Mapping: {0: 'NotClickBait', 1: 'clickbait'}
Label to ID Mapping: {'NotClickBait': 0, 'clickbait': 1}


In [20]:
model_name

'albert/albert-base-v2'

In [21]:
from transformers import AlbertForSequenceClassification, AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForSequenceClassification.from_pretrained(model_name, config=config)

# Load model directly
#from transformers import AutoTokenizer, AutoModelForMaskedLM

#tokenizer = AutoTokenizer.from_pretrained(model_name)
#model = AutoModelForMaskedLM.from_pretrained(model_name,config=config)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
def filter_invalid_content(example):
    return isinstance(example['text'], str)

dataset = dataset_dict.filter(filter_invalid_content, batched=False)

def encode_data(batch):
    tokenized_inputs = tokenizer(batch["text"], padding=True, truncation=True, max_length=256)
    tokenized_inputs["labels"] = batch["encoded_label"]
    return tokenized_inputs

dataset_encoded = dataset.map(encode_data, batched=True)
dataset_encoded

Filter: 100%|██████████| 153/153 [00:00<00:00, 42030.95 examples/s]
Filter: 100%|██████████| 13/13 [00:00<00:00, 7240.20 examples/s]
Filter: 100%|██████████| 14/14 [00:00<00:00, 6059.88 examples/s]
Map: 100%|██████████| 153/153 [00:00<00:00, 4959.30 examples/s]
Map: 100%|██████████| 13/13 [00:00<00:00, 1704.95 examples/s]
Map: 100%|██████████| 14/14 [00:00<00:00, 1985.60 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text', '__index_level_0__', 'encoded_label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 153
    })
    validation: Dataset({
        features: ['label', 'text', '__index_level_0__', 'encoded_label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 13
    })
    test: Dataset({
        features: ['label', 'text', '__index_level_0__', 'encoded_label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 14
    })
})

In [23]:
dataset_encoded.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [24]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)

In [25]:

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
import numpy as np

label_encoder = LabelEncoder()
label_encoder.fit(unique_labels)

def per_label_accuracy(y_true, y_pred, labels):
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    correct_predictions = cm.diagonal()
    label_totals = cm.sum(axis=1)
    per_label_acc = np.divide(correct_predictions, label_totals, out=np.zeros_like(correct_predictions, dtype=float), where=label_totals != 0)
    return dict(zip(labels, per_label_acc))

In [26]:

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    decoded_labels = label_encoder.inverse_transform(labels)
    decoded_preds = label_encoder.inverse_transform(preds)

    precision = precision_score(decoded_labels, decoded_preds, average='weighted')
    recall = recall_score(decoded_labels, decoded_preds, average='weighted')
    f1 = f1_score(decoded_labels, decoded_preds, average='weighted')
    acc = accuracy_score(decoded_labels, decoded_preds)

    labels_list = list(label_encoder.classes_)
    per_label_acc = per_label_accuracy(decoded_labels, decoded_preds, labels_list)

    per_label_acc_metrics = {}
    for label, accuracy in per_label_acc.items():
        label_key = f"accuracy_label_{label}"
        per_label_acc_metrics[label_key] = accuracy

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        **per_label_acc_metrics
    }

In [27]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=your_path,
    num_train_epochs=3,
    warmup_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_steps=10,
    eval_strategy='steps',
    eval_steps=100,
    learning_rate=2e-5,
    save_steps=1000,
    gradient_accumulation_steps=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded['train'],
    eval_dataset=dataset_encoded['test'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=15, training_loss=0.7712101777394612, metrics={'train_runtime': 42.5952, 'train_samples_per_second': 10.776, 'train_steps_per_second': 0.352, 'total_flos': 578455420140.0, 'train_loss': 0.7712101777394612, 'epoch': 3.0})

In [28]:
label_encoder.classes_

array(['NotClickBait', 'clickbait'], dtype='<U12')

In [29]:
my_model_name='myclickbaitmodelv1'

In [30]:
trainer.evaluate()
trainer.save_model(my_model_name)
trainer.save_state()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [31]:
from transformers import pipeline
pipe = pipeline('text-classification', model=my_model_name)

In [32]:
example_titles = [
    "The Controversial Truth about Tech Debt",
    "A Comprehensive Guide for Getting Started with Hugging Face",
    "OpenAI GPT-4o: The New Best AI Model in the World. Like in the Movies. For Free",
    "GPT4 Omni — So much more than just a voice assistant",
    "Building Vector Databases with FastAPI and ChromaDB",
    "How Pieter Levels Makes (At Least) $210K a Month From His Laptop — With Zero Employees",
    "Which Is Better: Teachers or AI in the Classroom?",
    "How to Build Enterprise-Scale Generative AI Agents with AWS Bedrock: A Comprehensive Guide",
    "The Best Way To Start Your One-Person Business",
    "How to earn one crore in 1 days by following these 2 steps",
]

for title in example_titles:
    result = pipe(title)
    print(f"Title: {title}")
    print(result)
    print(f"Output: {result[0]['label']}")

Title: The Controversial Truth about Tech Debt
[{'label': 'clickbait', 'score': 0.5173347592353821}]
Output: clickbait
Title: A Comprehensive Guide for Getting Started with Hugging Face
[{'label': 'clickbait', 'score': 0.5692624449729919}]
Output: clickbait
Title: OpenAI GPT-4o: The New Best AI Model in the World. Like in the Movies. For Free
[{'label': 'NotClickBait', 'score': 0.538385808467865}]
Output: NotClickBait
Title: GPT4 Omni — So much more than just a voice assistant
[{'label': 'clickbait', 'score': 0.7242204546928406}]
Output: clickbait
Title: Building Vector Databases with FastAPI and ChromaDB
[{'label': 'clickbait', 'score': 0.6546621918678284}]
Output: clickbait
Title: How Pieter Levels Makes (At Least) $210K a Month From His Laptop — With Zero Employees
[{'label': 'clickbait', 'score': 0.6482016444206238}]
Output: clickbait
Title: Which Is Better: Teachers or AI in the Classroom?
[{'label': 'clickbait', 'score': 0.7300415635108948}]
Output: clickbait
Title: How to Build 