In [None]:
# Install required libraries
!pip install transformers[torch] datasets pandas openpyxl scikit-learn

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

try:
    df = pd.read_excel('/content/for_classifier (1).xlsx')
except FileNotFoundError:
    print("Error: Make sure 'my_keywords.xlsx' is uploaded to your Colab session.")
    # Add a dummy dataframe to allow the rest of the code to run for demonstration
    df = pd.DataFrame({
        'keyword': ['sample good keyword', 'sample bad keyword'],
        'score': [1, 0]
    })


# Prepare the DataFrame for Hugging Face
# The libraries expect specific column names: 'text' and 'label'
df = df.rename(columns={'keyword': 'text', 'score': 'label'})

# Ensure data types are correct
df['text'] = df['text'].astype(str)
df['label'] = df['label'].astype(int)

# Drop any rows where the text might be empty
df.dropna(subset=['text'], inplace=True)
df.reset_index(drop=True, inplace=True)

print(f"Successfully loaded and prepared {len(df)} keywords.")
print("\nData Preview:")
print(df.head())
print("\nLabel Distribution:")
print(df['label'].value_counts())

Successfully loaded and prepared 27851 keywords.

Data Preview:
                text  label
0           backpack      0
1    xbox controller      1
2    bose headphones      1
3        accessories      0
4  bluetooth speaker      1

Label Distribution:
label
1    17570
0    10281
Name: count, dtype: int64


In [None]:
# Split the DataFrame into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df['label'])

print(f"Training set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")

# Convert the pandas DataFrames into Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Combine them into a single DatasetDict object
dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

print("\nDataset structure:")
print(dataset_dict)

Training set size: 25065
Testing set size: 2786

Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 25065
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2786
    })
})


In [None]:
from transformers import AutoTokenizer

# We'll use DistilBERT
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create a function to tokenize the text
def tokenize_function(examples):
    # padding="max_length" ensures all inputs are the same size
    # truncation=True ensures inputs longer than the model can handle are cut down
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply the tokenization to the entire dataset
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

print("\nTokenization complete. Example of a tokenized input:")
print(tokenized_datasets['train'][0])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/25065 [00:00<?, ? examples/s]

Map:   0%|          | 0/2786 [00:00<?, ? examples/s]


Tokenization complete. Example of a tokenized input:
{'text': 'jawbone headphones', 'label': 1, '__index_level_0__': 6042, 'input_ids': [101, 5730, 14417, 2132, 19093, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from datasets import load_metric
import math
import os


# This tells the environment to completely disable Weights & Biases
os.environ["WANDB_DISABLED"] = "true"

# Load the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Calculate the number of steps in one epoch
batch_size = 16
steps_per_epoch = math.ceil(len(train_df) / batch_size)

# --- TRAINING ARGUMENTS WITH THE FIX ---
training_args = TrainingArguments(
    output_dir="./results",

    # Core training parameters
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,

    # We will still evaluate and save
    save_steps=steps_per_epoch,
    eval_steps=steps_per_epoch,

    # Standard parameters
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=100,

    # This explicitly tells the trainer not to use any reporting tools like wandb
    report_to="none",
)

# Define the function to compute metrics
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Create the Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

# --- Start Training! ---
trainer.train()

# --- Evaluate the final model ---
final_evaluation = trainer.evaluate()
print("\nFinal Evaluation of the model from the last epoch:")
print(final_evaluation)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Step,Training Loss
100,0.6589
200,0.6041
300,0.5731
400,0.5744
500,0.5107
600,0.539
700,0.5167
800,0.5048
900,0.4839
1000,0.4926



Final Evaluation of the model from the last epoch:
{'eval_loss': 0.5171730518341064, 'eval_accuracy': 0.7925340990667624, 'eval_runtime': 40.2202, 'eval_samples_per_second': 69.269, 'eval_steps_per_second': 4.351, 'epoch': 3.0}


In [None]:
from transformers import pipeline

# Save the fine-tuned model and tokenizer to a directory
model_save_path = "./sensical_keyword_classifier"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"\nModel saved to {model_save_path}")

# --- INFERENCE ---
# Load your custom model from the saved directory for easy use
classifier = pipeline("text-classification", model=model_save_path)

# Example keywords to test your new classifier
new_keywords_to_test = [
    "lenovo thinkpad laptop",
    "red blue for sale",
    "gaming laptop under 1000",
    "accessories for thinkpad x1",
    "shipping free code now",
    "best business computer",
    "computer for business best" # Grammatically awkward
]

results = classifier(new_keywords_to_test)

# Print the results in a clean format
for keyword, result in zip(new_keywords_to_test, results):
    # The model outputs 'LABEL_0' or 'LABEL_1'
    predicted_class = int(result['label'].split('_')[1])
    confidence = result['score']
    print(f"Keyword: '{keyword}' \t-> Predicted: {predicted_class} (Confidence: {confidence:.4f})")


Model saved to ./sensical_keyword_classifier


Device set to use cuda:0


Keyword: 'lenovo thinkpad laptop' 	-> Predicted: 1 (Confidence: 0.9764)
Keyword: 'red blue for sale' 	-> Predicted: 0 (Confidence: 0.9916)
Keyword: 'gaming laptop under 1000' 	-> Predicted: 0 (Confidence: 0.9914)
Keyword: 'accessories for thinkpad x1' 	-> Predicted: 1 (Confidence: 0.9159)
Keyword: 'shipping free code now' 	-> Predicted: 0 (Confidence: 0.9865)
Keyword: 'best business computer' 	-> Predicted: 1 (Confidence: 0.8849)
Keyword: 'computer for business best' 	-> Predicted: 1 (Confidence: 0.7882)


In [None]:
# --- Code to Analyze Mistakes ---

# Get predictions on the test set
predictions = trainer.predict(tokenized_datasets["test"])
predicted_labels = np.argmax(predictions.predictions, axis=-1)

# Add predictions to your original test DataFrame
test_df['predicted_label'] = predicted_labels
test_df['is_correct'] = (test_df['label'] == test_df['predicted_label'])

# Create a new DataFrame of only the mistakes
mistakes_df = test_df[test_df['is_correct'] == False]

print(f"The model made {len(mistakes_df)} mistakes on the test set.")
print("\n--- EXAMPLES OF MISTAKES ---")

# Print some examples for you to review
# FP = False Positive (Predicted 1, was 0)
# FN = False Negative (Predicted 0, was 1)
false_positives = mistakes_df[mistakes_df['label'] == 0]
false_negatives = mistakes_df[mistakes_df['label'] == 1]

print("\nKeywords it thought were good (but were bad):")
print(false_positives.head(10))

print("\nKeywords it thought were bad (but were good):")
print(false_negatives.head(10))

The model made 578 mistakes on the test set.

--- EXAMPLES OF MISTAKES ---

Keywords it thought were good (but were bad):
                                               text  label  predicted_label  \
19723                            charging keyboards      0                1   
19272  our best thin and light laptops with windows      0                1   
24622       intel® processor chrome tablets laptops      0                1   
25461   everyday use and entertainment fhd desktops      0                1   
755              wireless access point for business      0                1   
7973                           zagg battery charger      0                1   
15190                                 dual adapters      0                1   
17333                  earphones for android phones      0                1   
5760                              apc smartups 1500      0                1   
18724                     portable laptop mouse pad      0                1   

       i