# Teacher Model Training

### Imports

In [4]:
!pip install evaluate #datasets

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding

### Load data

In [6]:
# Hub repository that provides the train / validation / test splits
DATASET_ID = "shubh2ds/data-phishing-site-clf"
dataset_dict = load_dataset(DATASET_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/98.0k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/21.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/24.5k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/450 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/450 [00:00<?, ? examples/s]

In [7]:
# show the available splits with their columns and row counts
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 450
    })
})

### Train Teacher Model

In [8]:
# Teacher checkpoint: BERT base uncased (110M params)
model_path = "google-bert/bert-base-uncased"

# class index <-> human-readable label; label2id is simply the inverse of id2label
id2label = {0: "Safe", 1: "Not Safe"}
label2id = {label: idx for idx, label in id2label.items()}

tokenizer = AutoTokenizer.from_pretrained(model_path)

# sequence-classification model with one output per entry in id2label
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# NOTE: named_parameters() returns a generator, so this cell only displays the
# generator object itself; iterate over it to see the (name, parameter) pairs.
model.named_parameters()

<generator object Module.named_parameters at 0x7f0e703c0440>

#### Freeze base model

In [10]:
# show each parameter's name next to its requires_grad flag
for param_name, tensor in model.named_parameters():
    print(param_name, tensor.requires_grad)

bert.embeddings.word_embeddings.weight True
bert.embeddings.position_embeddings.weight True
bert.embeddings.token_type_embeddings.weight True
bert.embeddings.LayerNorm.weight True
bert.embeddings.LayerNorm.bias True
bert.encoder.layer.0.attention.self.query.weight True
bert.encoder.layer.0.attention.self.query.bias True
bert.encoder.layer.0.attention.self.key.weight True
bert.encoder.layer.0.attention.self.key.bias True
bert.encoder.layer.0.attention.self.value.weight True
bert.encoder.layer.0.attention.self.value.bias True
bert.encoder.layer.0.attention.output.dense.weight True
bert.encoder.layer.0.attention.output.dense.bias True
bert.encoder.layer.0.attention.output.LayerNorm.weight True
bert.encoder.layer.0.attention.output.LayerNorm.bias True
bert.encoder.layer.0.intermediate.dense.weight True
bert.encoder.layer.0.intermediate.dense.bias True
bert.encoder.layer.0.output.dense.weight True
bert.encoder.layer.0.output.dense.bias True
bert.encoder.layer.0.output.LayerNorm.weight True


In [11]:
# Freeze the base model except its pooling layers: a base-model parameter
# stays trainable only when "pooler" appears in its name.
for name, param in model.base_model.named_parameters():
    param.requires_grad = "pooler" in name

In [12]:
# re-list every parameter with its requires_grad flag to check the freeze
for param_name, tensor in model.named_parameters():
    print(param_name, tensor.requires_grad)

bert.embeddings.word_embeddings.weight False
bert.embeddings.position_embeddings.weight False
bert.embeddings.token_type_embeddings.weight False
bert.embeddings.LayerNorm.weight False
bert.embeddings.LayerNorm.bias False
bert.encoder.layer.0.attention.self.query.weight False
bert.encoder.layer.0.attention.self.query.bias False
bert.encoder.layer.0.attention.self.key.weight False
bert.encoder.layer.0.attention.self.key.bias False
bert.encoder.layer.0.attention.self.value.weight False
bert.encoder.layer.0.attention.self.value.bias False
bert.encoder.layer.0.attention.output.dense.weight False
bert.encoder.layer.0.attention.output.dense.bias False
bert.encoder.layer.0.attention.output.LayerNorm.weight False
bert.encoder.layer.0.attention.output.LayerNorm.bias False
bert.encoder.layer.0.intermediate.dense.weight False
bert.encoder.layer.0.intermediate.dense.bias False
bert.encoder.layer.0.output.dense.weight False
bert.encoder.layer.0.output.dense.bias False
bert.encoder.layer.0.output.Lay

#### Preprocess text

In [13]:
# define text preprocessing
def preprocess_function(examples):
    """Tokenize the "text" entry of a batch of examples.

    Args:
        examples: mapping whose "text" entry holds the raw strings to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those strings when
        called with ``truncation=True``.
    """
    raw_texts = examples["text"]
    encoded = tokenizer(raw_texts, truncation=True)
    return encoded

In [16]:
# peek at the tables backing each split (column types followed by raw values)
# NOTE(review): this dumps raw table contents; a small slice would be easier to read
dataset_dict.data

{'train': MemoryMappedTable
 text: string
 labels: int64
 ----
 text: [["http://bazurashop.com/idex.html?sfm_from_iframe=1',300,350","hollywoodland.org/?p=29","tunnekylmyysmiddletonii.02leds.com/me4xcdste0.php\n","usa-people-search.com/Find-Carla-Brown-IA.aspx","inspire-consultants.com.my/487ygfh",...,"allwwewrestlers.com/buddy_rogers.htm","wikimapia.org/country/Mexico/Chihuahua/","landoftherightangles.blogspot.com/","oceancitymd.gov/Public_Works/airport.html","jonbarry.co.uk/"],["www.sospizza-avignon.com/javascripts/login_scr.html","forum.verygames.net/showthread.php?42812-Problème-de-Gungame/page3","games.groups.yahoo.com/group/nosferatu-ooc/","www.indiepages.com/popgun/","randsco.com/",...,"metal-archives.com/bands/Aggression/1923","techgage.com/article/lavalys_everest_ultimate_edition_30/","tngenweb.org/bradley/JulienJulianReneVs2006.html","bussiness.10017.kingdream.kz/login-error.php?login=ec1c740eac398945fbc70a3f6c4bc00f","cinematext.tv/wp-includes/js/login.alibaba.htm"],["exclus

In [18]:
# tokenize every split in batches, then display the resulting DatasetDict
tokenized_data = dataset_dict.map(function=preprocess_function, batched=True)
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2100
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 450
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 450
    })
})

In [19]:
# peek at the tables backing the tokenized splits; the schema now also lists
# input_ids, token_type_ids and attention_mask
# NOTE(review): this dumps raw table contents; a small slice would be easier to read
tokenized_data.data

{'train': MemoryMappedTable
 text: string
 labels: int64
 input_ids: list<item: int32>
   child 0, item: int32
 token_type_ids: list<item: int8>
   child 0, item: int8
 attention_mask: list<item: int8>
   child 0, item: int8
 ----
 text: [["http://bazurashop.com/idex.html?sfm_from_iframe=1',300,350","hollywoodland.org/?p=29","tunnekylmyysmiddletonii.02leds.com/me4xcdste0.php\n","usa-people-search.com/Find-Carla-Brown-IA.aspx","inspire-consultants.com.my/487ygfh",...,"allwwewrestlers.com/buddy_rogers.htm","wikimapia.org/country/Mexico/Chihuahua/","landoftherightangles.blogspot.com/","oceancitymd.gov/Public_Works/airport.html","jonbarry.co.uk/"],["www.sospizza-avignon.com/javascripts/login_scr.html","forum.verygames.net/showthread.php?42812-Problème-de-Gungame/page3","games.groups.yahoo.com/group/nosferatu-ooc/","www.indiepages.com/popgun/","randsco.com/",...,"metal-archives.com/bands/Aggression/1923","techgage.com/article/lavalys_everest_ultimate_edition_30/","tngenweb.org/bradley/Julie

In [20]:
# create data collator
# (its repr in the cell output shows padding=True and return_tensors='pt')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='google-bert/bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

#### Evaluation

In [21]:
# metric objects from the `evaluate` library, used inside compute_metrics
accuracy, auc_score = evaluate.load("accuracy"), evaluate.load("roc_auc")

def compute_metrics(eval_pred):
    """Compute accuracy and ROC AUC from raw model outputs.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores (logits), one row per example, with the positive
            class in column 1; ``labels`` holds the reference class ids.

    Returns:
        dict with keys ``"Accuracy"`` and ``"AUC"``, each rounded to 3 decimals.
    """
    # get predictions
    predictions, labels = eval_pred
    predictions = np.asarray(predictions)

    # Softmax over the class axis. Subtracting the row-wise maximum first keeps
    # np.exp from overflowing to inf (which would turn the ratio into NaN) on
    # large logits; the resulting probabilities are mathematically unchanged.
    shifted = predictions - np.max(predictions, axis=-1, keepdims=True)
    exp_scores = np.exp(shifted)
    probabilities = exp_scores / exp_scores.sum(-1, keepdims=True)
    # use probabilities of the positive class for ROC AUC
    positive_class_probs = probabilities[:, 1]
    # compute auc
    auc = np.round(auc_score.compute(prediction_scores=positive_class_probs, references=labels)['roc_auc'],3)

    # predict most probable class
    predicted_classes = np.argmax(predictions, axis=1)
    # compute accuracy
    acc = np.round(accuracy.compute(predictions=predicted_classes, references=labels)['accuracy'],3)

    return {"Accuracy": acc, "AUC": auc}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

#### Train model

In [22]:
# hyperparameters
lr = 2e-4
batch_size = 8
num_epochs = 10

# Log, evaluate and checkpoint once per epoch; load_best_model_at_end asks the
# Trainer to reload the best checkpoint when training finishes.
training_config = dict(
    output_dir="bert-base-uncased-phishing-classifier_teacher",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**training_config)

In [33]:
# sanity check: the tokenized train split, plus its first five texts and labels
tokenized_data["train"] , tokenized_data["train"]['text'][:5],  tokenized_data["train"]['labels'][:5]

(Dataset({
     features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
     num_rows: 2100
 }),
 ["http://bazurashop.com/idex.html?sfm_from_iframe=1',300,350",
  'hollywoodland.org/?p=29',
  'tunnekylmyysmiddletonii.02leds.com/me4xcdste0.php\\n',
  'usa-people-search.com/Find-Carla-Brown-IA.aspx',
  'inspire-consultants.com.my/487ygfh'],
 [1, 0, 1, 0, 1])

In [34]:
# NOTE: per-epoch evaluation (and, with load_best_model_at_end=True, the choice
# of the checkpoint that gets reloaded) uses the "test" split; the "validation"
# split is what the later prediction cell scores as the held-out set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class=` is its
    # replacement and avoids the warning emitted when the Trainer is built
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mshubh2ds[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Auc
1,0.5028,0.383876,0.816,0.913
2,0.4094,0.337763,0.84,0.931
3,0.3545,0.31368,0.856,0.94
4,0.3592,0.357145,0.842,0.946
5,0.3503,0.342752,0.86,0.948
6,0.3485,0.290033,0.873,0.95
7,0.335,0.287699,0.873,0.95
8,0.3109,0.28967,0.864,0.95
9,0.3122,0.283608,0.873,0.951
10,0.3149,0.288283,0.867,0.952


TrainOutput(global_step=2630, training_loss=0.3597590602396106, metrics={'train_runtime': 278.1107, 'train_samples_per_second': 75.51, 'train_steps_per_second': 9.457, 'total_flos': 706603239165360.0, 'train_loss': 0.3597590602396106, 'epoch': 10.0})

### Apply Model to Validation Dataset

In [35]:
# score the trained model on the validation split
predictions = trainer.predict(tokenized_data["validation"])

# unpack the raw logits and the reference labels from the prediction output
logits, labels = predictions.predictions, predictions.label_ids

# reuse compute_metrics so the numbers are computed the same way as during training
metrics = compute_metrics((logits, labels))
print(metrics)

{'Accuracy': np.float64(0.893), 'AUC': np.float64(0.945)}


### Push to hub

In [37]:
# interactive Hugging Face Hub login (shows a login widget in the cell output);
# run before pushing the model to the Hub
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [38]:
# push model to hub
# (the CommitInfo in the cell output shows the target repo, named after output_dir)
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1743865766.50a1cdae5a28.326.0:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shubh2ds/bert-base-uncased-phishing-classifier_teacher/commit/4e9c979a635d1626f02a80849fb0a52023077a50', commit_message='End of training', commit_description='', oid='4e9c979a635d1626f02a80849fb0a52023077a50', pr_url=None, repo_url=RepoUrl('https://huggingface.co/shubh2ds/bert-base-uncased-phishing-classifier_teacher', endpoint='https://huggingface.co', repo_type='model', repo_id='shubh2ds/bert-base-uncased-phishing-classifier_teacher'), pr_revision=None, pr_num=None)

### Run inference on new examples

In [39]:
# First, check if CUDA is available
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU instead.")

# Move the model to the chosen device and put it in evaluation mode explicitly,
# so inference does not depend on which cells happened to run before this one
# (eval mode disables dropout).
model = model.to(device)
model.eval()

# Tokenize the input string; truncate the same way preprocess_function does
# for the training data so over-long inputs cannot exceed the model's limit
input_text = "000mclogin.micloud-object-storage-xc-cos-static-web-hosting-qny.s3.us-east.cloud-object-storage.appdomain.cloud"
inputs = tokenizer(input_text, truncation=True, return_tensors="pt").to(device)

# Perform inference without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

# Map prediction to label
predicted_label = model.config.id2label[predictions.item()]
print(f"Predicted label: {predicted_label}")

Using GPU: Tesla T4
Predicted label: Not Safe
