In [None]:
# Install required libraries
%pip install -U transformers accelerate bitsandbytes peft trl datasets --quiet

In [1]:
import os
import pandas as pd
import re
import string
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from transformers import TrainingArguments, Trainer
from tqdm import tqdm
import huggingface_hub

#Specify a read token
hf_token = "xyz"
os.environ["HUGGINGFACE_TOKEN"] = hf_token

os.environ["WANDB_DISABLED"] = "true"


huggingface_hub.login(token=os.environ["HUGGINGFACE_TOKEN"])


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Read the base model and Tokenizer.

In [2]:

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)


model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=hf_token, trust_remote_code=True)
tokenizer.add_bos_token = True
tokenizer.add_eos_token = True
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
    use_auth_token=hf_token
)

model.gradient_checkpointing_enable()
model.config.use_cache = False



tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

# Data PreProcessing

In [3]:
import random
def clean_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
#     text = re.sub(r'http\S+|www\S+|https\S+', 'URLfound', text, flags=re.MULTILINE)
    text = text.lower()
    return text

def ml_preprocess(df):
    df['subject'] = df['subject'].fillna('')
    df['content'] = df['subject'] + ' ' + df['body']
    df['content'] = df['content'].apply(clean_text)
    df = df[['label', 'content']]
    return df

def ml_preprocess_2(df):
    df['content'] = df['text'].apply(clean_text)
    df = df[['label', 'content']].dropna()
    return df
max_seq_length = 1024

def truncate_content(content, max_length=600):
    return content[:max_length] + "..." if len(content) > max_length else content

# Updated format_dataset_with_prompt function
def format_dataset_with_prompt(examples):
    prompt_template = """
[INST] You are a phishing detection classifier. Classify the email as phishing (1) or non-phishing (0).
Return ONLY the integer 1 or 0. Do not provide any explanation or additional text. Classify only phishing emails as phishing (1) and spam and non-phishing as non-phishing (0)
### Example 1:
Email Content:
"Hi how is everyone"
Your Response: 0
### Example 2:
Email Content:
"Your account has been hacked click the link below URLFOUND"
Your Response: 1
### Now classify this email:
Email Content:
"{content}"
Your Response: [/INST]
"""
    prompts = []
    responses = []
    for content, label in zip(examples['content'], examples['label']):
        prompt = prompt_template.format(content=truncate_content(content))
        response = str(label)
        prompts.append(prompt)
        responses.append(response)
    return {"prompt": prompts, "response": responses}

# Load and preprocess dataset
df = pd.read_csv("/kaggle/input/my-phishing-dataset/Nazario_5.csv")
df = ml_preprocess(df)

spam_samples = pd.read_csv("/kaggle/input/my-phishing-dataset/email_text.csv")

# Split the dataset into samples where the label is 1 (phishing emails)
spam_samples = spam_samples[spam_samples['label'] == 1].copy()

# Change the label of these samples to 0 (as per your original code)
spam_samples['label'] = 0
spam_samples =spam_samples[:30]
spam_samples = ml_preprocess_2(spam_samples)


# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = pd.concat([train_df, spam_samples], ignore_index=True)

# train_df=train_df[:400]
train_df = pd.concat([train_df, spam_samples], ignore_index=True)


# Convert to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))

test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

# Apply formatting to datasets
train_dataset = train_dataset.map(
    format_dataset_with_prompt,
    batched=True,
    remove_columns=train_dataset.column_names
)
test_dataset = test_dataset.map(
    format_dataset_with_prompt,
    batched=True,
    remove_columns=test_dataset.column_names
)

# Define custom data collator
class DataCollatorForSupervisedDataset:
    def __init__(self, tokenizer, max_seq_length=1024):
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __call__(self, instances):
        input_ids_list = []
        labels_list = []

        for instance in instances:
            prompt = instance['prompt']
            response = instance['response']

            # Tokenize prompt and response separately
            prompt_ids = self.tokenizer.encode(prompt, add_special_tokens=False)
            response_ids = self.tokenizer.encode(response, add_special_tokens=False)

            # Concatenate prompt and response
            input_ids = prompt_ids + response_ids + [self.tokenizer.eos_token_id]

            # Create labels: -100 for prompt tokens, actual ids for response tokens
            labels = [-100] * len(prompt_ids) + response_ids + [self.tokenizer.eos_token_id]

            # Truncate if necessary
            if len(input_ids) > self.max_seq_length:
                input_ids = input_ids[:self.max_seq_length]
                labels = labels[:self.max_seq_length]

            input_ids_list.append(torch.tensor(input_ids))
            labels_list.append(torch.tensor(labels))

        # Pad sequences
        input_ids_padded = torch.nn.utils.rnn.pad_sequence(
            input_ids_list,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id
        )
        labels_padded = torch.nn.utils.rnn.pad_sequence(
            labels_list,
            batch_first=True,
            padding_value=-100
        )
        attention_mask = (input_ids_padded != self.tokenizer.pad_token_id).long()

        return {
            "input_ids": input_ids_padded,
            "labels": labels_padded,
            "attention_mask": attention_mask
        }


Map:   0%|          | 0/2512 [00:00<?, ? examples/s]

Map:   0%|          | 0/613 [00:00<?, ? examples/s]

In [None]:
# import os
# import glob

# # Specify the directory from which you want to delete all files
# directory = '/kaggle/working/'

# # Get a list of all files in the directory
# files = glob.glob(directory + '*')

# # Loop through and delete each file
# for file in files:
#     try:
#         if os.path.isfile(file):
#             os.remove(file)
#             print(f'File {file} has been deleted.')
#     except Exception as e:
#         print(f'Error deleting file {file}: {e}')


# Training

In [None]:

model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "ffn", "gate_proj"],
)

model = get_peft_model(model, peft_config)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    save_steps=20,
    logging_steps=10,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    save_strategy="steps",
    remove_unused_columns=False,
)

# Define data collator
data_collator = DataCollatorForSupervisedDataset(
    tokenizer=tokenizer,
    max_seq_length=max_seq_length
)

# Create SFT Trainer with the custom data collator
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    dataset_text_field="prompt"
)

# Train the model
trainer.train()


# Save the Model

In [None]:


# Merge and save the fine-tuned model
merged_model = model.merge_and_unload()
merged_model.save_pretrained("./mistral_finetuned_merged")
tokenizer.save_pretrained("./mistral_finetuned_merged")


# Load Saved Model if needed

In [4]:



model = AutoModelForCausalLM.from_pretrained(
    "./mistral_finetuned_merged",
    device_map="auto",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

model.gradient_checkpointing_enable()  # Enable gradient checkpointing
model.eval()

tokenizer = AutoTokenizer.from_pretrained("./mistral_finetuned_merged")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
model.config.pad_token_id = model.config.eos_token_id


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


# Sample Test Email

In [5]:
model.eval()

# Update the standardize_output function
def standardize_output(result):
    result = result.strip()
    if '1' in result:
        return 1
    elif '0' in result:
        return 0
    else:
        return -1  # Invalid output

# Test the fine-tuned model with a sample email
test_email = (
    "This is ROhan Sood"
)

# Prepare the prompt
prompt_template = """
[INST] You are a phishing detection classifier. Classify the email as phishing (1) or non-phishing (0). Classify only phishing emails as phishing (1) and spam and non-phishing as non-phishing (0)
Return ONLY the integer 1 or 0. Do not provide any explanation or additional text.
### Example 1:
Email Content:
"Hi how is everyone"
Your Response: 0
### Example 2:
Email Content:
"Your account has been hacked click the link below URLFOUND"
Your Response: 1
### Now classify this email:
Email Content:
"{content}"
Your Response: [/INST]
"""
prompt = prompt_template.format(content=test_email)

# Tokenize the prompt
sample_input = tokenizer(
    prompt,
    return_tensors="pt",
    truncation=True,
    max_length=max_seq_length
).to("cuda")

# Generate the output
with torch.no_grad():
    sample_output = model.generate(
        **sample_input,
        max_new_tokens=2,  # Limit the output length
        do_sample=False,
        num_beams=1,
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id
    )

# Decode the output
sample_prediction = tokenizer.decode(
    sample_output[0, sample_input['input_ids'].shape[1]:],
    skip_special_tokens=True
)
print(f"Sample email classification: {sample_prediction}")


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Sample email classification: 1


# Evaluate the model on the test set

In [6]:
predictions = []
unstandardized_predictions = []
batch_size = 16

for i in tqdm(range(0, len(test_dataset), batch_size), desc="Generating predictions", unit="batch"):
    batch = test_dataset[i:i + batch_size]
    batch_predictions = []
    batch_unstandardized = []
    for idx, sample in enumerate(batch['prompt']):
        sample_input = tokenizer(
            sample,
            return_tensors="pt",
            truncation=True,
            max_length=max_seq_length
        ).to("cuda")
        with torch.no_grad():
            sample_output = model.generate(
                **sample_input,
                max_new_tokens=2,
                do_sample=False,
                num_beams=1,
                use_cache=True,
                pad_token_id=tokenizer.eos_token_id
            )
        # Decode only the newly generated tokens
        sample_prediction = tokenizer.decode(
            sample_output[0, sample_input['input_ids'].shape[1]:],
            skip_special_tokens=True
        )
        batch_unstandardized.append(sample_prediction)
        batch_predictions.append(standardize_output(sample_prediction))
    unstandardized_predictions.extend(batch_unstandardized)
    predictions.extend(batch_predictions)
#     torch.cuda.empty_cache()

# print("Standardized predictions:", predictions)
print("Unstandardized predictions:", unstandardized_predictions)

# Evaluate predictions
valid_predictions = [pred for pred in predictions if pred != -1]
y_true = test_df['label'].tolist()[:len(valid_predictions)]

# Print classification report
print("Classification Report:")
print(classification_report(y_true, valid_predictions, digits=4))

Generating predictions: 100%|██████████| 39/39 [05:58<00:00,  9.20s/batch]

Unstandardized predictions: ['0', '0', '0', '1', '1', '0', '1', '1', '0', '0', '1', '1', '0', '1', '1', '1', '1', '1', '0', '1', '0', '1', '1', '1', '1', '1', '0', '1', '1', '0', '1', '1', '0', '1', '1', '0', '0', '0', '0', '0', '0', '1', '0', '1', '1', '0', '0', '0', '0', '1', '0', '1', '1', '0', '0', '0', '0', '1', '0', '1', '1', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '1', '1', '0', '0', '1', '0', '0', '0', '1', '1', '0', '1', '1', '0', '1', '0', '1', '0', '0', '1', '1', '1', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '1', '0', '0', '0', '0', '0', '1', '1', '0', '1', '1', '0', '1', '1', '1', '0', '0', '1', '0', '0', '1', '0', '0', '1', '0', '0', '0', '1', '1', '1', '0', '0', '0', '1', '1', '1', '0', '1', '1', '1', '1', '0', '0', '1', '1', '1', '1', '0', '0', '1', '0', '0', '0', '0', '1', '0', '0', '1', '1', '1', '1', '0', '1', '1', '1', '0', '1', '0', '0', '1', '1', '0', '1', '0', '0', '0', '0', '1', '1', '1', '0', '1', '1', '0', '0', '0', '1', '




# Stage 2 Test model on TREC_07 dataset

In [7]:
# Import necessary libraries
import os
import pandas as pd
import re
import string
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
from sklearn.metrics import classification_report



# Load Model 

In [None]:


# # Load the trained model and tokenizer
# model_path = "./mistral_finetuned_merged"

# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(
#     model_path,
#     trust_remote_code=True
# )
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = 'right'



# model = AutoModelForCausalLM.from_pretrained(
#     "./mistral_finetuned_merged",
#     device_map="auto",
#     torch_dtype=torch.float16,
#     low_cpu_mem_usage=True,
# )


# model.eval()  # Set model to evaluation mode

In [11]:
# import torch
# torch.cuda.empty_cache()


# Inference

In [12]:

def ml_preprocess(df):
    df['content'] = df['text'].apply(clean_text)
    df = df[['label', 'content']].dropna()
    return df

# Load the dataset (update with the actual path)
df = pd.read_csv("/kaggle/input/my-phishing-dataset/email_text.csv")

# Preprocess the dataset
df = ml_preprocess(df)

# Split the dataset into samples where the label is 1 (phishing emails)
df_label_1 = df[df['label'] == 1].copy()

# Change the label of these samples to 0 (as per your original code)
df_label_1['label'] = 0

df_label_1=df_label_1[30:]
# Define the data for testing
X_test = df_label_1['content'].tolist()
y_test = df_label_1['label'].tolist()

# Define the prompt template and helper functions
prompt_template = """
[INST] You are a phishing detection classifier. Classify the email as phishing (1) or non-phishing (0). Classify only phishing emails as phishing (1) and spam and non-phishing as non-phishing (0)
Return ONLY the integer 1 or 0. Do not provide any explanation or additional text.
### Example 1:
Email Content:
"hey billy while we were out you said that you felt insecure about your manhood i noticed in the toilets you were quite small in that area but not to worry that website that i was telling you about is my secret weapon to an extra escapenumber inches trust me girls love bigger ones i've had escapenumber times as many chicks since i used these pills a year ago the package i used was the escapenumber month supply one and its worth every cent and more the website is http ctmay com ring me on the weekend and we will go out and drink again and let you know some more secrets later dude brad"
Your Response: 0
### Example 2:
Email Content:
"Your account has been hacked click the link below URLFOUND"
Your Response: 1
### Now classify this email:
Email Content:
"{content}"
Your Response: [/INST]
"""

def truncate_content(content, max_length=600):
    return content[:max_length] + "..." if len(content) > max_length else content

def prepare_prompt(content):
    return prompt_template.format(content=truncate_content(content))

def standardize_output(result):
    result = result.strip()
    if '1' in result:
        return 1
    elif '0' in result:
        return 0
    else:
        return -1  # Invalid output

# Parameters
batch_size = 32
max_seq_length = 1024

# Initialize lists to store predictions
predictions = []
unstandardized_predictions = []

# Process the test data in batches
print("Starting inference on the test dataset...")
for i in tqdm(range(0, len(X_test), batch_size), desc="Predicting", unit="batch"):
    batch_contents = X_test[i:i + batch_size]
    batch_prompts = [prepare_prompt(content) for content in batch_contents]
    
    # Tokenize the prompts
    inputs = tokenizer(
        batch_prompts,
        return_tensors="pt",
        truncation=True,
        max_length=max_seq_length,
        padding=True
    ).to("cuda")
    
    # Generate outputs
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=2,  # Limit the output length
            do_sample=False,
            num_beams=1,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id
        )
    
    # Decode outputs and standardize predictions
    for idx in range(len(batch_contents)):
        output = outputs[idx]
        input_length = inputs['input_ids'].shape[1]
        generated_tokens = output[input_length:]
        sample_prediction = tokenizer.decode(
            generated_tokens,
            skip_special_tokens=True
        )
        unstandardized_predictions.append(sample_prediction)
        predictions.append(standardize_output(sample_prediction))

# Handle invalid predictions
valid_indices = [i for i, pred in enumerate(predictions) if pred != -1]
valid_predictions = [predictions[i] for i in valid_indices]
valid_y_true = [y_test[i] for i in valid_indices]

# Print classification report
print("Classification Report:")
print(classification_report(valid_y_true, valid_predictions, digits=4))


Starting inference on the test dataset...


Predicting:   0%|          | 0/935 [00:00<?, ?batch/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Predicting:   0%|          | 1/935 [00:25<6:32:57, 25.24s/batch]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Predicting:   0%|          | 2/935 [00:47<6:01:41, 23.26s/batch]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Predicting:   0%|          | 3/935 [01:09<5:57:48, 23.03s/batch]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Predicting:   0%|          | 4/935 [01:22<4:50:50, 18.74s/batc

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    0.9991    0.9995     29637
           1     0.0000    0.0000    0.0000         0

    accuracy                         0.9991     29637
   macro avg     0.5000    0.4995    0.4998     29637
weighted avg     1.0000    0.9991    0.9995     29637




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Push to Huggingface

In [None]:
from huggingface_hub import notebook_login


# Log in to your Hugging Face account
notebook_login()



In [None]:
# Load the fine-tuned model
model = AutoModelForCausalLM.from_pretrained(
    "./mistral_finetuned_merged",
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("./mistral_finetuned_merged")

In [None]:
repo_name = "Rsood/mistral-instruct-v2-phishing-detection-v2"  # Replace with your username and desired repo name
# Push the model to Hugging Face Hub
model.push_to_hub(repo_name)

# Push the tokenizer to Hugging Face Hub
tokenizer.push_to_hub(repo_name)
