# Installation


This code installs essential libraries for transformer models, dataset handling, model fine-tuning, optimization, and efficient data processing in the Google Colab environment.

In [None]:
!pip install transformers datasets peft
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install accelerate
!pip install trl
!pip install optuna
!pip install pyarrow


Looking in indexes: https://pypi.org/simple/
Collecting bitsandbytes
  Using cached bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Using cached bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.3
Collecting trl
  Downloading trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.6-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading trl-0.9.6-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tyro-0.8.6-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.8/103.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading shtab-1.7.1-py3-none-any.whl (14 kB)
I

# Huggingface Login

This code logs you into your Hugging Face account, enabling seamless access to models, datasets, and other resources directly from the Hugging Face Hub in the Google Colab environment.

In [None]:
# Authenticate with the Hugging Face Hub from inside the notebook.
# `notebook_login()` shows a token prompt in the cell output, which avoids the
# hidden terminal-style prompt that `!huggingface-cli login` opens in a subshell.
# Imported here because this cell runs before the notebook's main import cell.
from huggingface_hub import notebook_login

notebook_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/usr/lib/python3.10/getpass.py", line 77, in unix_getpass
    passwd = _raw_input(prompt, stream, input=input)
  File "/usr/lib/python3.10/getpass.py", line 146, in _raw_input
    line = input.readline()
KeyboardInterrupt

During han

# Imports

This code imports a comprehensive set of libraries and modules for deep learning, model fine-tuning, and data processing in the Google Colab environment.

In [None]:
import gc
import logging
import os
from collections import Counter
from functools import partial

import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn.functional as F
import transformers
from datasets import Dataset, load_dataset
from google.colab import files
from huggingface_hub import Repository,create_repo
from peft import get_peft_model, LoraConfig, TaskType,prepare_model_for_kbit_training
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import BitsAndBytesConfig
from trl import SFTTrainer


# Upload the Data

This code lets you upload the data CSV files from your local machine to Google Colab.

In [None]:
# Upload files from your local machine into the Colab working directory.
# The datasets are CSV files (not Excel): later cells read train_by_post.csv,
# val_by_post.csv and test_by_post.csv with pd.read_csv.
uploaded = files.upload()

Saving adapter_model.safetensors to adapter_model.safetensors
Saving tokenizer.json to tokenizer.json
Saving adapter_config.json to adapter_config.json
Saving README.md to README.md
Saving special_tokens_map.json to special_tokens_map.json
Saving tokenizer_config.json to tokenizer_config.json
Saving training_args.bin to training_args.bin


# Load Tokenizer



This code loads the tokenizer for the specified model, Meta-Llama-3-8B-Instruct, from the Hugging Face Hub. It also customizes the tokenizer by adding special tokens:

Pad Token: Set to the end-of-sequence (eos_token) to handle padding during sequence processing.
SEP Token: Added as [SEP], which is useful for separating segments in the input.

In [None]:
# Checkpoint to fine-tune. Alternative that was used previously:
#   model_name = 'mistralai/Mistral-7B-Instruct-v0.2'
model_name = 'meta-llama/Meta-Llama-3-8B-Instruct'

# Load the tokenizer that belongs to the checkpoint, padding on the right.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side='right',
)

# Reuse the end-of-sequence token for padding and register '[SEP]' as the
# separator token. This call is the cell's last expression, so its return
# value is shown as the cell output.
tokenizer.add_special_tokens({
    'pad_token': tokenizer.eos_token,
    'sep_token': '[SEP]',
})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

# Preprocess the data and create datasets

This code contains functions that help to preprocess the data:

*   Remove the models that will be trained later from the data
*   Preprocess the row to fit the model's tokenizer
*   Create a dataset from the DataFrame and prepare its columns for the classification task






In [None]:
def remove_models_from_data(df):
    """Drop the rows whose ``model`` value is one of the held-out models.

    Args:
        df: DataFrame with a ``model`` column.

    Returns:
        The rows of ``df`` whose ``model`` is not in the held-out list, with
        their original index labels.
    """
    # Models reserved for inference; their rows must not be used here.
    held_out_models = [
        'mistralai/Mistral-7B-Instruct-v0.2',
        'meta-llama/Meta-Llama-3-8B-Instruct',
    ]
    is_held_out = df['model'].isin(held_out_models)
    return df[~is_held_out]

def preprocess_function(examples):
    """Tokenize a batch of examples as one '[SEP]'-joined string per row.

    Args:
        examples: Batch mapping with the keys ``subreddit_name``,
            ``conversation_title``, ``top_level_text`` and ``reply_text``,
            each holding one value per row.

    Returns:
        The output of the module-level ``tokenizer`` for the joined strings,
        truncated to at most 512 tokens and padded within the batch.
    """
    rows = zip(
        examples['subreddit_name'],
        examples['conversation_title'],
        examples['top_level_text'],
        examples['reply_text'],
    )

    inputs = []
    for subreddit, conversation, comment, reply in rows:
        inputs.append(f"{subreddit} [SEP] {conversation} [SEP] {comment} [SEP] {reply}")

    encoded = tokenizer(inputs, truncation=True, padding=True, max_length=512)
    return encoded

def process_data(df):
    """Turn a raw DataFrame into a tokenized, torch-formatted classification dataset.

    Steps:
      1. Drop the rows of the held-out models (``remove_models_from_data``).
      2. Binarize the ``model`` column (0 for ``'human'``, 1 otherwise) and
         rename it to ``labels``.
      3. Convert to a ``datasets.Dataset`` and tokenize with
         ``preprocess_function``.

    The input frame is not modified.

    Args:
        df: DataFrame with a ``model`` column plus the text columns read by
            ``preprocess_function``.

    Returns:
        The tokenized dataset, formatted to yield torch tensors for
        ``input_ids``, ``attention_mask`` and ``labels``.
    """
    # Drop the rows produced by the models that are reserved for inference.
    filtered = remove_models_from_data(df)

    # Build a new frame instead of assigning into the filtered slice (which
    # raises SettingWithCopyWarning). The index is reset *after* filtering so
    # it is contiguous again.
    labelled = (
        filtered
        .assign(model=(filtered['model'] != 'human').astype('int64'))
        .rename(columns={'model': 'labels'})
        .reset_index(drop=True)
    )

    # # Remove any ! from data
    # labelled['reply_text'] = labelled['reply_text'].apply(lambda text:text.replace('!',''))

    # preserve_index=False keeps the pandas index from being exported as an
    # extra '__index_level_0__' column.
    dataset = Dataset.from_pandas(labelled, preserve_index=False)
    tokenized_dataset = dataset.map(preprocess_function, batched=True)
    tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    return tokenized_dataset

# Create the datasets from the CSV

This code creates the three datasets used for hyperparameter search, training, and evaluation.

In [None]:
# Build the train / validation / test splits, in that order, from their CSVs.
train_dataset, validation_dataset, test_dataset = (
    process_data(pd.read_csv(csv_path))
    for csv_path in ("train_by_post.csv", "val_by_post.csv", "test_by_post.csv")
)

# Show the train and test datasets as the cell output.
train_dataset, test_dataset

# Create LORA model

This function loads a pre-trained sequence classification model with 8-bit precision, applies LoRA fine-tuning for efficient training, and configures the model with the appropriate tokenizer settings. The result is a model optimized for low-precision computation, ready for use in the Google Colab environment.

In [None]:
def load_model():
    """Load the base classifier in 8-bit and wrap it with a LoRA adapter.

    Reads the module-level ``model_name`` and ``tokenizer``.

    Returns:
        The PEFT-wrapped sequence-classification model (two labels).
    """
    quantization = BitsAndBytesConfig(load_in_8bit=True)
    base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
        quantization_config=quantization,
        trust_remote_code=True,
    )

    # Match the embedding matrix to the tokenizer (which had special tokens
    # added) and tell the model which ids are padding / separator.
    base_model.resize_token_embeddings(len(tokenizer))
    base_model.config.pad_token_id = tokenizer.pad_token_id
    base_model.config.sep_token_id = tokenizer.sep_token_id

    # Prepare the quantized model for training, then attach the adapter.
    base_model = prepare_model_for_kbit_training(base_model)
    adapter_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
    )
    return get_peft_model(base_model, adapter_config)


# Metrics for evaluation

This function calculates key evaluation metrics for model predictions, including accuracy, precision, recall, and F1-score, by comparing predicted labels with true labels. These metrics provide a comprehensive assessment of model performance in the Google Colab environment.

In [None]:
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over axis 1).

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``
        (precision/recall/F1 use ``average='binary'``).
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        precision=precision,
        recall=recall,
        f1=f1,
    )

# Hyperparameter Search

This code performs a hyperparameter search by testing three predefined combinations of learning rates and weight decay values over two training epochs. For each trial, it trains a model, evaluates its performance, and saves the trained model along with the results. The validation accuracy and corresponding hyperparameters are logged to a text file and printed to the console, with all models and logs saved in designated directories within the Google Colab environment.

In [None]:
# The three hyperparameter combinations to try (one trial per entry)
hyperparameter_combinations = [
    {"learning_rate": 0.000135, "weight_decay": 0.000016},
    {"learning_rate": 0.000274, "weight_decay": 0.000528},
    {"learning_rate": 0.000080, "weight_decay": 0.000080},
]

# Every trial trains for the same, fixed number of epochs
num_train_epochs = 2

# Results are written to a text file as well as printed
with open('hyperparameter_search_results.txt', 'w') as f:

    for i, params in enumerate(hyperparameter_combinations):
        learning_rate = params["learning_rate"]
        weight_decay = params["weight_decay"]

        # Training configuration for this trial
        training_args = TrainingArguments(
            output_dir=f'./results/trial_{i+1}',
            evaluation_strategy='epoch',
            learning_rate=learning_rate,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            logging_dir=f'./logs/trial_{i+1}',
            fp16=True,
        )

        # Fresh model and trainer for each trial; evaluation uses the validation split
        model = load_model()
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=validation_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        trainer.train()
        metrics = trainer.evaluate()

        # Save the trained model of this trial
        save_dir = f'./saved_models/trial_{i+1}'
        os.makedirs(save_dir, exist_ok=True)
        trainer.save_model(save_dir)

        # Report hyperparameters and validation accuracy to the console and the file
        summary = (
            f"Trial {i+1}:\n"
            f"  Learning Rate = {learning_rate}, Weight Decay = {weight_decay}\n"
            f"  Validation Accuracy = {metrics['eval_accuracy']}\n\n"
        )
        print(summary, end="")
        f.write(summary)
        # Flush so finished trials are on disk even if a later trial crashes
        f.flush()

        # Release this trial's model before the next one is loaded; otherwise
        # two copies of the model are alive at once while the next trial loads.
        # Note: `model` and `trainer` are therefore not defined after the loop.
        del trainer, model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

print("Hyperparameter search completed. Results saved to 'hyperparameter_search_results.txt'. Models saved to './saved_models/'.")

# Fine-Tune the Model - Trainer

This code sets up and trains a model using a specific set of hyperparameters: a learning rate of 0.000135, weight decay of 0.000016, and two training epochs. The `TrainingArguments` are configured for batch processing, evaluation strategy, logging, and mixed-precision training. After training, the model is saved to a designated directory in the Google Colab environment, ensuring that the directory exists before saving. This process is streamlined for efficient model training and storage.

In [None]:
# Hyperparameters used for the final training run
learning_rate = 0.000135
weight_decay = 0.000016
num_train_epochs = 2

# Results and logs go under a directory derived from `model_name`
training_args = TrainingArguments(
    output_dir=f'./results/our_{model_name}',
    evaluation_strategy='epoch',
    learning_rate=learning_rate,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
    logging_dir=f'./logs/our_{model_name}',
    fp16=True,
)

# Fresh model; the per-epoch evaluation in this run uses the test split
model = load_model()
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

# Create the target directory (including any intermediate directories)
# and save the trained model there
save_dir = f'./saved_models/our_{model_name}'
os.makedirs(save_dir, exist_ok=True)
trainer.save_model(save_dir)



config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5136,0.153726,0.97451,1.0,0.94902,0.973843


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5136,0.153726,0.97451,1.0,0.94902,0.973843
2,0.0326,0.043345,0.990196,0.996032,0.984314,0.990138




# Evaluate

In [None]:
# Run the trainer's evaluation and print the resulting metrics
evaluation = trainer.evaluate()
print(evaluation)

# Predict on the test set; the predicted label is the index of the highest score
predictions = trainer.predict(test_dataset)
test_df = test_dataset.to_pandas().assign(
    predictions=predictions.predictions.argmax(-1)
)

# Keep only the readable columns (input_ids and attention_mask are left out)
output_df = test_df.loc[:, [
    'subreddit_name',
    'conversation_title',
    'top_level_text',
    'reply_text',
    'labels',
    'predictions',
]]

# Write the predictions next to the true labels
output_df.to_csv("test_predictions.csv", index=False)

# Upload model to Google Drive

This code mounts Google Drive to the Google Colab environment and copies the folder containing your trained model from the local environment to a specified location in Google Drive. It dynamically sets the source and destination paths based on the `model_name` variable, ensuring that your model is securely saved for later use. The folder is copied from `./saved_models/our_{model_name}` (the directory the fine-tuning step saves to) to `/content/drive/MyDrive/saved_models/our_{model_name}`.

In [None]:
from google.colab import drive
import shutil

# Mount Google Drive
drive.mount('/content/drive')

# The fine-tuning cell saves the model to ./saved_models/our_{model_name},
# so that is the folder to copy (the 'our_' prefix is required).
source_folder = f"./saved_models/our_{model_name}"
destination_folder = f"/content/drive/MyDrive/saved_models/our_{model_name}"

# Copy the entire folder. dirs_exist_ok=True lets this cell be re-run when the
# destination already exists instead of raising FileExistsError.
shutil.copytree(source_folder, destination_folder, dirs_exist_ok=True)

print(f"Folder {source_folder} copied to {destination_folder}")


# Load saved model from local

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive (if it is already mounted, Colab only
# reports that and does nothing else)
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


This function, `load_fine_tuned_model`, loads a fine-tuned sequence classification model and its corresponding tokenizer from a specified directory. It then recreates the `Trainer` object with the loaded model and tokenizer, enabling further use or evaluation. This setup facilitates easy restoration of your fine-tuned model in the Google Colab environment. The function is demonstrated with an example that loads a model from the `./Llama` directory.

In [None]:

def load_fine_tuned_model(model_name, save_dir):
    """Load a fine-tuned classifier and its tokenizer, and wrap them in a Trainer.

    Args:
        model_name: Identifier of the base checkpoint. Used as the tokenizer
            source only when ``save_dir`` contains no saved tokenizer.
        save_dir: Directory holding the saved fine-tuned model.

    Returns:
        Tuple ``(model, tokenizer, trainer)``.
    """
    # Load the model
    model = AutoModelForSequenceClassification.from_pretrained(save_dir)

    # Prefer the tokenizer saved next to the model: it carries the pad and
    # '[SEP]' special tokens that were configured before training, which the
    # base checkpoint's tokenizer does not have.
    has_saved_tokenizer = os.path.isfile(os.path.join(save_dir, 'tokenizer_config.json'))
    tokenizer = AutoTokenizer.from_pretrained(save_dir if has_saved_tokenizer else model_name)

    # Mirror load_model(): the embedding matrix must cover every tokenizer id,
    # and the model needs to know the padding id to handle padded batches.
    # NOTE(review): a row added by this resize is newly initialized - confirm
    # whether the saved model already includes the resized embeddings.
    if len(tokenizer) > model.get_input_embeddings().num_embeddings:
        model.resize_token_embeddings(len(tokenizer))
    if tokenizer.pad_token_id is not None:
        model.config.pad_token_id = tokenizer.pad_token_id

    # Recreate the trainer with the loaded model and tokenizer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
    )

    return model, tokenizer, trainer

# # Example usage:
# save_dir = './Llama'
# model, tokenizer, trainer = load_fine_tuned_model(model_name, save_dir)