### Reference: [Fine-Tuning Llama 3.1 for Text Classification](https://www.datacamp.com/tutorial/fine-tuning-llama-3-1)

In [None]:
%pip install bitsandbytes
%pip install peft
%pip install trl

In [2]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: an access token must never be committed to the notebook. The token
# that used to be hardcoded in this cell has to be treated as leaked and should
# be revoked at https://huggingface.co/settings/tokens.
#
# The token is taken from the HF_TOKEN environment variable when it is set;
# otherwise it is requested interactively (input is not echoed), so the secret
# never ends up in the saved file.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import numpy as np
import pandas as pd
import os
import wandb
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [4]:
# Read the news spreadsheet, then discard its last two columns.
df = pd.read_excel('/kaggle/input/news-less-clean/news_less_clean.xlsx')
df = df.iloc[:, :-2]

# Numeric LABEL code -> human-readable category name.
# Codes start at 1 and follow the order of the list below.
label_mapping = dict(enumerate(
    [
        'Vessel Delay',
        'Vessel Accidents',
        'Maritime Piracy or Terrorism risk',
        'Port or Important Route Congestion',
        'Port Criminal Activities',
        'Cargo Damage and Loss',
        'Inland Transportation Risks',
        'Environmental Impact and Pollution',
        'Natural Extreme Events and Extreme Weather',
        'Cargo or Ship Detainment',
        'Unstable Regulatory and Political Environment',
        'Maritime-related but not covered by existing categories',
        'Non-maritime-related',
    ],
    start=1,
))

# Add a 'Category' column holding the name for each row's LABEL code
# (codes missing from the mapping become NaN).
df = df.assign(Category=df['LABEL'].map(label_mapping))

# Preview the first rows.
print(df.head())

               Date                                                URL  \
0  20240815T010000Z  https://borneobulletin.com.bn/explosions-repor...   
1  20240716T194500Z  https://www.hindustantimes.com/india-news/crew...   
2  20240809T100000Z  https://www.yahoo.com/news/multiple-attacks-ta...   
3  20240717T041500Z  https://timesofoman.com/article/147862-oil-tan...   
4  20240812T201500Z  https://menafn.com/1108546043/Multiple-Attacks...   

                                               Title                 Source  \
0  Explosions reported near two ships off Yemen :...  borneobulletin.com.bn   
1  Crew , including 13 Indians , still missing af...     hindustantimes.com   
2  Multiple attacks target merchant ship off Yeme...              yahoo.com   
3  Oil tanker with 13 Indians on board sinks off ...        timesofoman.com   
4    Multiple Attacks Target Merchant Ship Off Yemen             menafn.com   

         Country  LABEL                           Category  
0         Brunei   

In [5]:
# Hub checkpoint shared by the tokenizer and the causal-LM weights.
base_model_name = "meta-llama/Llama-3.2-1B"

# Tokenizer: the EOS token id is reused as the padding token id.
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id

# Model: float16 weights with automatic device placement.
# NOTE: 4-bit bitsandbytes quantization is currently disabled. The config that
# was switched off was BitsAndBytesConfig(load_in_4bit=True,
# bnb_4bit_use_double_quant=False, bnb_4bit_quant_type="nf4",
# bnb_4bit_compute_dtype="float16"), passed as `quantization_config`.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name, device_map="auto", torch_dtype="float16"
)

# Config overrides applied after loading (presumably in preparation for
# fine-tuning -- confirm against the training cells).
model.config.pretraining_tp = 1
model.config.use_cache = False

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [8]:
# Seed the Python, NumPy and PyTorch RNGs so the sampled completion below is
# repeatable on a fresh "Restart & Run All" (as far as the hardware allows).
transformers.set_seed(42)

# Text-generation pipeline around the already-loaded model and tokenizer.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=500,
    # top_p / temperature only take effect when sampling is enabled; state it
    # explicitly instead of relying on the checkpoint's generation config.
    do_sample=True,
    top_p=0.7,
    temperature=0.7,
    repetition_penalty=1.2,
    return_full_text=False,  # return only the completion, not the prompt
)

# Zero-shot prompt: a headline plus its link, followed by the instruction.
prompt = """
Headline: Singapore-flagged tanker involved in collision near Pedra Branca: Ship operator.
Article Link: https://www.straitstimes.com/singapore/light-oil-sheens-reported-near-s-pore-flagged-tanker-involved-in-collision-off-pedra-branca-mpa
Recommend general mitigation strategies specifically for the above incident in the Singapore context.
"""

result = pipe(prompt)

print(result[0]['generated_text'])

1.	General Mitigation Strategies:
i)	Avoid excessive speed and manoeuvring, especially at night or when visibility is limited;
ii)	Pass through narrow channels with caution; ensure that all vessels are able to safely pass through these areas without any adverse impact on navigation safety;
iii)	Use appropriate navigational aids such as lights (including radar), charts, and electronic aids-to-navigation (A2N);
iv)	Carry out proper ship handling procedures including maintaining a safe distance from other ships;
v)	Monitor vessel traffic closely by using AIS systems where available;
vi)	Report incidents immediately upon discovery of an accident involving another vessel;
vii)	Raise alarms promptly if there is fire aboard your own vessel so it can be extinguished before it spreads to nearby vessels;
viii)	Ensure crew members know their roles and responsibilities during emergencies;
ix)	Use good seamanship practices which include being aware of local weather conditions and taking necessary p