<a href="https://colab.research.google.com/github/preetamjumech/LLM/blob/main/Fine_tuning_Llama_3_2_on_with_a_single_GPU_using_torchtune%7C_Training_LLM_for_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
# Upgrade pip itself, then install pinned versions of the fine-tuning stack.
# -qqq and --progress-bar off keep the cell output quiet.
!pip install -Uqqq pip --progress-bar off
!pip install -qqq torchtune==0.3.1 --progress-bar off
!pip install -qqq torchao==0.6.1 --progress-bar off
!pip install -qqq transformers==4.46.1 --progress-bar off

In [None]:
import json
import re
from pathlib import Path
from typing import List

import matplotlib as mpl
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from google.colab import userdata
from matplotlib.ticker import PercentFormatter
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

%matplotlib inline
%config InlineBackend.figure_format='retina'

# Pastel palette shared by the plots in this notebook.
COLORS = ["#bae1ff", "#ffb3ba", "#ffdfba", "#ffffba", "#baffc9"]

# Base seaborn theme first, then replace its palette with COLORS.
sns.set(style="whitegrid", palette="muted", font_scale=1.2)
sns.set_palette(sns.color_palette(COLORS))

# Two-stop colormap built from the first two palette colours.
cmap = colors.LinearSegmentedColormap.from_list("custom_cmap", COLORS[:2])

# Matplotlib rcParams overrides: black figure/axes background, white text,
# ticks and edges, and a thin dashed gray grid.
MY_STYLE = {
    "figure.facecolor": "black",
    "axes.facecolor": "black",
    "axes.edgecolor": "white",
    "axes.labelcolor": "white",
    "axes.linewidth": 0.5,
    "text.color": "white",
    "xtick.color": "white",
    "ytick.color": "white",
    "grid.color": "gray",
    "grid.linestyle": "--",
    "grid.linewidth": 0.5,
    "axes.grid": True,
    "xtick.labelsize": "medium",
    "ytick.labelsize": "medium",
    "axes.titlesize": "large",
    "axes.labelsize": "large",
    "lines.color": COLORS[0],
    "patch.edgecolor": "white",
}

# Apply the overrides globally so every later figure picks them up.
mpl.rcParams.update(MY_STYLE)

# Single seed shared by NumPy, PyTorch and the scikit-learn train/test split.
RANDOM_SEED = 42

# Seeds NumPy's global generator, which NumPy-backed sampling falls back to
# when no explicit random_state is given.
np.random.seed(RANDOM_SEED)
# Text generation draws tokens through PyTorch, so seed it as well; otherwise
# sampled model outputs can differ between runs. The trailing semicolon hides
# the Generator repr that would otherwise be shown as the cell output.
torch.manual_seed(RANDOM_SEED);

In [2]:
!gdown 1UD8fN6JvbJkBOPhfwjlaIONCE45hVH7Q

Downloading...
From: https://drive.google.com/uc?id=1UD8fN6JvbJkBOPhfwjlaIONCE45hVH7Q
To: /content/mental-health-sentiment.csv
100% 31.5M/31.5M [00:00<00:00, 154MB/s]


In [None]:
df = pd.read_csv("mental-health-sentiment.csv")

In [3]:
df.head()

In [None]:
# Length of each statement, measured as the number of pieces obtained by
# splitting on single spaces.
df["word_count"] = df["statement"].map(lambda text: len(text.split(" ")))

In [None]:
# Histogram of statement lengths. Every row is weighted 1/N so the bar heights
# are fractions of the dataset, which the formatter then renders as percentages.
plt.hist(
    df["word_count"],
    bins=30,
    weights=np.ones(len(df)) / len(df),
)
plt.gca().yaxis.set_major_formatter(PercentFormatter(xmax=1))
plt.xlabel("Words")
plt.ylabel("Percentage")
plt.ylim(bottom=0, top=1)
plt.show()

In [None]:
df = df[["statement", "status", "word_count"]]

In [None]:
df.shape

In [None]:
# Relative frequency of each status label, drawn as a bar chart.
status_counts = df["status"].value_counts(normalize=True)
plt.bar(x=status_counts.index, height=status_counts.values)
plt.gca().yaxis.set_major_formatter(PercentFormatter(xmax=1))
plt.xlabel("Status")
plt.ylabel("Percentage")
plt.ylim(bottom=0, top=0.5)
plt.xticks(rotation=90)
plt.show()

In [None]:
len(df[df.word_count < 1000]) / len(df)

In [None]:
# Keep only statements shorter than 1000 space-separated pieces, then show
# the shape of what is left.
df = df.loc[df["word_count"] < 1000]
df.shape

In [None]:
# Cap every status class at `max_samples` rows; classes with fewer rows are
# kept whole.
max_samples = 5000
df_sampled = df.groupby("status")[["statement", "status"]].apply(
    # A fixed random_state makes the draw repeatable even when this cell is
    # re-run on its own, instead of depending on how much of the global NumPy
    # random stream has already been consumed.
    lambda x: x.sample(n=min(len(x), max_samples), random_state=RANDOM_SEED)
)
# Discard the index produced by groupby/apply and renumber rows from 0.
df_sampled = df_sampled.reset_index(drop=True)
df_sampled.head()

In [None]:
df_sampled.shape

In [None]:
# Hold out 20% of the sampled rows for evaluation; the fixed random_state
# keeps the split stable between runs.
train_df, test_df = train_test_split(
    df_sampled, test_size=0.2, random_state=RANDOM_SEED
)
train_df.shape, test_df.shape

In [None]:
# Labels offered to the model in the prompt. They are lower-case because
# create_dataset lower-cases the `status` column before storing it as the
# expected output.
class_names = [
    "normal",
    "depression",
    "suicidal",
    "anxiety",
    "bipolar",
    "stress",
    "personality disorder",
]


In [None]:
def create_prompt(statement: str, class_names: List[str]) -> str:
    """Build the classification prompt for a single statement.

    Args:
        statement: The raw text to classify.
        class_names: Category labels the model is allowed to answer with.

    Returns:
        The prompt: the statement wrapped in <text> tags, followed by the
        comma-separated categories and the answering instructions.
    """
    # The template must contain the {text} placeholder. Without it str.format
    # silently ignores the `text` argument and the statement never reaches
    # the model.
    prompt = """
Classify the text for one of the categories:

<text>
{text}
</text>

Choose from one of the category:
{classes}
Only choose one category, the most appropriate one. Reply only with the category.
""".strip()
    return prompt.format(text=statement, classes=", ".join(class_names))


In [None]:
def create_dataset(df):
    """Convert a dataframe of labelled statements into prompt/answer records.

    Each row becomes a dict with an "input" key (the prompt built by
    create_prompt from the row's `statement` and the module-level
    `class_names`) and an "output" key (the row's `status`, lower-cased).
    Returns the records as a list, in row order.
    """
    return [
        {
            "input": create_prompt(record.statement, class_names),
            "output": record.status.lower(),
        }
        for _, record in tqdm(df.iterrows())
    ]

In [None]:
train_rows = create_dataset(train_df)
test_rows = create_dataset(test_df)

In [None]:
# Persist both splits as JSON: train_data.json is the file the torchtune
# dataset config points at, and test_data.json is reloaded for the final
# evaluation of the fine-tuned model.
Path("train_data.json").write_text(json.dumps(train_rows))
Path("test_data.json").write_text(json.dumps(test_rows))

In [None]:
hf_token = userdata.get("HF_TOKEN")

In [None]:
# Download the base model into ./Llama-3.2-1B-Instruct. {hf_token} is filled in
# by IPython from the Python variable of the same name.
# NOTE(review): --ignore-patterns "[]" presumably overrides the CLI's default
# ignore list so that model.safetensors (the file the checkpointer config
# loads) is fetched too - confirm against the torchtune CLI docs.
!tune download "meta-llama/Llama-3.2-1B-Instruct" \
  --output-dir "./Llama-3.2-1B-Instruct" \
  --hf-token "{hf_token}" \
  --ignore-patterns "[]"

In [None]:
# Baseline: a text-generation pipeline over the model in the local download
# directory, loaded in bfloat16 with device placement left to
# device_map="auto".
model_id = "./Llama-3.2-1B-Instruct"
generator = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
%%time
predictions = []
true_values = []
for row in tqdm(test_rows):
    # row["input"] is already the complete prompt built by create_dataset;
    # passing it through create_prompt again would nest one prompt (and its
    # instructions) inside another.
    messages = [{"role": "user", "content": row["input"]}]
    outputs = generator(
        messages, max_new_tokens=32, pad_token_id=generator.tokenizer.eos_token_id
    )
    # generated_text holds a list of messages here; the last one is taken as
    # the model's reply and lower-cased to match the lower-case labels.
    predictions.append(outputs[0]["generated_text"][-1]["content"].lower())
    true_values.append(row["output"])


In [None]:
# Strip runs of non-word characters (punctuation, quotes, whitespace) from both
# ends of every reply so that e.g. "anxiety." compares equal to "anxiety".
regex = r"^\W+|\W+$"
predictions = list(map(lambda reply: re.sub(regex, "", reply), predictions))

In [None]:
len(true_values), len(predictions)

In [None]:
pd.Series(predictions).value_counts()

In [None]:
accuracy_score(true_values, predictions)

In [None]:
eval_df = pd.DataFrame.from_dict({"label": true_values, "prediction": predictions})
eval_df.head()

In [None]:
len(eval_df[~eval_df.prediction.isin(class_names)])

In [None]:
# Keep only rows whose prediction is exactly one of class_names; every other
# reply is dropped from eval_df before the classification report is built.
eval_df = eval_df[eval_df.prediction.isin(class_names)]

In [None]:
print(classification_report(eval_df.label, eval_df.prediction))

# Fine-tuning

In [None]:
# torchtune LoRA recipe configuration; a later cell writes it to
# custom_config.yaml. This is a plain (non-f) string, so `${output_dir}` is kept
# literally: it is a config interpolation that refers to the top-level
# `output_dir` key when the YAML is loaded.
config = """
# Model Arguments
model:
  _component_: torchtune.models.llama3_2.lora_llama3_2_1b
  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
  apply_lora_to_mlp: True
  apply_lora_to_output: False
  lora_rank: 64
  lora_alpha: 128
  lora_dropout: 0.0

# Tokenizer
tokenizer:
  _component_: torchtune.models.llama3.llama3_tokenizer
  path: ./Llama-3.2-1B-Instruct/original/tokenizer.model
  max_seq_len: null

checkpointer:
  _component_: torchtune.training.FullModelHFCheckpointer
  checkpoint_dir: ./Llama-3.2-1B-Instruct
  checkpoint_files: [
    model.safetensors
  ]
  recipe_checkpoint: null
  output_dir: ./checkpoints
  model_type: LLAMA3_2
resume_from_checkpoint: False
save_adapter_weights_only: False

# Dataset and Sampler
dataset:
  _component_: torchtune.datasets.instruct_dataset
  data_files: ./train_data.json
  source: json
  split: train
seed: 42
shuffle: True
# batch_size: 1
batch_size: 4

# Optimizer and Scheduler
optimizer:
  _component_: torch.optim.AdamW
  fused: True
  weight_decay: 0.01
  lr: 3e-4
lr_scheduler:
  _component_: torchtune.modules.get_cosine_schedule_with_warmup
  num_warmup_steps: 100

loss:
  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss

# Training
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 4
compile: False # set it to True for better memory and performance
# compile: True # set it to True for better memory and performance

# Logging
output_dir: ./logs
metric_logger:
  _component_: torchtune.training.metric_logging.TensorBoardLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
log_peak_memory_stats: False

# Environment
device: cuda
dtype: bf16

# Activations Memory
enable_activation_checkpointing: False
enable_activation_offloading: False

# Profiler (disabled)
profiler:
  _component_: torchtune.training.setup_torch_profiler
  enabled: False

  #Output directory of trace artifacts
  output_dir: ${output_dir}/profiling_outputs

  #`torch.profiler.ProfilerActivity` types to trace
  cpu: True
  cuda: True

  #trace options passed to `torch.profiler.profile`
  profile_memory: False
  with_stack: False
  record_shapes: True
  with_flops: False

  # `torch.profiler.schedule` options:
  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
  wait_steps: 5
  warmup_steps: 5
  active_steps: 2
  num_cycles: 1
"""

In [None]:
Path("custom_config.yaml").write_text(config)

In [None]:
# -p keeps the cell re-runnable: plain mkdir reports an error once the
# directory already exists.
!mkdir -p checkpoints

In [None]:
# Launch the single-device LoRA fine-tuning recipe with custom_config.yaml.
# NOTE(review): the trailing epochs=1 looks like a command-line override of the
# config's `epochs` key (same value as in the file) - confirm.
!tune run lora_finetune_single_device --config "custom_config.yaml" epochs=1

In [None]:
%load_ext tensorboard
%tensorboard --logdir "logs"

# Upload the model

In [None]:
# -p keeps the cell re-runnable: plain mkdir reports an error once the
# directory already exists.
!mkdir -p hf_repo

In [None]:
# Assemble the folder that gets uploaded: the checkpoint file from checkpoints/
# is copied under the name pytorch_model.bin together with config.json, while
# the tokenizer files, generation config, licence, README and use policy are
# taken from the base model download.
!cp "checkpoints/hf_model_0001_0.pt" "hf_repo/pytorch_model.bin"
!cp "checkpoints/config.json" "hf_repo/"
!cp "Llama-3.2-1B-Instruct/original/tokenizer.model" "hf_repo/"
!cp "Llama-3.2-1B-Instruct/generation_config.json" "hf_repo/"
!cp "Llama-3.2-1B-Instruct/tokenizer.json" "hf_repo/"
!cp "Llama-3.2-1B-Instruct/tokenizer_config.json" "hf_repo/"
!cp "Llama-3.2-1B-Instruct/special_tokens_map.json" "hf_repo/"
!cp "Llama-3.2-1B-Instruct/LICENSE.txt" "hf_repo/"
!cp "Llama-3.2-1B-Instruct/README.md" "hf_repo/"
!cp "Llama-3.2-1B-Instruct/USE_POLICY.md" "hf_repo/"

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
!huggingface-cli upload "curiousily/Llama-3.2-1B-Mental-Health-Sentiment" "hf_repo"

In [None]:
# Hub repository holding the fine-tuned weights.
MODEL_PATH = "curiousily/Llama-3.2-1B-Mental-Health-Sentiment"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# Load in bfloat16 right here, matching the baseline pipeline. The dtype has to
# be chosen at load time: a dtype handed to pipeline() via model_kwargs is only
# used when the pipeline loads the model itself, not for a ready-made instance.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, torch_dtype=torch.bfloat16
).to("cuda")

# Evaluation

In [None]:
# Text-generation pipeline around the already-loaded fine-tuned model and its
# tokenizer. Up to 128 new tokens are generated per call, padding uses the EOS
# token id, and return_full_text=False asks for the completion without the
# prompt.
# NOTE(review): `model_kwargs` looks like it has no effect here, because `model`
# is an instantiated object rather than a name for the pipeline to load -
# confirm against the transformers pipeline docs.
# NOTE(review): temperature=0.000001 presumably aims at near-greedy decoding;
# whether it applies depends on sampling being enabled in the model's
# generation config - confirm.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    temperature=0.000001,
    return_full_text=False,
    model_kwargs={"torch_dtype": torch.bfloat16},
    pad_token_id=tokenizer.eos_token_id,
    device="cuda",
)

In [None]:
test_data = json.loads(Path("test_data.json").read_text())

In [None]:
prompt = test_data[4]["input"]
print(prompt)

In [None]:
# Run the fine-tuned model over every test example. The stored "input" is
# already the complete prompt, so it is sent unchanged as a single user message.
predictions = []
for example in tqdm(test_data):
    statement = example["input"]
    prompt = [
        {"role": "user", "content": statement},
    ]
    output = pipe(prompt)
    # Stored as-is (no lower-casing or punctuation stripping).
    # NOTE(review): presumably just the reply text, since the pipeline was
    # built with return_full_text=False - confirm.
    predictions.append(output[0]["generated_text"])

In [None]:
pd.Series(predictions).value_counts()

In [None]:
# Pair the fine-tuned predictions with the gold labels. `true_values` is the
# list collected during the baseline evaluation loop over test_rows; test_data
# was read back from the JSON dump of those same rows, so the order lines up.
eval_df = pd.DataFrame.from_dict({"label": true_values, "prediction": predictions})
eval_df.head()

In [None]:
accuracy_score(true_values, predictions)

In [None]:
print(classification_report(eval_df.label, eval_df.prediction))