# Text Classification - Inference LLM (off-the-shelf model)


## $\color{blue}{Sections:}$
* Preamble
* Admin - importing libraries
* Data - Load dataset
* Model
* Inference

## $\color{blue}{Preamble:}$

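Here we use the off-the-shelf Mistral-7B-Instruct-v0.2 model, loaded with 4-bit quantization and no fine-tuning, to classify each text passage into one of six classes by prompting. The predictions are scored against the gold labels on the dev split.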

## $\color{blue}{Admin:}$

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive inside the Colab runtime so files stored there are reachable.
drive.mount("/content/drive")
# Make the Drive root the working directory: the relative "class/datasets/"
# paths used later in the notebook are resolved against it.
%cd '/content/drive/MyDrive/'

Mounted at /content/drive
/content/drive/MyDrive


In [None]:
%%capture
!pip install dill

In [None]:
import dill
def save_data(docs, filename):
    """Serialize `docs` with dill and write the result to `filename`.

    Args:
        docs: Object to persist (the helper was written with a list of
            Langchain Documents in mind).
        filename: Destination path. It is opened in binary write mode, so an
            existing file at that path is replaced.
    """
    with open(filename, 'wb') as sink:
        dill.dump(docs, sink)
    print(f"Documents saved to {filename}")

def load_data(filename):
    """Read `filename` and return whatever object dill deserializes from it.

    NOTE(review): dill, like pickle, can run arbitrary code while loading, so
    this should only be pointed at files from a trusted source.

    Args:
        filename: Path to a file previously written with dill.

    Returns:
        The deserialized object.
    """
    with open(filename, 'rb') as source:
        loaded = dill.load(source)
    print(f"Documents loaded from {filename}")
    return loaded

## $\color{blue}{Data:}$

In [None]:
%%capture
!pip install datasets

In [None]:
# Directory holding the dill-serialized splits (relative to the current working directory).
path = "class/datasets/"
# Load the three splits in train, dev, test order.
trainDataset, devDataset, testDataset = (
    load_data(path + name)
    for name in ("Dataset_train", "Dataset_dev", "Dataset_test")
)

Documents loaded from class/datasets/Dataset_train
Documents loaded from class/datasets/Dataset_dev
Documents loaded from class/datasets/Dataset_test


## $\color{blue}{Model:}$

In [None]:
!pip install -qU bitsandbytes accelerate loralib transformers peft

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
torch.cuda.is_available()

True

In [None]:
import os
from getpass import getpass
from huggingface_hub import login

# Prompt for your Hugging Face token securely: getpass reads the value without
# echoing it, so the token is never written into the notebook source or output.
token = getpass("Please enter your Hugging Face token: ")

Please enter your Hugging Face token: ··········


In [None]:
# Log in to the Hugging Face Hub only when a token was supplied;
# an empty entry skips the login step entirely.
if not token:
    print("Continuing without Hugging Face login")
else:
    print("HuggingFace token has been successfully entered.")
    login(token=token)

HuggingFace token has been successfully entered.


In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face Hub id of the off-the-shelf instruct checkpoint used for inference.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    # This notebook only generates (no training), so keep the key/value cache
    # enabled. With it disabled, every new token recomputes attention over the
    # entire prompt, which slows generation without changing the result.
    use_cache=True,
    # Let accelerate decide where to place the model weights.
    device_map="auto"
)

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [None]:
# Load the tokenizer that belongs to the checkpoint named by `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the unknown token for padding so that `tokenizer.pad_token_id`, which is
# passed to generate() later, is set.
# NOTE(review): presumably needed because this tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.unk_token
# Pad on the right.
# NOTE(review): prompts are tokenized one at a time in this notebook, so the padding
# side presumably never takes effect; revisit before batching generation.
tokenizer.padding_side = 'right'

tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

## $\color{blue}{Inference:}$

In [None]:

def generate_prompt(example, return_response=True):
  """Build the classification prompt for a single example.

  The instruction and the example's text are wrapped in [INST] ... [/INST]
  tags and followed by an "Answer: " cue.

  Args:
    example: Mapping with the passage under 'input' and, when
      `return_response` is true, the label text under 'output'.
    return_response: If true, the example's 'output' is appended after the
      "Answer: " cue; if false, the prompt stops at the cue.

  Returns:
    A one-element list containing the prompt string.
  """
  instruction_lines = [
      "[INST]Read the Text, choose the correct classification from the list below. Give no explanation. Provide a single word response from the list.",
      "YOU MUST ANSWER WITH ONE WORD.",
      "",
      "Telemachia",
      "Odyssey",
      "Nostros",
      "Dubliners",
      "Dracula",
      "Republic",
      "",
      "###Input:",
      "",
      "Text: ",
  ]
  prompt = "\n".join(instruction_lines)
  prompt = f"{prompt}{example['input']}[/INST]\nAnswer: "
  if return_response:
    prompt = f"{prompt}{example['output']}"
  return [prompt]

In [None]:
import re
def get_response(example):
  """Greedy-decode the model's answer for one example.

  Builds the prompt without the gold answer, tokenizes it (truncated to 2048
  tokens), and lets the model generate at most four new tokens.

  NOTE(review): truncation presumably cuts from the end of the prompt, which
  would drop the closing "[/INST]\nAnswer: " cue on very long passages —
  confirm that inputs stay well under the limit.

  Args:
    example: Mapping with the passage under 'input' (see generate_prompt).

  Returns:
    The decoded sequence as one string: the prompt followed by the newly
    generated tokens, special tokens included.
  """
  prompt = generate_prompt(example, return_response=False)[0]
  tokens = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
  # Move ids and attention mask to the model's device together. Previously only
  # input_ids were sent to the GPU while the mask stayed on the CPU.
  tokens = tokens.to(base_model.device)
  outputs = base_model.generate(
      input_ids=tokens["input_ids"],
      attention_mask=tokens["attention_mask"],
      max_new_tokens=4,
      do_sample=False,
      eos_token_id=tokenizer.eos_token_id,
      pad_token_id=tokenizer.pad_token_id,
  )
  decoded = tokenizer.batch_decode(outputs.detach().cpu().numpy())[0]
  return decoded

In [None]:
import pandas as pd
path = "class/datasets/" # modify path
# Read the pickled train, dev and test DataFrames (in that order) from `path`.
# NOTE(review): read_pickle executes pickle data; only load trusted files.
df_train, df_dev, df_test = (
    pd.read_pickle(path + name) for name in ("df_train", "df_dev", "df_test")
)

In [None]:
df_dev.columns

Index(['index', 'master', 'book_idx', 'book', 'chapter_idx', 'chapter',
       'author', 'content', 'vanilla_embedding', 'vanilla_preds',
       'vanilla_pseudo_book', 'vanilla_moe_e2e_soft_preds',
       'vanilla_moe_e2e_soft_pseudo_book', 'vanilla_moe_e2e_hard_preds',
       'vanilla_moe_e2e_hard_pseudo_book', 'vanilla_moe_e2e_soft_forest_preds',
       'vanilla_moe_e2e_soft_forest_pseudo_book', 'vanilla_moe_hard_pre_preds',
       'vanilla_moe_hard_pre_pseudo_book', 'vanilla_embedding.1',
       'direct_ft_preds', 'direct_ft_pseudo_book', 'ft_embedding',
       'embedding_ft_preds', 'embedding_ft_pseudo_book', 'direct_ft_moe_preds',
       'direct_ft_moe_pseudo_book', 'ft_embedding_pal', 'mistral_ots_book',
       'mistral_ft_book'],
      dtype='object')

In [None]:
# Maps a lower-cased answer word (full title or abbreviation) to an integer class id.
# Each tuple below lists the accepted spellings for one class; its position is the id.
# NOTE(review): ids are presumably the same numbering as df_dev['book_idx'] — confirm.
# The None entry maps to the string "unknown" rather than an int; transform_outs
# never looks it up.
conv = {
    None: "unknown",
    **{
        alias: class_id
        for class_id, aliases in enumerate((
            ("telemachia",),
            ("odyssey", "od"),
            ("nostros", "nost"),
            ("dubliners", "dub"),
            ("dracula", "drac"),
            ("republic",),
        ))
        for alias in aliases
    },
}

In [None]:
from tqdm import tqdm

# Decoded model outputs for the dev split, one string per example, in dataset order.
outs = []

# Examples are generated one at a time; the recorded run below took roughly
# 9 minutes for 964 examples. Appending inside the loop means the results
# collected so far remain in `outs` if the run is interrupted.
for i in tqdm(range(len(devDataset))):
  outs.append(get_response(devDataset[i]))


100%|██████████| 964/964 [09:13<00:00,  1.74it/s]


In [None]:
def parse(text):
  """Extract the model's one-word answer from a decoded generation.

  The decoded text contains the whole prompt (instruction and passage) followed
  by the generated tokens. Only the part after the final "[/INST]" tag is
  searched, so an "Answer: ..." that happens to occur inside the passage is
  not mistaken for the model's reply. If the tag is absent, the whole text is
  searched.

  Args:
    text: Decoded output string.

  Returns:
    The first word following "Answer:" in lower case, or "unknown" when no
    such word is found.
  """
  completion = text.rsplit("[/INST]", 1)[-1]
  # Match 'Answer:' followed by whitespace and then capture the next word
  # (\w+ matches word characters, equivalent to [a-zA-Z0-9_]).
  m = re.search(r'Answer:\s+(\w+)', completion)
  if m:
    return m.group(1).lower()
  return "unknown"

# Reduce every decoded output to its lower-cased answer word ("unknown" when none is found).
outs_clean = list(map(parse, outs))

In [None]:
outs_clean[0:5]

['nostros', 'telemachia', 'odyssey', 'nostros', 'nostros']

In [None]:
def transform_outs(response):
  """Convert a parsed answer word to its integer class id.

  Args:
    response: Lower-cased answer word as produced by parse().

  Returns:
    The id stored in `conv` for a recognised title or abbreviation,
    otherwise -1.
  """
  recognised = (
      "odyssey", "nostros", "dubliners", "telemachia", "dracula",
      "republic", "drac", "od", "nost", "dub",
  )
  return conv[response] if response in recognised else -1

# Integer class id per dev example; -1 marks answers that are not in the label list.
outs_conv = list(map(transform_outs, outs_clean))

In [None]:
outs[1]

'<s> [INST]Read the Text, choose the correct classification from the list below. Give no explanation. Provide a single word response from the list.\nYOU MUST ANSWER WITH ONE WORD.\n\nTelemachia\nOdyssey\nNostros\nDubliners\nDracula\nRepublic\n\n###Input:\n\nText: The bride who was given away by her father, the M’Conifer of the Glands, looked exquisitely charming in a creation carried out in green mercerised silk, moulded on an underslip of gloaming grey, sashed with a yoke of broad emerald and finished with a triple flounce of darkerhued fringe,[/INST]\nAnswer:  Telemachia'

In [None]:
parse(outs[1])

'telemachia'

In [None]:
# Store the predicted class ids on df_dev, replacing the existing
# 'mistral_ots_book' column in place (-1 marks unrecognised answers).
# NOTE(review): assumes devDataset and df_dev list the same examples in the
# same order — TODO confirm, since predictions are assigned positionally.
df_dev["mistral_ots_book"] = outs_conv

In [None]:
df_dev['mistral_ots_book'].value_counts()

Unnamed: 0_level_0,count
mistral_ots_book,Unnamed: 1_level_1
1,319
2,210
3,187
0,86
4,82
-1,50
5,30


In [None]:
# Dev accuracy: the share of rows whose predicted id equals 'book_idx'.
# NOTE(review): rows with prediction -1 presumably always count as misses — confirm book_idx is never -1.
(df_dev['mistral_ots_book'] == df_dev["book_idx"]).mean()

0.3132780082987552

In [None]:
path = "class/datasets/" # modify path
# Write df_dev, including its 'mistral_ots_book' column, back to the same
# "df_dev" file it was read from; the previous file contents are overwritten.
df_dev.to_pickle(path + "df_dev")
