In [None]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.1


In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig


# -------------------------------
# CONFIG
# -------------------------------

model_name = "mistralai/Mistral-7B-Instruct-v0.3"
device = "cuda" if torch.cuda.is_available() else "cpu"


# -------------------------------
# Quantization config for 4-bit
# -------------------------------

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16"
)

# -------------------------------
# Load tokenizer
# -------------------------------
tokenizer = AutoTokenizer.from_pretrained(model_name)


# -------------------------------
# Load model with quantization
# -------------------------------
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config
  )


print("Model loaded on:", device)









The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Model loaded on: cuda


In [None]:
# -------------------------------
# FEW-SHOT PROMPT TEMPLATE
# -------------------------------
def build_prompt(instruction: str,solution_code:str) -> str:
    return f"""
You are a strict classification engine.

You will be given:
- A problem instruction
- A Python solution code

Your task:
- Identify the SINGLE most relevant data structure or algorithm
  used in the solution code
- The solution code has higher priority than the instruction

CRITICAL RULES:
- Output EXACTLY ONE category from the allowed list
- Output ONLY the category name
- Do NOT explain
- Do NOT repeat the question
- Do NOT add text before or after
- If unsure, choose the closest reasonable category
- Use i_dont_know ONLY as a last resort

Allowed categories:
optimize, array, linked_list, stack, queue, stack_queue, hashing,
graph, tree, heap, dynamic_programming, backtracking, bit_manipulation,
matrix_grid, string, two_pointers, sliding_window, sorting_searching,
regex, math, geometry, file, machine_learning,deep_learning, image_processing, fixing, web_dev, embedded_system, data_analysis, data_visualization, i_dont_know

--------------------------------
EXAMPLES
--------------------------------

Instruction:
Find the maximum sum subarray.

Solution:
Uses a running sum (Kadane's algorithm).

Category:
dynamic_programming

Instruction:
Count character frequency in a string.

Solution:
Uses a dictionary.

Category:
hashing

--------------------------------
CLASSIFY
--------------------------------

Instruction:
{instruction}

Solution:
{solution_code}

Category:

""".strip()


In [None]:

# - Makes the model clearly **distinguish the solution from the instruction**, so it won’t try to “rewrite the code” or echo the prompt.

# ---

# ### D. Updated `classify_question` for Hugging Face

# ```python
@torch.no_grad()
def classify_question(instruction: str, solution_code: str) -> str:
    prompt = build_prompt(instruction, solution_code)

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048
    ).to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=30,
        do_sample=False,
        temperature=0.0,
        pad_token_id=tokenizer.eos_token_id
    )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    pred = decoded.split("Category:")[-1].strip().lower()

    return pred



In [None]:
# -------------------------------
# BATCH CLASSIFICATION
# -------------------------------
predictions = []
def classify_dataset(input_csv: str, output_csv: str,
                     question_column: str = "instruction",
                     solution_column: str = "output"):

    for idx, row in df.iterrows():
        try:
            pred = classify_question(str(row[question_column]), str(row[solution_column]))
        except Exception as e:
            print(f"Error at row {idx}: {e}")
            pred = "i_dont_know"

        predictions.append(pred)
        if idx % 10 == 0:
            print(f"Processed {idx} rows...")

    df["new_category"] = predictions

    # df.to_csv(output_csv, index=False)
    print("Saved classified dataset to:", output_csv)



In [None]:
df=pd.read_csv("prompt_category.csv")[13001:22000]
df.drop(['category'],axis=1, inplace=True)

In [None]:
print(model.hf_device_map)


{'': 0}


In [None]:
# -------------------------------
# USAGE EXAMPLE
# -------------------------------
classify_dataset("prompt_category.csv", "categorized.csv",
                 question_column="instruction",
                 solution_column="output")


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Processed 13010 rows...
Processed 13020 rows...
Processed 13030 rows...
Processed 13040 rows...
Processed 13050 rows...
Processed 13060 rows...
Processed 13070 rows...
Processed 13080 rows...
Processed 13090 rows...
Processed 13100 rows...
Processed 13110 rows...
Processed 13120 rows...
Processed 13130 rows...
Processed 13140 rows...
Processed 13150 rows...
Processed 13160 rows...
Processed 13170 rows...
Processed 13180 rows...
Processed 13190 rows...
Processed 13200 rows...
Processed 13210 rows...
Processed 13220 rows...
Processed 13230 rows...
Processed 13240 rows...
Processed 13250 rows...
Processed 13260 rows...
Processed 13270 rows...
Processed 13280 rows...
Processed 13290 rows...
Processed 13300 rows...
Processed 13310 rows...
Processed 13320 rows...
Processed 13330 rows...
Processed 13340 rows...
Processed 13350 rows...
Processed 13360 rows...
Processed 13370 rows...
Processed 13380 rows...
Processed 13390 rows...
Processed 13400 rows...
Processed 13410 rows...
Processed 13420 

In [None]:
df['category']=predictions
df.to_csv("13001_22000.csv",index=False)

from google.colab import files
files.download("13001_22000.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
d=pd.read_csv("13001_22000.csv")

In [None]:
df.head()

Unnamed: 0,catergory
0,fixing
1,fixing
2,embedded_system
3,sorting_searching
4,class
...,...
8957,error_handling
8961,"data_analysis, machine_learning, math"
8966,shell_scripting
8980,class\n\ninstruction:\ngiven a list of integer...
