In [None]:
!pip install -q sentence-transformers tqdm

In [None]:
import pandas as pd
import numpy as np
# Load the instruction/code dataset with the Python parsing engine;
# rows the parser cannot read are skipped rather than raising.
csv_path = '/content/alpaca_python_code.csv'
df = pd.read_csv(csv_path, on_bad_lines='skip', engine='python')

In [None]:
!pip install -q ollama nest_asyncio
!curl -fsSL https://ollama.com/install.sh | sh

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [None]:

import os
import time
import nest_asyncio
from ollama import AsyncClient
import pandas as pd
import asyncio
import subprocess


# Patch asyncio via nest_asyncio before any event-loop use in this cell.
nest_asyncio.apply()

MODEL_NAME = "qwen2.5-coder:1.5b"
BATCH_SIZE = 5  # number of classification requests awaited concurrently
SERVER_STARTUP_TIMEOUT_S = 60  # upper bound on waiting for `ollama serve`

# Stop any server left over from a previous run. A non-zero exit status just
# means nothing was running, so it is deliberately not checked.
subprocess.run(["pkill", "ollama"], check=False)
subprocess.Popen(["ollama", "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# Poll until the server answers instead of sleeping a fixed interval:
# `ollama list` exits non-zero while the API is still unreachable.
_deadline = time.monotonic() + SERVER_STARTUP_TIMEOUT_S
while subprocess.run(
    ["ollama", "list"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
).returncode != 0:
    if time.monotonic() > _deadline:
        raise RuntimeError(
            f"ollama server did not become ready within {SERVER_STARTUP_TIMEOUT_S}s"
        )
    time.sleep(1)

# check=True surfaces a failed download here rather than as one error per row
# during classification.
subprocess.run(["ollama", "pull", MODEL_NAME], check=True)
# Labels offered to the model in the prompt, in their canonical spelling.
ALLOWED_CATEGORIES = (
    "array", "graph", "tree", "sorting", "searching", "hashmap",
    "dynamic programming", "string", "Web Dev", "Machine Learning",
)
_CANONICAL_CATEGORY = {name.lower(): name for name in ALLOWED_CATEGORIES}


def _normalize_category(raw):
    """Reduce a raw model reply to a single clean category label.

    Takes the first non-empty line, drops a leading ``Category:`` prefix and
    surrounding brackets/quotes/punctuation, and maps the result
    case-insensitively onto ``ALLOWED_CATEGORIES``. Labels outside that list
    are returned cleaned but otherwise unchanged; an empty reply gives ``""``.
    """
    for line in str(raw).splitlines():
        label = line.strip()
        if label.lower().startswith("category:"):
            label = label[len("category:"):]
        label = label.strip(" \t[]\"'`*.")
        if label:
            return _CANONICAL_CATEGORY.get(label.lower(), label)
    return ""


async def classify_row(client, idx, instruction, code):
    """Ask the model for one broad category for a single dataset row.

    Args:
        client: object exposing an awaitable ``generate(model=, prompt=, options=)``
            that returns a mapping with a ``'response'`` entry.
        idx: row identifier, echoed back so results can be matched to rows.
        instruction: task description; only the first 400 characters are sent.
        code: solution code; only the first 300 characters are sent.

    Returns:
        ``(idx, label)``. ``label`` is the normalised category, or
        ``"Error | <message>"`` (message cut to 50 characters) if the request
        failed, so one bad row does not abort the whole run.
    """
    prompt = f"""
Classify into ONE Broad Category: [array,graph,tree,sorting,searching,hashmap,dynamic programming,string,Web Dev,Machine Learning]
Task: {str(instruction)[:400]}
Code: {str(code)[:300]}
Return ONLY:
Category
"""
    try:
        response = await client.generate(model=MODEL_NAME, prompt=prompt, options={"temperature": 0})
        # The model does not reliably follow "Return ONLY: Category"; normalise
        # its reply so spelling/case variants collapse into one label.
        text = _normalize_category(response['response'])
        # Live feedback
        print(f"\nRow {idx+1}: Predicted -> {text}")
        return idx, text
    except Exception as e:
        print(f"\nRow {idx+1}: ERROR -> {e}")
        return idx, f"Error | {str(e)[:50]}"

async def run_pipeline(df):
    """Classify every row of ``df``, ``BATCH_SIZE`` requests at a time.

    Args:
        df: DataFrame with ``instruction`` and ``output`` columns.

    Returns:
        List of the values returned by ``classify_row``, in row order.
    """
    client = AsyncClient()
    # Read the two needed columns directly rather than building a Series per
    # row with iterrows().
    rows = list(zip(df.index, df["instruction"], df["output"]))
    total = len(rows)

    results = []
    for start in range(0, total, BATCH_SIZE):
        # Create coroutines only for the batch about to run, so an interrupted
        # run does not leave thousands of never-awaited coroutine objects.
        batch = [
            classify_row(client, idx, instruction, code)
            for idx, instruction, code in rows[start:start + BATCH_SIZE]
        ]
        results.extend(await asyncio.gather(*batch))
        print(f"Processed {min(start + BATCH_SIZE, total)}/{total} rows so far")
    return results


print("Starting classification...\n")
final_results = await run_pipeline(df)
results_dict = dict(final_results)
df["classification_raw"] = df.index.map(results_dict)
df[["category", "topic"]] = df["classification_raw"].str.split("|", n=1, expand=True)
df["category"] = df["category"].str.strip()
df["topic"] = df["topic"].str.strip()
print("\nClassification Complete!")
print("\nCategory counts:")
print(df["category"].value_counts())
print("\n--- Sample Predictions ---")
for i in range(min(5, len(df))):
    print(f"\nInstruction: {df.loc[i, 'instruction']}")
    print(f"Code: {df.loc[i, 'output'][:200]}...")
    print(f"Predicted: {df.loc[i, 'classification_raw']}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed 16340/18612 rows so far

Row 16342: Predicted -> Graph

Row 16341: Predicted -> Tree

Row 16345: Predicted -> Machine Learning

Row 16343: Predicted -> Sorting

Row 16344: Predicted -> Sorting
Processed 16345/18612 rows so far

Row 16346: Predicted -> string

Row 16349: Predicted -> Machine Learning

Row 16350: Predicted -> Task

Row 16348: Predicted -> Data Structure

Row 16347: Predicted -> Sorting
Processed 16350/18612 rows so far

Row 16352: Predicted -> Task Management

Row 16353: Predicted -> Array

Row 16355: Predicted -> Sorting

Row 16351: Predicted -> Sorting

Row 16354: Predicted -> Graph
Processed 16355/18612 rows so far

Row 16360: Predicted -> Array

Row 16358: Predicted -> Tree

Row 16359: Predicted -> string

Row 16356: Predicted -> Sorting

Row 16357: Predicted -> Book
Processed 16360/18612 rows so far

Row 16364: Predicted -> searching

Row 16362: Predicted -> Category: Clustering

Row 16363: P

ValueError: Columns must be same length as key

In [None]:
# -----------------------------
# SAVE RESULTS
# -----------------------------
results_dict = dict(final_results)

# Each row gets its category directly
df["category"] = df.index.map(results_dict)

# -----------------------------
# SUMMARY
# -----------------------------
print("\nClassification Complete!")
print("\nCategory counts:")
print(df["category"].value_counts())
# to_csv writes plain CSV text, so the file gets a .csv extension; under an
# .xlsx name it could not be opened as a workbook.
df.to_csv("alpca_with_category_dataset.csv", index=False)

print("\n--- Sample Predictions ---")
# head() selects by position, so this does not assume a 0..n-1 index;
# str() guards against a missing (NaN) output value.
for _, row in df.head(5).iterrows():
    print("\nInstruction:", row["instruction"])
    print("Code:", str(row["output"])[:200], "...")
    print("Predicted Category:", row["category"])



Classification Complete!

Category counts:
category
Sorting                 5941
string                  1339
Web Dev                 1243
Machine Learning        1090
Tree                     942
                        ... 
Sports                     1
Text Preprocessing         1
Category: [sorting]        1
Performance Analysis       1
Speech                     1
Name: count, Length: 711, dtype: int64

--- Sample Predictions ---

Instruction: Create a function to calculate the sum of a sequence of integers.
Code: # Python code
def sum_sequence(sequence):
  sum = 0
  for num in sequence:
    sum += num
  return sum ...
Predicted Category: Sorting

Instruction: Generate a Python code for crawling a website for a specific type of data.
Code: import requests
import re

def crawl_website_for_phone_numbers(website):
    response = requests.get(website)
    phone_numbers = re.findall('\d{3}-\d{3}-\d{4}', response.text)
    return phone_number ...
Predicted Category: Web Dev

Instruction

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,instruction,input,output,prompt,classification_raw,category
0,0,Create a function to calculate the sum of a se...,"[1, 2, 3, 4, 5]",# Python code\ndef sum_sequence(sequence):\n ...,Below is an instruction that describes a task....,Sorting,Sorting
1,1,Generate a Python code for crawling a website ...,website: www.example.com \ndata to crawl: phon...,import requests\nimport re\n\ndef crawl_websit...,Below is an instruction that describes a task....,Web Dev,Web Dev
2,2,Create a Python list comprehension to get the ...,,"[x*x for x in [1, 2, 3, 5, 8, 13]]",Below is an instruction that describes a task....,Sorting,Sorting
3,3,Generate a python script to perform this action.,"Given a string, remove all the consecutive dup...",def remove_duplicates(string): \n result = ...,Below is an instruction that describes a task....,Sorting,Sorting
4,4,Write a python script to generates random numb...,,def generate_random_divisible_number():\n i...,Below is an instruction that describes a task....,Sorting,Sorting


In [None]:
import re

# ASCII control characters 0x00-0x08, 0x0B, 0x0C and 0x0E-0x1F.
# Tab (0x09), line feed (0x0A) and carriage return (0x0D) are not matched.
illegal = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F]")


def clean_text(text):
    """Return ``text`` with characters matched by ``illegal`` removed.

    Values that are not ``str`` (numbers, ``None``, NaN, ...) are returned
    unchanged.
    """
    return illegal.sub("", text) if isinstance(text, str) else text

# Run clean_text over every cell, one column at a time, then write the
# cleaned copy to an Excel workbook (presumably the stripped characters are
# ones the Excel writer rejects — confirm).
def _clean_column(column):
    return column.map(clean_text)


df_clean = df.apply(_clean_column)
df_clean.to_excel("alpca_with_category_dataset.xlsx", index=False)
