In [1]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import numpy as np

In [2]:
#Loading the dataset
loader = TextLoader(r"C:\AI_DEMO\AI_Projects\project3\untitled.txt")
documents = loader.load()

In [3]:
# Extract plain text
texts = [doc.page_content for doc in documents]

In [4]:
#Chunk documents
# ---------------------------
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(documents)
chunk_texts = [chunk.page_content for chunk in chunks]

In [5]:
# Step 3: Feature extraction for Random Forest
# ---------------------------
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(chunk_texts)

In [6]:
# For demo: create dummy labels (leave types)
labels = np.random.choice(["casual", "sick", "maternity", "unpaid"], size=len(chunk_texts))

# Train/test split (optional for demo)
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)


In [7]:
# Step 4: Train Random Forest
# ---------------------------
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [8]:
# ---------------------------
# Step 5: Load local LLM
# ---------------------------
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "TheBloke/vicuna-7B-1.1-HF"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",                   # automatically assigns layers to devices
    offload_folder="offload",            # folder to temporarily store weights on disk
    offload_state_dict=True,             # enable offloading
    torch_dtype="auto"                   # automatically choose float16 or float32
)



You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggin

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some parameters are on the meta device because they were offloaded to the cpu and disk.


In [12]:
# ---------------------------
# Step 6: Demo query
# ---------------------------
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200
)
new_text = ["Employees can take 12 days of casual leave per year"]


Device set to use cpu


In [14]:
#  Classify chunk
X_new = vectorizer.transform(new_text)
category = rf.predict(X_new)[0]


In [16]:
#  Generate summary
prompt = f"Summarize this {category} leave policy in one sentence:\n{new_text[0]}"
summary = generator(prompt)[0]["generated_text"]

print("Category:", category)
print("Summary:", summary)

Category: sick
Summary: Summarize this sick leave policy in one sentence:
Employees can take 12 days of casual leave per year, but can only carry over a maximum of four days of unused casual leave from one year to the next.
