In [None]:
import os

import numpy as np
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [None]:
# Load the source text file.
# The location defaults to the original demo path but can be overridden with
# the LEAVE_POLICY_PATH environment variable, so the notebook also runs on
# machines that do not have this exact Windows directory layout.
POLICY_PATH = os.environ.get(
    "LEAVE_POLICY_PATH",
    r"C:\AI_DEMO\AI_Projects\project3\untitled.txt",
)
loader = TextLoader(POLICY_PATH)
documents = loader.load()

In [None]:
# Collect the raw text (page_content) of every loaded document.
texts = list(map(lambda document: document.page_content, documents))

In [None]:
# Chunk documents
# ---------------------------
# Split the loaded documents into overlapping pieces (chunk_size=500,
# chunk_overlap=100), then keep only the text of each piece.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=500)
chunks = splitter.split_documents(documents)
chunk_texts = list(map(lambda piece: piece.page_content, chunks))

In [None]:
# Step 3: Feature extraction for Random Forest
# ---------------------------
# Learn the TF-IDF vocabulary/weights from the chunks, then encode the same
# chunks as the feature matrix X.
vectorizer = TfidfVectorizer().fit(chunk_texts)
X = vectorizer.transform(chunk_texts)

In [None]:
# For demo: create dummy labels (leave types).
# The labels are drawn at random, one per chunk, so a classifier trained on
# them learns nothing meaningful; replace them with real annotations for any
# actual use.
# A seeded generator keeps the labels identical across Restart & Run All,
# in line with the fixed random_state used for the split below.
LEAVE_TYPES = ["casual", "sick", "maternity", "unpaid"]
label_rng = np.random.default_rng(42)
labels = label_rng.choice(LEAVE_TYPES, size=len(chunk_texts))

# Train/test split (optional for demo): hold out 20% of the chunks.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42
)


In [None]:
# Step 4: Train Random Forest
# ---------------------------
# Build a 100-tree forest with a fixed seed and fit it on the training split
# in one expression (fit() returns the estimator itself).
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
rf  # last expression: display the fitted estimator as the cell output

In [None]:
# ---------------------------
# Step 5: Load local LLM
# ---------------------------
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint identifier shared by the tokenizer and the model.
model_name = "TheBloke/vicuna-7B-1.1-HF"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal LM with automatic placement and disk offload options.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",        # let the library pick the dtype rather than forcing one
    device_map="auto",         # let the library assign layers to the available devices
    offload_state_dict=True,   # NOTE(review): presumably spills the state dict to disk while loading - confirm
    offload_folder="offload",  # directory used for weights that are offloaded to disk
)



In [None]:
# ---------------------------
# Step 6: Demo query
# ---------------------------
# Wrap the loaded model and tokenizer in a text-generation pipeline.
# max_new_tokens bounds only the generated continuation; max_length would
# also count the prompt tokens, so a longer prompt would shrink (or use up)
# the room left for the answer.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
)
# Single example sentence that the following cells classify and summarize.
new_text = ["Employees can take 12 days of casual leave per year"]


In [None]:
# Classify chunk
# Encode the demo text with the already-fitted TF-IDF vectorizer, run it
# through the forest, and keep the prediction for the first entry.
X_new = vectorizer.transform(new_text)
predicted_labels = rf.predict(X_new)
category = predicted_labels[0]


In [None]:
# Generate summary
# Ask the LLM for a one-sentence summary of the demo text, mentioning the
# predicted category in the instruction.
prompt = f"Summarize this {category} leave policy in one sentence:\n{new_text[0]}"
# return_full_text=False makes the pipeline return only the newly generated
# text; by default "generated_text" begins with the prompt itself, so the
# printed summary would repeat the instruction and the input sentence.
summary = generator(prompt, return_full_text=False)[0]["generated_text"].strip()

print("Category:", category)
print("Summary:", summary)