In [None]:
import torch
import pandas as pd
import numpy as np
import fitz
from tqdm.auto import tqdm
import random
from spacy.lang.en import English
import re
from sentence_transformers import util, SentenceTransformer
from time import perf_counter as timer
from textwrap import wrap
import textwrap
from transformers import AutoTokenizer, AutoModelForCausalLM


In [None]:
# Location of the source PDF that is passed to open_and_read_pdf below.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
pdf_path = r"C:\Users\ujjwal\OneDrive\Desktop\Projects\CC_Task2\Engg_colleges.pdf"
def text_formatter(text: str) -> str:
    """Return `text` with every newline turned into a space and outer whitespace removed."""
    newline_to_space = str.maketrans("\n", " ")
    return text.translate(newline_to_space).strip()
def open_and_read_pdf(pdf_path: str) -> list:
    """Read a PDF and return one statistics/text record per page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list of dicts, one per page in document order, with the keys
        ``page_number`` (0-based), ``page_char_count``, ``page_word_count``
        (split on single spaces), ``page_sentence_count_raw`` (split on ". "),
        ``page_token_count`` (characters / 4, a rough estimate) and ``text``
        (the page text after ``text_formatter``).
    """
    pages_and_texts = []
    # Context manager guarantees the document handle is released even if a page fails.
    with fitz.open(pdf_path) as doc:
        # enumerate() hides the length from tqdm, so pass the page count explicitly.
        for page_number, page in tqdm(enumerate(doc), total=len(doc), desc="Reading PDF"):
            text = text_formatter(page.get_text())
            pages_and_texts.append({
                "page_number": page_number,
                "page_char_count": len(text),
                "page_word_count": len(text.split(" ")),
                "page_sentence_count_raw": len(text.split(". ")),
                "page_token_count": len(text) / 4,
                "text": text,
            })

    return pages_and_texts
# Read every page of the PDF into a list of per-page dicts, then preview the first two.
pages_and_texts = open_and_read_pdf(pdf_path)
pages_and_texts[:2]


In [None]:
# Tabular view of the per-page records; show the first rows.
df = pd.DataFrame(pages_and_texts)
df.head()

In [None]:
# Build a spaCy English pipeline with only the "sentencizer" component added;
# `nlp` is reused by the sentence-extraction cells below.
nlp = English()
nlp.add_pipe("sentencizer")
# Small demo: run the pipeline on a three-sentence string.
doc = nlp("This is s. This is d. This is sh.")

# Display the sentence spans found in the demo string.
list(doc.sents)

In [None]:
# Split each page's text into sentences with `nlp` and store them on the page dict
# (mutates the dicts in `pages_and_texts` in place).
for item in tqdm(pages_and_texts, desc = "Extracting Sentences"):
    doc = nlp(item["text"])
    # Convert spaCy spans to plain strings so the records hold only built-in types.
    item["sentences"] = [str(sentence) for sentence in doc.sents]
    item["page_sentence_count_spacy"] = len(item["sentences"])

In [None]:
def chunk_sentences(pages_and_texts, chunk_size: int = 10):
    """Split every page into sentences and group them into fixed-size chunks.

    Each page dict is updated in place with:
      * ``sentences``: list of sentence strings produced by the global ``nlp`` pipeline;
      * ``sentence_chunks``: consecutive groups of at most ``chunk_size`` sentences.

    Args:
        pages_and_texts: Iterable of page dicts that contain a ``"text"`` key.
        chunk_size: Maximum number of sentences per chunk (default 10, the value
            that was previously hard-coded).

    Returns:
        The same ``pages_and_texts`` object, for convenient chaining.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would silently yield no chunks; fail loudly instead.
        raise ValueError("chunk_size must be a positive integer")

    for item in tqdm(pages_and_texts, desc="Extracting Sentences"):
        sentences = [str(sentence) for sentence in nlp(item["text"]).sents]
        item["sentences"] = sentences
        item["sentence_chunks"] = [
            sentences[start:start + chunk_size]
            for start in range(0, len(sentences), chunk_size)
        ]
    return pages_and_texts

In [None]:
# Spot-check three randomly chosen page records (raises ValueError if there are fewer than three pages).
random.sample(pages_and_texts, k = 3)

In [None]:
# Default number of sentences grouped into one chunk.
num_sentence_chunk_size = 10


def split_list(input_list, slice_size = num_sentence_chunk_size):
    """Split ``input_list`` into consecutive slices of at most ``slice_size`` items.

    Args:
        input_list: Sequence to split (anything that supports slicing and ``len``).
        slice_size: Maximum length of each slice; the last slice may be shorter.

    Returns:
        A list of slices covering ``input_list`` in order; empty for empty input.
    """
    # The slice must end at start + slice_size; ending at 1 + slice_size (the
    # previous behaviour) made every chunk after the first truncated or empty.
    return [input_list[start:start + slice_size] for start in range(0, len(input_list), slice_size)]

# Quick manual check of split_list on 25 integers using the default slice size.
test_list = list(range(25))
split_list(test_list)

In [None]:
# Group each page's sentences into chunks with split_list and record the chunk count
# (mutates the dicts in `pages_and_texts` in place; requires the "sentences" key set earlier).
for item in tqdm(pages_and_texts, desc = "Creating chunks"):
    item["sentence_chunks"] = split_list(input_list = item["sentences"])
    item["num_chunks"] = len(item["sentence_chunks"])

In [None]:
# Inspect one random page record, now including its sentences and chunks.
random.sample(pages_and_texts, k=1)

In [None]:
# Rebuild the per-page DataFrame (rebinding `df`) so it includes the columns added
# since the first build, then show summary statistics rounded to two decimals.
df = pd.DataFrame(pages_and_texts)
df.describe().round(2)

In [None]:
# Flatten the per-page sentence chunks into one record per chunk, with basic size statistics.
pages_and_chunks = []
for item in tqdm(pages_and_texts, desc = "Building final structure"):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        # Normalise whitespace: turn non-breaking spaces into plain spaces and collapse
        # runs of spaces. (The previous replace(" ", " ") call changed nothing.)
        joined_sentence_chunk = " ".join(sentence_chunk).replace("\xa0", " ")
        joined_sentence_chunk = re.sub(r" {2,}", " ", joined_sentence_chunk).strip()
        # NOTE: the former re.sub(r"\.([A-Z])", r".\1", ...) replaced each match with
        # itself, so it was dropped. Sentences are already joined with a space, and
        # inserting one after every ".X" would break abbreviations such as "B.Tech".
        chunk_dict["chunk"] = joined_sentence_chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ") if word])

        # Rough estimate: about four characters per token.
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk)/4

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)


In [None]:
# Inspect one random chunk record.
random.sample(pages_and_chunks, k=1)

In [None]:
# Preview `df`; at this point it is still the per-page frame built from pages_and_texts.
df.head()

In [None]:
# Chunks at or below this estimated token count are treated as too short to be useful.
min_token_length = 30
chunks_df = pd.DataFrame(pages_and_chunks)
short_chunks_df = chunks_df[chunks_df["chunk_token_count"] <= min_token_length]
# Sample at most five rows: DataFrame.sample raises if asked for more rows than exist.
for row in short_chunks_df.sample(min(5, len(short_chunks_df))).iterrows():
    # iterrows() yields (index, Series); single quotes inside the f-string keep this
    # valid on Python versions before 3.12.
    print(f"Chunk token count: {row[1]['chunk_token_count']} | Text: {row[1]['chunk']}")
    

In [None]:
# Keep only chunks whose estimated token count exceeds min_token_length, as a list of
# record dicts (new dict objects, separate from those in `pages_and_chunks`); preview two.
pages_and_chunks_min_token_len = chunks_df[chunks_df["chunk_token_count"]>min_token_length].to_dict(orient = "records")
pages_and_chunks_min_token_len[:2]

In [None]:
def flatten_chunks(pages_and_texts):
    """Turn per-page sentence chunks into a flat list of chunk records.

    Every group in a page's ``"sentence_chunks"`` is joined with single spaces and
    stripped. Groups whose estimated token count (characters / 4) is greater than 10
    become a dict with ``page_number``, ``chunk`` and a placeholder ``embedding`` of
    ``None``; shorter ones are dropped.
    """
    flat_records = []
    for page in tqdm(pages_and_texts, desc="Building chunks"):
        candidate_texts = (" ".join(group).strip() for group in page["sentence_chunks"])
        flat_records.extend(
            {"page_number": page["page_number"], "chunk": text, "embedding": None}
            for text in candidate_texts
            if len(text) / 4 > 10
        )
    return flat_records

In [None]:
def embed_chunks(pages_and_chunks, model_name: str = "all-mpnet-base-v2", device: str = "cpu", batch_size: int = 32):
    """Load a sentence-embedding model and attach an embedding to every chunk.

    Args:
        pages_and_chunks: List of dicts that each contain a ``"chunk"`` string.
            Each dict gets an ``"embedding"`` entry set in place.
        model_name: Name of the SentenceTransformer model to load.
        device: Device the model runs on (e.g. ``"cpu"`` or ``"cuda"``).
        batch_size: Number of chunks encoded per forward pass.

    Returns:
        ``(model, pages_and_chunks)`` — the loaded model and the same list object.
    """
    model = SentenceTransformer(model_name, device=device)
    texts = [item["chunk"] for item in pages_and_chunks]
    if texts:
        # One batched encode call instead of one call per chunk; the library shows
        # its own progress bar, and result rows are in the same order as `texts`.
        embeddings = model.encode(texts, batch_size=batch_size, show_progress_bar=True)
        for item, embedding in zip(pages_and_chunks, embeddings):
            item["embedding"] = embedding
    return model, pages_and_chunks


In [None]:
def collect_preferences():
    """Interactively ask the user about their college preferences.

    Prints a short greeting, then asks eight questions via ``input()`` in a fixed order.

    Returns:
        A dict of the raw answers keyed by ``general``, ``branch``, ``location``,
        ``college_type``, ``campus_life``, ``academics_vs_fun``, ``budget`` and
        ``extra_notes`` (in that insertion order).
    """
    print(" Hey there! I’m your college guide bot. I’ll help you shortlist the best-fit engineering colleges for you.\n")
    print("Let’s chat a bit so I can understand what you're looking for. Ready? Let's go!\n")

    # (answer key, prompt shown to the user), asked top to bottom.
    questions = (
        ("general", " What are some things you’re looking for in a college? (e.g., branch, location, campus life, research, etc.)\n> "),
        ("branch", "\n Do you have a preferred engineering branch or field? (If not sure, just say that!)\n> "),
        ("location", "\n Any preferred states or cities for college?\n> "),
        ("college_type", "\n Do you prefer a government college (like NITs/IITs) or private universities — or are you open to both?\n> "),
        ("campus_life", "\n How important is campus life (clubs, fests, student activities) for you?\n> "),
        ("academics_vs_fun", "\n Would you like a college that’s more academically focused, fun/social, or a mix of both?\n> "),
        ("budget", "\n Are there any budget or fee constraints I should know about?\n> "),
        ("extra_notes", "\n Anything else you'd like me to keep in mind? (e.g., placements, internships, foreign exchange, etc.)\n> "),
    )

    preferences = {}
    for key, prompt in questions:
        preferences[key] = input(prompt)

    print("\nThanks! Let me think for a moment... \n")

    # The previous version also built a multi-line summary string here but never
    # used or returned it; that dead code has been removed.
    return preferences


In [None]:
def chunk_pdf(pages_and_texts, chunk_size=500):
    """Cut each page's text into width-limited pieces.

    The text of every page is wrapped with ``textwrap.wrap`` to lines of at most
    ``chunk_size`` characters; each resulting line becomes a dict with the piece
    under ``"chunk"`` and the page's ``"page_number"``.
    """
    return [
        {"chunk": piece, "page_number": entry["page_number"]}
        for entry in pages_and_texts
        for piece in wrap(entry["text"], chunk_size)
    ]

In [None]:
# Embed every chunk; embed_chunks stores each vector on the chunk dict in place, so
# `embedded_chunks` is the same list object as `pages_and_chunks`.
model, embedded_chunks = embed_chunks(pages_and_chunks)

print(model)
# Report a count instead of printing the whole list: dumping every chunk together
# with its embedding vector floods the notebook output.
print(f"Embedded {len(embedded_chunks)} chunks")


In [None]:
# Build the frame from the embedded chunk records so it actually has an "embedding"
# column. `pages_and_chunks_min_token_len` holds copies made before embedding and
# therefore contains no vectors. The same minimum-token filter is applied here.
text_chunks_and_embeddings_df = (
    pd.DataFrame(embedded_chunks)
    .loc[lambda frame: frame["chunk_token_count"] > min_token_length]
    .reset_index(drop=True)
)
text_chunks_and_embeddings_df

In [None]:
# Write the chunk DataFrame to CSV without the index column.
# NOTE(review): absolute, machine-specific output path — adjust before running elsewhere.
embeddings_df_save_path = r"C:\Users\ujjwal\OneDrive\Desktop\Projects\CC_Task2\text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index = False)

In [None]:
# Inspect the third chunk record (index 2).
pages_and_chunks[2]

In [None]:
def print_wrapped(text, wrap_length = 50):
    """Print `text` re-flowed so that no line is longer than `wrap_length` characters."""
    print("\n".join(textwrap.wrap(text, wrap_length)))

In [None]:
query = "Programs offered at IIT Delhi"
print(f"Query: {query}")

# Score the query against every embedded chunk. `top_results` was previously used
# without ever being computed, so this cell raised NameError on a fresh kernel.
query_embedding = model.encode(query, convert_to_tensor=True)
chunk_embeddings = torch.stack(
    [torch.as_tensor(item["embedding"]) for item in pages_and_chunks]
).to(query_embedding.device)
dot_scores = util.dot_score(query_embedding, chunk_embeddings)[0]
# topk raises if k exceeds the number of scores, so cap it at the corpus size.
top_results = torch.topk(dot_scores, k=min(5, len(pages_and_chunks)))

for score, idx in zip(top_results[0], top_results[1]):
    print(f"Score: {score:.4f}")
    print("Text:")
    print(pages_and_chunks[idx]["chunk"])
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

In [None]:
def dot_product(vector1, vector2):
    """Return the dot product of two 1-D tensors, as computed by ``torch.dot``."""
    product = torch.dot(vector1, vector2)
    return product
def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two 1-D tensors.

    Computed as the dot product divided by the product of the Euclidean norms.
    There is no guard against zero-length vectors; the division is performed as is.
    """
    numerator = torch.dot(vector1, vector2)
    magnitude1 = vector1.pow(2).sum().sqrt()
    magnitude2 = vector2.pow(2).sum().sqrt()
    return numerator / (magnitude1 * magnitude2)



In [None]:
def retrieve_sources(query, pages_and_chunks, model, number_to_return=5, print_time=True):
    """Find the chunks whose embeddings score highest against ``query``.

    Args:
        query: Query string to embed with ``model``.
        pages_and_chunks: List of dicts that each hold a precomputed ``"embedding"``
            (array-like or tensor, all of the same length).
        model: Object with an ``encode(text, convert_to_tensor=True)`` method.
        number_to_return: Maximum number of results; capped at the number of chunks.
        print_time: If true, print how long the similarity scoring took.

    Returns:
        ``(scores, indices)`` — two 1-D tensors sorted by descending dot-product
        score; ``indices`` index into ``pages_and_chunks``. Both are empty when
        ``pages_and_chunks`` is empty.
    """
    if not pages_and_chunks:
        # Nothing to search: return empty results instead of failing inside torch.
        return torch.empty(0), torch.empty(0, dtype=torch.long)

    # Embed the query (which should be a string)
    query_embedding = model.encode(query, convert_to_tensor=True)
    start_time = timer()

    # Stack the precomputed chunk embeddings into one (num_chunks, dim) tensor that
    # matches the query's device and dtype, rather than passing a list of arrays.
    embeddings = torch.stack([torch.as_tensor(item["embedding"]) for item in pages_and_chunks])
    embeddings = embeddings.to(device=query_embedding.device, dtype=query_embedding.dtype)

    # Compute similarity scores
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"Retrieved in {end_time - start_time:.2f} seconds")

    # torch.topk raises when k exceeds the number of scores, so cap it.
    k = min(number_to_return, dot_scores.shape[0])
    top_results = torch.topk(dot_scores, k=k)
    return top_results.values, top_results.indices


def print_top_results_and_scores(query: str, pages_and_chunks, model: SentenceTransformer, number_to_return: int = 5):
    """Retrieve the best-matching chunks for ``query`` and print them.

    For each hit returned by ``retrieve_sources`` this prints the score (four
    decimals), the chunk text, its page number and a dashed separator line.
    """
    top_scores, top_indices = retrieve_sources(query, pages_and_chunks, model, number_to_return=number_to_return)
    separator = "\n" + "-" * 60 + "\n"
    for position, score in zip(top_indices, top_scores):
        hit = pages_and_chunks[position.item()]
        print(f"Score: {score:.4f}")
        print("Text:")
        print(hit["chunk"])
        print(f"Page number: {hit['page_number']}")
        print(separator)


In [None]:
query = "2+2 dual degree programs"

# Fixed the function name (it is defined as print_top_results_and_scores) and dropped
# the `embeddings` argument: the function has no such parameter and no `embeddings`
# variable exists; embeddings are read from the chunk dicts.
print_top_results_and_scores(
    query=query,
    pages_and_chunks=pages_and_chunks,
    model=model)

In [None]:
import os

import google.generativeai as genai

# Read the API key from the environment; never hard-code credentials in a notebook.
# SECURITY: the key that used to be embedded here has been exposed in source and
# should be revoked and replaced.
GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")

genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel("gemini-2.5-pro-exp-03-25")

In [None]:
def collect_user_preferences():
    """Interactively ask the user about college preferences and entrance exams.

    Prints a greeting, asks eight preference questions, then asks whether an
    entrance exam has been taken. "yes" or "y" (any case, surrounding whitespace
    ignored) counts as affirmative; anything else is treated as "no".

    Returns:
        A dict of raw answers keyed by ``general``, ``branch``, ``location``,
        ``college_type``, ``campus_life``, ``academics_vs_fun``, ``budget`` and
        ``extra_notes``, plus ``gave_exam`` (bool). When ``gave_exam`` is true the
        dict also contains ``exam`` and ``score``.
    """
    print("Hey there! I’m your college guide bot. I’ll help you shortlist the best-fit engineering colleges for you.\n")
    print("Let’s chat a bit so I can understand what you're looking for. Ready? Let's go!\n")

    # (answer key, prompt shown to the user), asked top to bottom.
    questions = (
        ("general", "What are some things you’re looking for in a college? (e.g., branch, location, campus life, research, etc.)\n> "),
        ("branch", "\nDo you have a preferred engineering branch or field? (If not sure, just say that!)\n> "),
        ("location", "\nAny preferred states or cities for college?\n> "),
        ("college_type", "\nDo you prefer a government college (like NITs/IITs) or private universities — or are you open to both?\n> "),
        ("campus_life", "\nHow important is campus life (clubs, fests, student activities) for you?\n> "),
        ("academics_vs_fun", "\nWould you like a college that’s more academically focused, fun/social, or a mix of both?\n> "),
        ("budget", "\nAre there any budget or fee constraints I should know about?\n> "),
        ("extra_notes", "\nAnything else you'd like me to keep in mind? (e.g., placements, internships, foreign exchange, etc.)\n> "),
    )

    preferences = {}
    for key, prompt in questions:
        preferences[key] = input(prompt)

    gave_exam = input("\nHave you given any entrance exam yet? (yes/no)\n> ").strip().lower()
    # Accept the short form too, so that answering "y" is not recorded as "no".
    preferences["gave_exam"] = gave_exam in ("yes", "y")
    if preferences["gave_exam"]:
        preferences["exam"] = input("\nWhich entrance exam did you give? (e.g., JEE Main, BITSAT, etc.)\n> ")
        preferences["score"] = input("\nWhat was your score or percentile?\n> ")

    print("\nThanks! Let me think for a moment... \n")
    return preferences


In [None]:
def generate_final_response_with_gemini(preferences, top_chunks):
    """Ask the Gemini model for college recommendations and print its answer.

    Args:
        preferences: Dict with the keys ``branch``, ``location``, ``college_type``,
            ``campus_life``, ``academics_vs_fun``, ``budget``, ``general`` and
            ``extra_notes``. If ``gave_exam`` is truthy, the optional ``exam`` and
            ``score`` entries are added to the prompt as well.
        top_chunks: Retrieved chunk dicts; their ``"chunk"`` texts are joined with
            blank lines and used as the brochure context.

    The prompt is sent through the module-level ``gemini_model`` and the response
    text is printed. Nothing is returned.
    """
    context = "\n\n".join(chunk["chunk"] for chunk in top_chunks)

    # The exam details collected from the user were previously never sent to the
    # model. Include them when present; without them the prompt is unchanged.
    exam_line = ""
    if preferences.get("gave_exam"):
        exam = preferences.get("exam") or "not specified"
        score = preferences.get("score") or "not specified"
        exam_line = f"- Entrance Exam: {exam} (score/percentile: {score})\n"

    prompt = (
        "You are a helpful college counselor.\n\n"
        "Student Preferences:\n"
        f"- Preferred Branch: {preferences['branch']}\n"
        f"- Location Preference: {preferences['location']}\n"
        f"- College Type: {preferences['college_type']}\n"
        f"- Campus Life Importance: {preferences['campus_life']}\n"
        f"- Academic vs Fun Preference: {preferences['academics_vs_fun']}\n"
        f"- Budget: {preferences['budget']}\n"
        f"- General Interests: {preferences['general']}\n"
        f"- Other Notes: {preferences['extra_notes']}\n"
        f"{exam_line}\n"
        "Based on the official brochure below, strictly recommend up to 3 engineering colleges in India that are the best fit.\n\n"
        f"--- Brochure Snippets ---\n{context}"
    )

    response = gemini_model.generate_content(prompt)
    print("\n🎓 Final Recommendations:\n")
    print(response.text)


In [None]:
def main(pdf_path: str = r"C:\Users\ujjwal\OneDrive\Desktop\Projects\CC_Task2\Engg_colleges.pdf", number_to_return: int = 5):
    """Run the whole pipeline: read PDF, chunk, embed, ask the user, retrieve, recommend.

    Args:
        pdf_path: PDF brochure to index. The default is the path that was previously
            hard-coded in the body.
        number_to_return: How many top-scoring chunks are passed to the model as context.

    Raises:
        ValueError: If no usable text chunks could be extracted from the PDF.
    """
    pages = open_and_read_pdf(pdf_path)
    pages = chunk_sentences(pages)
    pages_and_chunks = flatten_chunks(pages)
    if not pages_and_chunks:
        # Fail with a clear message before loading the model or prompting the user.
        raise ValueError(f"No usable text chunks were extracted from {pdf_path!r}")
    embedding_model, pages_and_chunks = embed_chunks(pages_and_chunks)

    preferences = collect_user_preferences()
    # Fold the main preferences into a single natural-language retrieval query.
    query = (
        f"{preferences['branch']} engineering programs "
        f"in {preferences['location']} "
        f"at {preferences['college_type']} colleges "
        f"with focus on {preferences['general']} "
        f"and a budget of {preferences['budget']}"
    )

    scores, indices = retrieve_sources(query, pages_and_chunks, embedding_model, number_to_return=number_to_return)
    top_chunks = [pages_and_chunks[idx.item()] for idx in indices]

    generate_final_response_with_gemini(preferences, top_chunks)

In [None]:
# Run the full pipeline when this cell/module is executed as the main program.
if __name__ == "__main__":
    main()
