In [1]:
import torch
import pandas as pd
import numpy as np
import fitz
from tqdm.auto import tqdm
import random
from spacy.lang.en import English
import re
from sentence_transformers import util, SentenceTransformer
from time import perf_counter as timer
from textwrap import wrap
import textwrap
from transformers import AutoTokenizer, AutoModelForCausalLM


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pdf_path = r"C:\Users\ujjwal\OneDrive\Desktop\Projects\CC_Task2\Engg_colleges.pdf"
def text_formatter(text: str) -> str:
    cleaned_txt = text.replace("\n", " ").strip()
    return cleaned_txt
def open_and_read_pdf(pdf_path: str) -> str:
    doc = fitz.open(pdf_path)
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc), desc = "Reading PDF"):
        text = page.get_text()
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number, "page_char_count": len(text), "page_word_count": len(text.split(" ")), "page_sentence_count_raw": len(text.split(". ")), "page_token_count": len(text)/4, "text": text})

    return pages_and_texts
pages_and_texts = open_and_read_pdf(pdf_path)
pages_and_texts[:2]


Reading PDF: 470it [00:07, 63.13it/s] 


[{'page_number': 0,
  'page_char_count': 266,
  'page_word_count': 41,
  'page_sentence_count_raw': 1,
  'page_token_count': 66.5,
  'text': 'West Bengal Joint Entrance Examinations Board  West Bengal    Engineering /Technology  Date of Examination  27.04.2025 (Sunday)  Architecture  Pharmacy  Common Entrance Test for admission to UG Courses in Engineering/Technology, Pharmacy and Architecture  WBJEE-2025'},
 {'page_number': 1,
  'page_char_count': 447,
  'page_word_count': 70,
  'page_sentence_count_raw': 3,
  'page_token_count': 111.75,
  'text': 'INFORMATION BULLETIN-WBJEE-2025   2   WBJEEB  INFORMATION BULLETIN  WBJEE-2025  Common Entrance Test for admission to UG Courses in  Engineering/Technology, Pharmacy and Architecture  Candidates must go through the Information Bulletin  carefully before registration for the entrance examination.  West Bengal Joint Entrance Examinations Board  RUPANNA  DB-118, Sector-I, Salt Lake City  Kolkata 700064  Toll free No.- 1800-123-4782 (Extn. No.-

In [3]:
df = pd.DataFrame(pages_and_texts)
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,0,266,41,1,66.5,West Bengal Joint Entrance Examinations Board ...
1,1,447,70,3,111.75,INFORMATION BULLETIN-WBJEE-2025 2 WBJEEB ...
2,2,2674,440,29,668.5,INFORMATION BULLETIN-WBJEE-2025 3 WBJEEB ...
3,3,2264,394,17,566.0,INFORMATION BULLETIN-WBJEE-2025 4 WBJEEB ...
4,4,2006,411,3,501.5,INFORMATION BULLETIN-WBJEE-2025 5 WBJEEB ...


In [4]:
nlp = English()
nlp.add_pipe("sentencizer")
doc = nlp("This is s. This is d. This is sh.")

list(doc.sents)

[This is s. This is d. This is sh.]

In [5]:
for item in tqdm(pages_and_texts, desc = "Extracting Sentences"):
    doc = nlp(item["text"])
    item["sentences"] = [str(sentence) for sentence in doc.sents]
    item["page_sentence_count_spacy"] = len(item["sentences"])

Extracting Sentences: 100%|██████████| 470/470 [00:02<00:00, 162.18it/s]


In [6]:
def chunk_sentences(pages_and_texts):
    for item in tqdm(pages_and_texts, desc="Extracting Sentences"):
        doc = nlp(item["text"])
        item["sentences"] = [str(sentence) for sentence in doc.sents]
        item["sentence_chunks"] = [item["sentences"][i:i+10] for i in range(0, len(item["sentences"]), 10)]
    return pages_and_texts

In [7]:
random.sample(pages_and_texts, k = 3)

[{'page_number': 29,
  'page_char_count': 1057,
  'page_word_count': 248,
  'page_sentence_count_raw': 8,
  'page_token_count': 264.25,
  'text': 'INFORMATION BULLETIN-WBJEE-2025   30   WBJEEB  APPENDIX – 4  Proforma for Income Certificate  Certified that Total Annual Income From all sources of       ,  guardian of          residing at            Post Office       Police Station      in the district  of      in the state of West Bengal for the financial year 2024-2025  is less than Rs. 2.50 lakhs (Rupees two lakhs and fifty thousand only) and stands at Rs.     (Rupees  )  Paste a 4 cmx3 cm size recent  colour photograph of the  candidate in this box. The  photo must be attested by the  certifying authority.  Candidate’s signature  (Candidate’s Photograph)  Candidate must sign here in front of the certifying  authority.  Signature of Certifying Authority:     Full Name of Certifying Authority (Block Letter)     Designation with Official Seal  Office Address:          Office Phone No.   

In [8]:
num_sentence_chunk_size = 10
def split_list(input_list, slice_size = num_sentence_chunk_size):
    return [input_list[i:1+slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [10], []]

In [9]:
for item in tqdm(pages_and_texts, desc = "Creating chunks"):
    item["sentence_chunks"] = split_list(input_list = item["sentences"])
    item["num_chunks"] = len(item["sentence_chunks"])

Creating chunks: 100%|██████████| 470/470 [00:00<00:00, 103803.01it/s]


In [10]:
random.sample(pages_and_texts, k=1)

[{'page_number': 255,
  'page_char_count': 1383,
  'page_word_count': 346,
  'page_sentence_count_raw': 10,
  'page_token_count': 345.75,
  'text': 'राष्ट्रीय परीक्षा एजेंसी                              Information Bulletin - 2025                                 National Testing Agency                                                               Excellence in Assessment         35    5.10. Weeding Out Rules   The record of Joint Entrance Examination JEE (Main) - 2025 will be preserved up to 90 days from  the date of declaration of result.  5.11 Legal Jurisdiction  All disputes pertaining to the conduct of the JEE (Main) – 2025 Examination including Results shall  fall within the jurisdiction of Delhi/New Delhi only. Further, any grievance/representation  arising out of the Result shall be entertained only when raised within 30 days from the declaration  of the result. Further, the copy of the Legal Notice or Writ Petition/ Petition/Appeal/ Reply may  be served to NTA only through: leg

In [12]:
df = pd.DataFrame(pages_and_texts)
df.describe().round(2).head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy,num_chunks
count,470.0,470.0,470.0,470.0,470.0,470.0,470.0
mean,234.5,1620.17,326.7,11.1,405.04,9.96,1.54
std,135.82,1219.82,338.69,10.65,304.96,9.46,0.95
min,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,117.25,305.0,145.0,1.0,76.25,1.0,1.0


In [13]:
pages_and_chunks = []
for item in tqdm(pages_and_texts, desc = "Building final structure"):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        joined_sentence_chunk = " ".join(sentence_chunk).replace(" ", " ").strip()
        joined_sentence_chunk = re.sub(r"\.([A-Z])", r".\1", joined_sentence_chunk)
        chunk_dict["chunk"] = joined_sentence_chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ") if word])
       
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk)/4

        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)


Building final structure: 100%|██████████| 470/470 [00:00<00:00, 22388.68it/s]


724

In [14]:
random.sample(pages_and_chunks, k=1)

[{'page_number': 186,
  'chunk': '© JEE (Advanced) 2025 – Information Brochure            88          FORM-PwD (III)  Form-III  Disability Certificate  (In cases of multiple disabilities)  (NAME AND ADDRESS OF THE MEDICAL AUTHORITY ISSUING THE  CERTIFICATE)  (See rule 4)      Certificate No.__________________________________                             Date: __________    This is to certify that I have carefully examined  Shri/Smt./Kum.____________________________________________ son/ wife/daughter of  Shri____________________________________________________  Date of Birth (DD/MM/YY) _________________________ Age_________ years,  male/female________________ Registration No. _____________________________________  permanent resident of House No. _________________________________ Ward/Village/Street  _____________________________ Post Office _________________________________  District ______________________________ State  __________________________________________, whose photograph is aff

In [15]:
df.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text,sentences,page_sentence_count_spacy,sentence_chunks,num_chunks
0,0,266,41,1,66.5,West Bengal Joint Entrance Examinations Board ...,[West Bengal Joint Entrance Examinations Board...,1,[[West Bengal Joint Entrance Examinations Boar...,1
1,1,447,70,3,111.75,INFORMATION BULLETIN-WBJEE-2025 2 WBJEEB ...,[INFORMATION BULLETIN-WBJEE-2025 2 WBJEEB ...,3,[[INFORMATION BULLETIN-WBJEE-2025 2 WBJEEB...,1
2,2,2674,440,29,668.5,INFORMATION BULLETIN-WBJEE-2025 3 WBJEEB ...,[INFORMATION BULLETIN-WBJEE-2025 3 WBJEEB ...,29,[[INFORMATION BULLETIN-WBJEE-2025 3 WBJEEB...,3
3,3,2264,394,17,566.0,INFORMATION BULLETIN-WBJEE-2025 4 WBJEEB ...,[INFORMATION BULLETIN-WBJEE-2025 4 WBJEEB ...,17,[[INFORMATION BULLETIN-WBJEE-2025 4 WBJEEB...,2
4,4,2006,411,3,501.5,INFORMATION BULLETIN-WBJEE-2025 5 WBJEEB ...,[INFORMATION BULLETIN-WBJEE-2025 5 WBJEEB ...,3,[[INFORMATION BULLETIN-WBJEE-2025 5 WBJEEB...,1


In [16]:
min_token_length = 30
chunks_df = pd.DataFrame(pages_and_chunks)
for row in chunks_df[chunks_df["chunk_token_count"] <= min_token_length].sample(5).iterrows():
    print(f"Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["chunk"]}")
    

Chunk token count: 0.75 | Text: 11.
Chunk token count: 0.0 | Text: 
Chunk token count: 0.0 | Text: 
Chunk token count: 0.5 | Text: 65
Chunk token count: 24.5 | Text: The Institute has well-established Ph.D. Programmes in Mathematics, Computer Science, and Physics.


In [17]:
pages_and_chunks_min_token_len = chunks_df[chunks_df["chunk_token_count"]>min_token_length].to_dict(orient = "records")
pages_and_chunks_min_token_len[:2]

[{'page_number': 0,
  'chunk': 'West Bengal Joint Entrance Examinations Board  West Bengal    Engineering /Technology  Date of Examination  27.04.2025 (Sunday)  Architecture  Pharmacy  Common Entrance Test for admission to UG Courses in Engineering/Technology, Pharmacy and Architecture  WBJEE-2025',
  'chunk_char_count': 266,
  'chunk_word_count': 31,
  'chunk_token_count': 66.5},
 {'page_number': 1,
  'chunk': 'INFORMATION BULLETIN-WBJEE-2025   2   WBJEEB  INFORMATION BULLETIN  WBJEE-2025  Common Entrance Test for admission to UG Courses in  Engineering/Technology, Pharmacy and Architecture  Candidates must go through the Information Bulletin  carefully before registration for the entrance examination.  West Bengal Joint Entrance Examinations Board  RUPANNA  DB-118, Sector-I, Salt Lake City  Kolkata 700064  Toll free No.- 1800-123-4782 (Extn. No.- 2)',
  'chunk_char_count': 447,
  'chunk_word_count': 55,
  'chunk_token_count': 111.75}]

In [18]:
def flatten_chunks(pages_and_texts):
    pages_and_chunks = []
    for item in tqdm(pages_and_texts, desc="Building chunks"):
        for chunk in item["sentence_chunks"]:
            joined = " ".join(chunk).strip()
            if len(joined)/4 > 10:
                pages_and_chunks.append({
                    "page_number": item["page_number"],
                    "chunk": joined,
                    "embedding": None
                })
    return pages_and_chunks

In [19]:
def embed_chunks(pages_and_chunks):
    model = SentenceTransformer("all-mpnet-base-v2", device="cpu")
    for item in tqdm(pages_and_chunks, desc="Generating embeddings"):
        item["embedding"] = model.encode(item["chunk"])
    return model, pages_and_chunks


In [20]:
def collect_preferences():
    print(" Hey there! I’m your college guide bot. I’ll help you shortlist the best-fit engineering colleges for you.\n")
    print("Let’s chat a bit so I can understand what you're looking for. Ready? Let's go!\n")

    preferences = {}

    preferences["general"] = input(" What are some things you’re looking for in a college? (e.g., branch, location, campus life, research, etc.)\n> ")

    preferences["branch"] = input("\n Do you have a preferred engineering branch or field? (If not sure, just say that!)\n> ")

    preferences["location"] = input("\n Any preferred states or cities for college?\n> ")

    preferences["college_type"] = input("\n Do you prefer a government college (like NITs/IITs) or private universities — or are you open to both?\n> ")

    preferences["campus_life"] = input("\n How important is campus life (clubs, fests, student activities) for you?\n> ")

    preferences["academics_vs_fun"] = input("\n Would you like a college that’s more academically focused, fun/social, or a mix of both?\n> ")

    preferences["budget"] = input("\n Are there any budget or fee constraints I should know about?\n> ")

    preferences["extra_notes"] = input("\n Anything else you'd like me to keep in mind? (e.g., placements, internships, foreign exchange, etc.)\n> ")

    print("\nThanks! Let me think for a moment... \n")

    
    preference_summary = (
        f"General preferences: {preferences['general']}\n"
        f"Preferred branch: {preferences['branch']}\n"
        f"Preferred location: {preferences['location']}\n"
        f"College type preference: {preferences['college_type']}\n"
        f"Campus life importance: {preferences['campus_life']}\n"
        f"Academic vs Fun balance: {preferences['academics_vs_fun']}\n"
        f"Budget/Fees: {preferences['budget']}\n"
        f"Additional notes: {preferences['extra_notes']}"
    )

    return preferences


In [21]:
def chunk_pdf(pages_and_texts, chunk_size=500):
    pages_and_chunks = []
    for page in pages_and_texts:
        text = page["text"]
        
        chunks = wrap(text, chunk_size)
        for chunk in chunks:
            pages_and_chunks.append({
                "chunk": chunk,
                "page_number": page["page_number"]
            })
    return pages_and_chunks

In [23]:
model, embedded_chunks = embed_chunks(pages_and_chunks)




Generating embeddings: 100%|██████████| 724/724 [06:36<00:00,  1.83it/s]


In [24]:
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_min_token_len)
text_chunks_and_embeddings_df.head()

Unnamed: 0,page_number,chunk,chunk_char_count,chunk_word_count,chunk_token_count
0,0,West Bengal Joint Entrance Examinations Board ...,266,31,66.5
1,1,INFORMATION BULLETIN-WBJEE-2025 2 WBJEEB ...,447,55,111.75
2,2,INFORMATION BULLETIN-WBJEE-2025 3 WBJEEB ...,890,136,222.5
3,3,INFORMATION BULLETIN-WBJEE-2025 4 WBJEEB ...,1337,203,334.25
4,3,• If a candidate does not keep himself/herself...,134,21,33.5


In [25]:
embeddings_df_save_path = r"C:\Users\ujjwal\OneDrive\Desktop\Projects\CC_Task2\text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index = False)

In [26]:
pages_and_chunks[2]

{'page_number': 2,
 'chunk': 'INFORMATION BULLETIN-WBJEE-2025   3   WBJEEB  IMPORTANT INSTRUCTIONS TO CANDIDATES WHILE REGISTERING FOR  WBJEE-2025  After an application is received, it will be assumed that the applicant accepts all terms, conditions,  and guidelines listed in the Information Bulletin and any relevant notices that the Board issued for  that purpose.  Any application not in compliance with the conditions specified in the Information Bulletin  is liable to be rejected.  1. Application for the examination must be done online only. No printed application form is  available.  2. Ensure to fill up the genuine application form available online only at www.wbjeeb.nic.in  3. Do not attempt to make any duplicate application.  4. It is essential to have a valid mobile number and a unique, valid email ID.  All future communications by the Board will be sent to the registered mobile number and email  ID.',
 'chunk_char_count': 890,
 'chunk_word_count': 136,
 'chunk_token_count': 222

In [27]:
def print_wrapped(text, wrap_length = 50):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [None]:
query = "Programs offered at IIT Delhi"
print(f"Query: {query}")
for score, idx in zip(top_results[0], top_results[1]):
    print(f"Score: {score:.4f}")
    print("Text:")
    print(pages_and_chunks[idx]["chunk"])
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

In [28]:
def dot_product(vector1, vector2):
    return torch.dot(vector1, vector2)
def cosine_similarity(vector1, vector2):
    dot_product = torch.dot(vector1, vector2)
    norm_vector1 = torch.sqrt(torch.sum(vector1**2))
    norm_vector2 = torch.sqrt(torch.sum(vector2**2))
    return dot_product / (norm_vector1 * norm_vector2)



In [29]:
def retrieve_sources(query, pages_and_chunks, model, number_to_return=5, print_time=True):
    
    query_embedding = model.encode(query, convert_to_tensor=True)
    start_time = timer()


    embeddings = [item["embedding"] for item in pages_and_chunks]

    
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"Retrieved in {end_time - start_time:.2f} seconds")

    
    top_results = torch.topk(dot_scores, k=number_to_return)
    return top_results.values, top_results.indices


def print_top_results_and_scores(query: str, pages_and_chunks, model: SentenceTransformer, number_to_return: int = 5):
    scores, indices = retrieve_sources(query, pages_and_chunks, model, number_to_return=number_to_return)
    for score, idx in zip(scores, indices):
        print(f"Score: {score:.4f}")  
        print("Text:")
        print(pages_and_chunks[idx.item()]["chunk"])  
        print(f"Page number: {pages_and_chunks[idx.item()]['page_number']}")
        print("\n" + "-"*60 + "\n")


In [None]:
query = "2+2 dual degree programs"

print_top_results_and_Scores(
    query=query, 
    embeddings=embeddings, 
    pages_and_chunks=pages_and_chunks,
    model=model)

In [30]:
import google.generativeai as genai
genai.configure(api_key="AIzaSyDWUo-rusTBagnDnQi-GFSKwzQ1VKdV8iQ")
gemini_model = genai.GenerativeModel("gemini-2.5-pro-exp-03-25")

In [31]:
def collect_user_preferences():
    print("Hey there! I’m your college guide bot. I’ll help you shortlist the best-fit engineering colleges for you.\n")
    print("Let’s chat a bit so I can understand what you're looking for. Ready? Let's go!\n")

    preferences = {}

    preferences["general"] = input("What are some things you’re looking for in a college? (e.g., branch, location, campus life, research, etc.)\n> ")
    preferences["branch"] = input("\nDo you have a preferred engineering branch or field? (If not sure, just say that!)\n> ")
    preferences["location"] = input("\nAny preferred states or cities for college?\n> ")
    preferences["college_type"] = input("\nDo you prefer a government college (like NITs/IITs) or private universities — or are you open to both?\n> ")
    preferences["campus_life"] = input("\nHow important is campus life (clubs, fests, student activities) for you?\n> ")
    preferences["academics_vs_fun"] = input("\nWould you like a college that’s more academically focused, fun/social, or a mix of both?\n> ")
    preferences["budget"] = input("\nAre there any budget or fee constraints I should know about?\n> ")
    preferences["extra_notes"] = input("\nAnything else you'd like me to keep in mind? (e.g., placements, internships, foreign exchange, etc.)\n> ")

    gave_exam = input("\nHave you given any entrance exam yet? (yes/no)\n> ").strip().lower()
    if gave_exam == "yes":
        preferences["gave_exam"] = True
        preferences["exam"] = input("\nWhich entrance exam did you give? (e.g., JEE Main, BITSAT, etc.)\n> ")
        preferences["score"] = input("\nWhat was your score or percentile?\n> ")
    else:
        preferences["gave_exam"] = False

    print("\nThanks! Let me think for a moment... \n")
    return preferences


In [32]:
def generate_final_response_with_gemini(preferences, top_chunks):
    context = "\n\n".join([chunk["chunk"] for chunk in top_chunks])
    prompt = (
        f"You are a helpful college counselor.\n\n"
        f"Student Preferences:\n"
        f"- Preferred Branch: {preferences['branch']}\n"
        f"- Location Preference: {preferences['location']}\n"
        f"- College Type: {preferences['college_type']}\n"
        f"- Campus Life Importance: {preferences['campus_life']}\n"
        f"- Academic vs Fun Preference: {preferences['academics_vs_fun']}\n"
        f"- Budget: {preferences['budget']}\n"
        f"- General Interests: {preferences['general']}\n"
        f"- Other Notes: {preferences['extra_notes']}\n\n"
        f"Based on the official brochure below, strictly recommend up to 3 engineering colleges in India that are the best fit.\n\n"
        f"--- Brochure Snippets ---\n{context}"
    )

    response = gemini_model.generate_content(prompt)
    print("\n🎓 Final Recommendations:\n")
    print(response.text)


In [33]:
def main():
    pages = open_and_read_pdf(r"C:\Users\ujjwal\OneDrive\Desktop\Projects\CC_Task2\Engg_colleges.pdf")
    pages = chunk_sentences(pages)
    pages_and_chunks = flatten_chunks(pages)
    embedding_model, pages_and_chunks = embed_chunks(pages_and_chunks)

    preferences = collect_user_preferences()
    query = (
    f"{preferences['branch']} engineering programs "
    f"in {preferences['location']} "
    f"at {preferences['college_type']} colleges "
    f"with focus on {preferences['general']} "
    f"and a budget of {preferences['budget']}"
)

    scores, indices = retrieve_sources(query, pages_and_chunks, embedding_model)
    top_chunks = [pages_and_chunks[idx.item()] for idx in indices]

    generate_final_response_with_gemini(preferences, top_chunks)

In [35]:
if __name__ == "__main__":
    main()


Reading PDF: 470it [00:05, 93.25it/s] 
Extracting Sentences: 100%|██████████| 470/470 [00:01<00:00, 265.32it/s]
Building chunks: 100%|██████████| 470/470 [00:00<00:00, 235325.64it/s]
Generating embeddings: 100%|██████████| 706/706 [08:27<00:00,  1.39it/s]


Hey there! I’m your college guide bot. I’ll help you shortlist the best-fit engineering colleges for you.

Let’s chat a bit so I can understand what you're looking for. Ready? Let's go!


Thanks! Let me think for a moment... 

Retrieved in 0.07 seconds

🎓 Final Recommendations:

Okay, let's look at the provided brochure snippets and your preferences to find the best fit. You're looking for Electrical Engineering, preferably in Delhi or Mumbai, with a focus on academics, research, good placements, and foreign exchange opportunities.

Based *strictly* on the information in the provided snippets:

1.  **Indian Institute of Technology Delhi (IITD):**
    *   **Location:** Located in New Delhi, matching your Delhi preference (Snippet 4).
    *   **Institution Type:** It's an IIT, listed as a participating institute for JEE (Advanced) 2025 (Snippets 1 & 4). IITs are premier government institutions known for strong academics, research focus, and excellent placements, aligning with your prefer