In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Personalized AI Career Coach for Students

## The Problem: Navigating Career Paths is Overwhelming for Students

Many students struggle to make informed career decisions due to the overwhelming number of options, the disconnect between academic studies and job market demands, and the lack of personalized guidance. Traditional career counseling often lacks scalability, is generic, and doesn’t keep pace with emerging job roles or tech trends.

## The Solution: Generative AI-Powered Career Assistant

This notebook demonstrates how **Generative AI** can revolutionize career guidance by:
- **Parsing and understanding student documents** (like resumes or transcripts)
- **Extracting structured profiles** from unstructured text
- **Generating smart, contextual queries** based on student profiles
- **Retrieving relevant job and learning data** from a vector database (RAG - Retrieval-Augmented Generation)
- **Producing personalized career recommendations** including emerging job roles, suggested learning paths, and upskilling advice

## How We Implemented It

In the notebook, we walk through the entire solution pipeline with code:
- Extracting content from **PDF/DOCX files**
- Using **LLMs (like Gemini Pro)** to understand and summarize documents
- Generating a **natural language query** from the student profile
- Retrieving top matching jobs and course recommendations using **semantic search and embeddings** (via `all-MiniLM-L6-v2`)
- Delivering final AI-powered career advice using **Generative AI**

This project highlights the power of GenAI in solving real-world problems, particularly in **education and career planning**.

**Next Steps:** We are currently working on turning this into a web application where users can simply upload their CVs or academic documents and instantly receive personalized, AI-driven career guidance—making impactful advice accessible to every student.


## Vector Store

In [46]:
pip install -q faiss-cpu datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import faiss
import json
import os
from tqdm import tqdm

from sklearn.preprocessing import normalize
from typing import List
from datasets import load_dataset
import kagglehub

In [4]:
# Hugging Face Datasets
job_descriptions = load_dataset("batuhanmtl/job-skill-set")["train"]
required_skillsets = load_dataset("aicinema69/Resume-Job")["train"]
salaries_skillsets = load_dataset("will4381/job-posting-classification")["train"]

# KaggleHub Datasets — returns file path, so load them manually
kaggle_job_path = kagglehub.dataset_download("arshkon/linkedin-job-postings")
kaggle_coursera_path = kagglehub.dataset_download("anusreemohanan/coursera-course-details")
kaggle_combined_path = kagglehub.dataset_download("kararhaitham/courses")

README.md:   0%|          | 0.00/2.75k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.52M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1167 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

lama2_formatted_data.json:   0%|          | 0.00/19.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2277 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/729 [00:00<?, ?B/s]

parsed_job_data.csv:   0%|          | 0.00/164M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/33233 [00:00<?, ? examples/s]

In [5]:
# Load CSVs/JSONs
linkedin_jobs = pd.read_csv(os.path.join(kaggle_job_path, "postings.csv"))
coursera_courses = pd.read_csv(os.path.join(kaggle_coursera_path, "coursera.csv"), encoding='latin-1')
combined_courses = pd.read_json(os.path.join(kaggle_combined_path, "combined_dataset.json"))

In [6]:
chunks = []

# Process batuhanmtl/job-skill-set
for row in job_descriptions:
    title = row.get("job_title", "")
    desc = row.get("job_description", "")
    skills = ", ".join(row.get("job_skill_set", [])) if row.get("job_skill_set") else ""
    combined = f"{title}. {desc}. Required skills: {skills}."
    chunks.append({
        "text": combined.strip(),
        "metadata": {"type": "job", "source": "huggingface_batuhanmtl", "title": title}
    })

In [7]:
# Process aicinema69/Resume-Job
for row in required_skillsets:
    title = row.get("job_title", "")
    skills = row.get("skills_required", "")
    desc = row.get("description", "")
    combined = f"{title}. Description: {desc}. Required Skills: {skills}."
    chunks.append({
        "text": combined.strip(),
        "metadata": {"type": "job", "source": "huggingface_aicinema69", "title": title}
    })

In [8]:
# Process will4381/job-posting-classification
for row in salaries_skillsets:
    title = row.get("title", "")
    category = row.get("category", "")
    desc = row.get("original_description", "")
    combined = f"{title}. {desc}. Category: {category}."
    chunks.append({
        "text": combined.strip(),
        "metadata": {"type": "job", "source": "huggingface_will4381", "title": title}
    })

In [9]:
# Process Kaggle: arshkon/linkedin-job-postings
for _, row in linkedin_jobs.iterrows(): # Changed job_listings to linkedin_jobs
    title = row.get("Job Title", "")
    desc = row.get("Description", "")
    salary = row.get("Salary Estimate", "")
    combined = f"{title}. {desc}. Salary: {salary}."
    chunks.append({
        "text": combined.strip(),
        "metadata": {"type": "job", "source": "kaggle_linkedin", "title": title}
    })

In [10]:
# Process Kaggle: anusreemohanan/coursera-course-details
for _, row in coursera_courses.iterrows():
    name = row.get("Course Name", "")
    desc = row.get("Course Description", "")
    level = row.get("Level", "")
    combined = f"{name}. Description: {desc}. Level: {level}."
    chunks.append({
        "text": combined.strip(),
        "metadata": {"type": "course", "source": "kaggle_coursera", "title": name}
    })

In [11]:
# Process Kaggle: kararhaitham/courses (combined)
for _, row in combined_courses.iterrows():
    name = row.get("course_title", "")
    desc = row.get("description", "")
    platform = row.get("platform", "")
    combined = f"{name}. {desc}. Platform: {platform}."
    chunks.append({
        "text": combined.strip(),
        "metadata": {"type": "course", "source": "kaggle_combined_courses", "title": name}
    })

In [12]:
print(f"Total chunks: {len(chunks)}")

Total chunks: 180855


In [13]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast & good enough for most tasks

texts = [chunk['text'] for chunk in chunks]
embeddings = model.encode(texts, show_progress_bar=True, batch_size=64, normalize_embeddings=True)

2025-04-20 04:57:54.038546: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745125074.263617      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745125074.323591      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2826 [00:00<?, ?it/s]

In [14]:
# Create a FAISS index
dim = embeddings.shape[1]  # The dimension of the embeddings
index = faiss.IndexFlatL2(dim)  # Use L2 distance (Euclidean distance)

# Convert embeddings to a numpy array (FAISS requires this format)
embeddings_np = np.array(embeddings).astype(np.float32)

# Add embeddings to the FAISS index
index.add(embeddings_np)

# Save the FAISS index for later use
faiss.write_index(index, "career_coach_index.index")

In [15]:
def search_vector(query: str, top_k: int = 5):
    # Generate the embedding for the query
    query_embedding = model.encode([query], normalize_embeddings=True)

    # Perform the search in the FAISS index
    query_embedding = np.array(query_embedding).astype(np.float32)
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve the closest matching chunks based on indices
    results = []
    for i, idx in enumerate(indices[0]):
        result = chunks[idx]
        result["distance"] = distances[0][i]  # Include the distance (similarity score)
        results.append(result)

    return results

## Document Parser

In [18]:
!pip install -q PyMuPDF python-docx google-generativeai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
import fitz
import docx

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    return "\n".join(page.get_text() for page in doc)

def extract_text_from_docx(docx_path):
    doc = docx.Document(docx_path)
    return "\n".join(para.text for para in doc.paragraphs)

def extract_text(file_path):
    if file_path.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    elif file_path.endswith(".docx"):
        return extract_text_from_docx(file_path)
    else:
        raise ValueError("Unsupported file type.")

In [20]:
# Upload your file (use appropriate path)
file_path = "/kaggle/input/samplecv/sampleCV.pdf" 

In [21]:
# Extract text
raw_text = extract_text(file_path)

In [22]:
print(raw_text)

EDUCATION
RICHARD SANCHEZ
MARKETING MANAGER 
CONTACT
+123-456-7890
hello@reallygreatsite.com
123 Anywhere St., Any City
www.reallygreatsite.com
SKILLS
Project Management
Public Relations
Teamwork
Time Management
Leadership
Effective Communication
Critical Thinking
WARDIERE UNIVERSITY
Master of Business
Management
2029 - 2030
2025 - 2029 
WARDIERE UNIVERSITY
Bachelor of Business
GPA: 3.8 / 4.0
English (Fluent)
French (Fluent)
German (Basics)
Spanish (Intermediate)
LANGUAGES
WORK EXPERIENCE
REFERENCE
PROFILE
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam quis
nostrud exercitation. Lorem ipsum dolor sit amet, consectetur adipiscing elit,
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad
minim veniam quis nostrud exercitation. Ut enim ad minim veniam quis nostrud
exercitation.
123-456-7890
hello@reallygreatsite.com
Harper Richard
Phone: 
Email :
Wardiere Inc. / C

In [26]:
import google.generativeai as genai
import os
from kaggle_secrets import UserSecretsClient

In [29]:
# Gemini API Key
GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)  # 🔐 Replace with your API key

In [30]:
# Structured Output Using Gemini

def get_structured_profile(text):
    model = genai.GenerativeModel("gemini-2.0-flash")

    prompt = f"""
    You are a helpful assistant. Read the resume or transcript text below and extract the following fields in valid JSON format:
    - name (string)
    - degree (list of strings)
    - GPA (float, only the value obtained by the student — not the scale)
    - skills (list of strings)
    - interests (list of strings)
    Resume/Transcript Text:
    \"\"\"
    {text}
    \"\"\"
    Only return JSON object. Do not Do not explain anything.
    """
    response = model.generate_content(prompt)
    return response.text

In [33]:
import json
import re

profile_json = get_structured_profile(raw_text)
profile_json = re.sub(r"^```(?:json)?\s*|\s*```$", "", profile_json.strip())

In [34]:
# View result
print("Extracted Student Profile:\n")
print(profile_json)

Extracted Student Profile:

{
  "name": "Richard Sanchez",
  "degree": [
    "Master of Business Management",
    "Bachelor of Business"
  ],
  "GPA": 3.8,
  "skills": [
    "Project Management",
    "Public Relations",
    "Teamwork",
    "Time Management",
    "Leadership",
    "Effective Communication",
    "Critical Thinking"
  ],
  "interests": []
}


In [35]:
student_profile_json = profile_json

# Parse the JSON into a Python dictionary
try:
    profile = json.loads(student_profile_json)
except json.JSONDecodeError as e:
    raise ValueError(f"Invalid JSON input: {e}")

# Validate required fields
required_fields = ["degree", "GPA", "interests", "skills"]
missing = [field for field in required_fields if field not in profile]

if missing:
    raise ValueError(f"Missing required fields: {missing}")

In [36]:
# Smart Query Construction
degree = profile.get("degree", "unknown degree")
gpa = profile.get("GPA", 0.0)
interests = profile.get("interests", [])
skills = profile.get("skills", [])

# Helper to phrase lists nicely
def list_to_phrase(items):
    if len(items) == 0:
        return ""
    elif len(items) == 1:
        return items[0]
    else:
        return ", ".join(items[:-1]) + " and " + items[-1]

# Analyze strength level based on GPA
if gpa >= 3.5:
    performance = "strong academic performance"
elif 3.0 <= gpa < 3.5:
    performance = "average academic performance"
else:
    performance = "needs improvement in academic performance"

# Build a flexible natural language query
query = (
    f"Suggest high-potential career opportunities and emerging job roles for a student "
    f"with a {degree}, {performance}, skilled in {list_to_phrase(skills)}, and interested in {list_to_phrase(interests)}. "
    f"Also recommend relevant certifications or learning paths to enhance employability in these fields."
)

print("Final RAG Query:\n")
print(query)

Final RAG Query:

Suggest high-potential career opportunities and emerging job roles for a student with a ['Master of Business Management', 'Bachelor of Business'], strong academic performance, skilled in Project Management, Public Relations, Teamwork, Time Management, Leadership, Effective Communication and Critical Thinking, and interested in . Also recommend relevant certifications or learning paths to enhance employability in these fields.


In [37]:
# Call search_vector() function
retrieved_results = search_vector(query, top_k=5)

# Check if results are retrieved
if not retrieved_results:
    print("No documents retrieved! Please check the query.")
else:
    print(f"Retrieved {len(retrieved_results)} documents.\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Retrieved 5 documents.



In [38]:
prepared_docs = []

for result in retrieved_results:
    title = result.get('metadata', {}).get('title', 'No Title')
    text = result.get('text', 'No Text Available')
    distance = result.get('distance', None)

    # Relaxed filter
    if distance is not None and distance > 0.8:  # Not very strict now
        continue

    combined_text = f"Title: {title}\nContent: {text}\nSimilarity Score: {distance}\n"
    prepared_docs.append((combined_text, distance))  # Save both text + distance for sorting later

# Sort by distance (smallest first = most similar)
prepared_docs = sorted(prepared_docs, key=lambda x: x[1])

# Take only top 5
prepared_docs = prepared_docs[:5]

# Print nicely
for idx, (doc, distance) in enumerate(prepared_docs, start=1):
    print(f"Document {idx}:\n{doc}\n{'-'*80}\n")

Document 1:
Title: 
Content: . Job Title: Project ManagerLocation: Columbus, OH (Hybrid)Duration: C2H
Job Description:• BS/BA degree (preferred) or equivalent experience• 3+ years of demonstrated experience in vendor, partner, or relationship management• Experience with third party risk oversight (preferred)• Strong competencies in use of MS Office packages (e.g., Excel, PowerPoint)• Strong at Multi-Tasking and time management discipline• Working with clients and Project management experience would be beneficial.• Strong understanding of banking and departmental procedures, functions, and activities
Ideal Candidates Would Be Able to Do the Following:• Demonstrate excellent relationship management, organizational and time management skills, and the ability to partner with internal stakeholders and external vendors to prioritize and deliver initiatives• Ability to obsess about the overall customer experience and provide a world class vendor management service.• Exemplary negotiation skil

In [39]:
def suggest_career_path_with_gemini(prepared_docs, student_profile_json):
    """
    Suggests career paths, required skills, and courses using Gemini and prepared documents.

    Args:
        prepared_docs (list): List of tuples containing (document_text, distance).
        student_profile_json (str): JSON string representing the student's profile.

    Returns:
        dict: A dictionary with keys 'career_paths', 'required_skills', and 'recommended_courses'.
    """

    # Prepare context for Gemini
    context = "\n".join([doc for doc, _ in prepared_docs])
    context += f"\n\nStudent Profile:\n{student_profile_json}\n"
    context += "Databases:\nlinkedin_jobs, coursera_courses, combined_courses\n"

    # Construct the prompt for a more conversational response
    prompt = f"""
You are a personalized AI career coach. Based on the student's profile and job description provided, suggest suitable career paths, required skills, and recommend courses. Present the results in the following format:
Context:
{context}

Hi, {{name}}, I'm your personalized AI career coach.
Based on your profile and the job description you provided, here's what I recommend:

These are the career paths I suggest for you: {{career_paths}}.

These are the required skills you should focus on: {{required_skills}}. You already have a strong foundation in areas like project management, time management, and communication, so focusing on the specifics mentioned in the job description will make you a stronger candidate.

These are the courses I would recommend to you:
{{courses_list}}

Format the career paths, required skills, and courses in a clean, readable format without extra markdown or quotes.
"""

    # Generate response using Gemini
    model = genai.GenerativeModel("gemini-2.0-flash")
    response = model.generate_content(prompt)
    output_text = response.text.strip()

    # Use `re` to clean up and format the response
    # This regex will clean up unnecessary markdown and add the personalized format
    output_text = re.sub(r"^```(?:json)?\s*|\s*```$", "", output_text.strip())  # Remove any code block
    output_text = re.sub(r"career paths", "career paths", output_text, flags=re.IGNORECASE)  # Ensure consistency in capitalization

    # Format the final response with re.sub to insert the correct labels in the response
    output_text = re.sub(
        r"(career paths.*?:)(.*?)\.",
        r"These are the career paths I suggest for you: \2.",
        output_text
    )

    output_text = re.sub(
        r"(required skills.*?:)(.*?)\.",
        r"These are the required skills you should focus on: \2.",
        output_text
    )

    output_text = re.sub(
        r"(recommended courses.*?:)(.*?)\.",
        r"These are the courses I would recommend to you: \2.",
        output_text
    )

    # Return the cleaned and formatted response
    return output_text

In [42]:
# Generate career advice using Gemini
career_advice = suggest_career_path_with_gemini(prepared_docs, student_profile_json)

In [45]:
from IPython.display import Markdown, display

# Display results
if career_advice:
    display(Markdown(career_advice))

Hi, Richard Sanchez, I'm your personalized AI career coach.
Based on your profile and the job description you provided, here's what I recommend:

These are the These are the career paths I suggest for you:  Project Manager, Vendor Manager, Relationship Manager, Business Analyst.

These are the These are the required skills you should focus on:  Vendor Management, Third-Party Risk Oversight, Negotiation, Contract Management, Risk Assessment, Banking Procedures, MS Office Suite (especially Excel and PowerPoint).

These are the courses I would recommend to you:

*   **Vendor Management:**
    *   Vendor Management: Mastering Vendor Selection and Relationships (Coursera)
    *   Supply Chain Management Specialization (Coursera) - Focus on modules related to supplier relationships.
*   **Third-Party Risk Management:**
    *   Third Party Risk Management (Various Platforms - Search on Coursera, LinkedIn Learning)
    *   Risk Management Professional (PMI-RMP) Certification Prep (Consider a prep course to understand risk management principles)
*   **Negotiation & Contract Management:**
    *   Successful Negotiation: Essential Strategies and Skills (Coursera)
    *   Contract Law: From Trust to Promise to Contract (Coursera)
*   **Banking & Financial Services:**
    *   Introduction to Finance (Coursera)
    *   Banking & Financial Institutions (Coursera) - Choose courses that cover banking procedures and regulations.
*   **Advanced MS Office Skills:**
    *   Microsoft Excel - Excel from Beginner to Advanced (Udemy)
    *   Mastering PowerPoint (LinkedIn Learning)