In [4]:
# Use the %pip magic so the package is installed into the running kernel's environment.
%pip install -q google-generativeai



In [19]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import torch
import json
import os
import datetime
from openai import OpenAI # Used for Ollama local client (optional)
import google.generativeai as genai # Import Google Generative AI library

# --- Configuration ---
# Directory of the fine-tuned QA model saved by Part 2 (must match the fine-tuning script's output).
MODEL_PATH = "/content/drive/MyDrive/fine_tuned_qa_model_job_course"
# Directory holding the original dataset CSV files (adjust as needed).
DATASET_PATH = "/content/drive/MyDrive/dataset/"
# JSON-lines file that feedback entries are appended to (new log file for this version).
FEEDBACK_LOG_PATH = "feedback_log_qa_job_course_v3_gemini.jsonl"

# Optional local LLM (Ollama) used for explanations/summaries.
OLLAMA_BASE_URL = "http://localhost:11434/v1"
OLLAMA_MODEL = "llama3"  # or whichever model was pulled
OLLAMA_TIMEOUT = 120.0

# `llm_client` stays None whenever the local endpoint cannot be reached.
llm_client = None
try:
    llm_client = OpenAI(base_url=OLLAMA_BASE_URL, api_key="ollama", timeout=OLLAMA_TIMEOUT)
    llm_client.models.list()  # connectivity probe; raises if the server is down
except Exception as exc:
    llm_client = None
    print(f"Warning: Could not initialize Ollama client for local explanations: {exc}")
    print("This feature will be disabled. Recommendations will be direct QA results (and Gemini fallback).")
else:
    print(f"Ollama client initialized for local model '{OLLAMA_MODEL}'. (Optional LLM for explanations)")


# Gemini API setup (Fallback LLM)
# The key is read from the environment so it never lives in the notebook source.
# In Colab, copy it from `google.colab.userdata` into os.environ before running this cell.
# SECURITY: a literal API key used to be hard-coded on this line. Treat that key as
# leaked and revoke/rotate it.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
gemini_model = None  # stays None unless configuration and the probe below both succeed
if not GOOGLE_API_KEY:
    print("WARNING: GOOGLE_API_KEY environment variable not set. Gemini fallback will be unavailable.")
    print("Please set it (e.g., `export GOOGLE_API_KEY='YOUR_API_KEY'`) before running the script.")
else:
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
        gemini_model = genai.GenerativeModel('gemini-2.0-flash')
        # Connectivity probe: accessing `.text` raises if the request failed or returned no text.
        gemini_model.generate_content("hello").text
        print("Gemini API configured successfully for fallback.")
    except Exception as e:
        print(f"Error configuring Gemini API: {e}")
        print("Gemini fallback will be unavailable. Check your API key and network connection.")
        gemini_model = None


# --- Load original datasets for contexts ---
# Failures abort with `raise SystemExit(1)` rather than `exit()`: `exit()` is the
# interactive-shell helper and is not a reliable way to stop a running notebook cell,
# so later statements could still run against undefined names.
try:
    jobs_df = pd.read_csv(os.path.join(DATASET_PATH, "job_skills.csv"))
    courses_df = pd.read_csv(os.path.join(DATASET_PATH, "coursea_data.csv"))
    print("New datasets (job_skills.csv, coursea_data.csv) loaded successfully for inference contexts.")
except FileNotFoundError as e:
    print(f"Error: One or more dataset files not found. Please check the DATASET_PATH and filenames. {e}")
    raise SystemExit(1) from e
except Exception as e:
    print(f"Error loading datasets: {e}")
    raise SystemExit(1) from e

# --- Initialize Local Embedding Model for RAG Retrieval ---
try:
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    print("SentenceTransformer (embedding) model loaded successfully for RAG.")
except Exception as e:
    print(f"Error loading SentenceTransformer model: {e}")
    raise SystemExit(1) from e

# --- Prepare all contexts and build FAISS index for retrieval ---
# Parallel lists: all_context_metadata[i] describes all_contexts_for_faiss[i].
all_contexts_for_faiss = []
all_context_metadata = []

# (label, column) pairs in the order they appear in the generated context text.
_JOB_CONTEXT_FIELDS = (
    ("Job Title", "Title"),
    ("Company", "Company"),
    ("Category", "Category"),
    ("Location", "Location"),
    ("Responsibilities", "Responsibilities"),
    ("Minimum Qualifications", "Minimum Qualifications"),
    ("Preferred Qualifications", "Preferred Qualifications"),
)
_COURSE_CONTEXT_FIELDS = (
    ("Course Title", "course_title"),
    ("Organization", "course_organization"),
    ("Certificate Type", "course_Certificate_type"),
    ("Rating", "course_rating"),
    ("Difficulty", "course_difficulty"),
    ("Students Enrolled", "course_students_enrolled"),
)
_CONTEXT_FIELDS_BY_TYPE = {"job": _JOB_CONTEXT_FIELDS, "course": _COURSE_CONTEXT_FIELDS}


def _clean_field(row, column):
    """Return row[column] as a stripped string; missing, None and NaN/NA become ''."""
    value = row.get(column, '')
    if value is None:
        return ''
    try:
        if pd.isna(value):
            return ''
    except (TypeError, ValueError):
        # pd.isna on a non-scalar yields an array with no single truth value;
        # such a value is not "missing", so fall through and stringify it.
        pass
    return str(value).strip()


def create_context_entry_for_faiss(row, context_type):
    """Build the retrieval text and metadata for one dataset row.

    Args:
        row: A row exposing `.get(column, default)` and `.to_dict()`
            (e.g. a pandas Series from `DataFrame.iterrows()`).
        context_type: "job" or "course"; selects which columns are used.

    Returns:
        `(text, metadata)` where `text` is "Label: value" parts joined by ". "
        and `metadata` is `{"type": context_type, "original_data": row.to_dict()}`.
        Returns `(None, None)` for an unknown `context_type` or when every
        selected column is empty.

    Missing cells (None/NaN) are skipped, so the text never contains a literal
    "nan" or "None" the way a plain `str(value)` would produce.
    """
    field_specs = _CONTEXT_FIELDS_BY_TYPE.get(context_type)
    if field_specs is None:
        return None, None

    text_parts = []
    for label, column in field_specs:
        value = _clean_field(row, column)
        if value:
            text_parts.append(f"{label}: {value}")

    if not text_parts:
        return None, None

    text = ". ".join(text_parts)
    return text, {"type": context_type, "original_data": row.to_dict()}

# Populate contexts and build FAISS index
print("Building contexts for FAISS index...")
# Jobs are added before courses, so FAISS row i maps to all_context_metadata[i].
for source_df, source_type in ((jobs_df, "job"), (courses_df, "course")):
    for _, row in source_df.iterrows():
        context_text, metadata = create_context_entry_for_faiss(row, source_type)
        if context_text:
            all_contexts_for_faiss.append(context_text)
            all_context_metadata.append(metadata)

if not all_contexts_for_faiss:
    print("No contexts generated for FAISS. Please check data processing.")
    # `exit()` is the interactive-shell helper and is not a reliable way to stop a
    # notebook cell; raising SystemExit halts execution here.
    raise SystemExit(1)

print(f"Encoding {len(all_contexts_for_faiss)} contexts for FAISS...")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device for embedding: {device}")

# Cast to float32 before handing the embeddings to the FAISS index.
context_embeddings = embedding_model.encode(
    all_contexts_for_faiss,
    show_progress_bar=True,
    convert_to_numpy=True,
    device=device
).astype('float32')

dimension = context_embeddings.shape[1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(context_embeddings)
print("FAISS index built.")

# --- Load the fine-tuned QA model and tokenizer ---
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForQuestionAnswering.from_pretrained(MODEL_PATH)
    print("Fine-tuned QA model loaded successfully.")
except Exception as e:
    print(f"Error loading fine-tuned QA model: {e}")
    print("Please ensure you have run the fine-tuning script (Part 2) and saved the model correctly.")
    # `exit()` is the interactive-shell helper and is not a reliable way to stop a
    # notebook cell; raising SystemExit keeps the pipeline below from running
    # against an undefined model.
    raise SystemExit(1) from e

# Extractive QA pipeline; device 0 selects the first GPU, -1 selects the CPU.
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# --- Feedback Logging Function ---
def _feedback_item_id(res):
    """Pick an identifier for one QA result to store in the feedback log.

    Prefers the row's 'Unnamed: 0' value (0 is a valid id), then falls back to
    the job 'Title' for jobs and 'course_title' otherwise.
    """
    data = res.get('original_data') or {}
    row_id = data.get('Unnamed: 0')
    if row_id is not None:
        return row_id
    if res.get('context_type') == 'job':
        return data.get('Title')
    return data.get('course_title')


def log_feedback(query, retrieved_qa_results, llm_response, user_feedback_text, source="Local_QA_RAG"):
    """Append one feedback record as a JSON line to FEEDBACK_LOG_PATH.

    Args:
        query: The user's query string.
        retrieved_qa_results: List of result dicts with 'score', 'answer',
            'context_type' and 'original_data'; None is treated as empty.
        llm_response: LLM text shown to the user; None is logged as "".
            Only the first 500 characters are stored.
        user_feedback_text: The feedback label or free text to record.
        source: Origin of the answer (e.g. "Local_QA_RAG", "Gemini_Fallback").
    """
    llm_text = "" if llm_response is None else str(llm_response)
    feedback_entry = {
        "timestamp": datetime.datetime.now().isoformat(),
        "query": query,
        "source": source,  # indicates local QA or Gemini fallback
        "qa_results_preview": [
            {
                "score": res.get('score'),
                "answer": res.get('answer'),
                "context_type": res.get('context_type'),
                "original_data_id": _feedback_item_id(res),
            }
            for res in (retrieved_qa_results or [])
        ],
        "llm_response_preview": llm_text[:500] + "..." if len(llm_text) > 500 else llm_text,
        "user_feedback": user_feedback_text,
    }
    with open(FEEDBACK_LOG_PATH, 'a', encoding='utf-8') as f:
        # default=str keeps non-JSON-native scalars (e.g. NumPy numbers) from aborting the write.
        f.write(json.dumps(feedback_entry, default=str) + '\n')
    print("Feedback logged successfully.")


# --- Function to get recommendations from Gemini (Fallback) ---
def get_gemini_recommendations(query, item_type=None):
    """Ask the globally configured Gemini model for fallback recommendations.

    Args:
        query: The user's original query; embedded verbatim in the prompt.
        item_type: Optional label for what to recommend (e.g. "jobs");
            when falsy the prompt asks for 'jobs and courses'.

    Returns:
        The model's response text, or a human-readable message string when the
        global `gemini_model` is unset or the API call fails. Never raises for
        API errors.
    """
    # `gemini_model` is the module-level object created during Gemini setup.
    if not gemini_model:
        return "Gemini API is not available or not configured."

    print("\n--- Generating recommendations from Gemini (Fallback) ---")

    target = item_type if item_type else 'jobs and courses'
    # Adjacent literals are concatenated as-is, so every sentence carries its own trailing space.
    gemini_prompt = (
        f"The user is asking for recommendations based on the query: '{query}'.\n"
        f"They were not fully satisfied with previous automated recommendations. "
        f"Please provide relevant and helpful recommendations for {target}. "
        f"Suggest specific job titles, companies, course names, and organizations where applicable. "
        f"Explain why these are good recommendations and suggest a potential career/learning path. "
        f"Be concise, actionable, and encouraging."
    )

    try:
        response = gemini_model.generate_content(gemini_prompt)
        return response.text
    except Exception as e:
        return f"Error communicating with Gemini API: {e}"

# --- Recommendation Function with RAG and Optional LLM Summary ---
# Accept the plural labels used at the interactive prompt as well as the
# singular tags stored in the context metadata; 'all' means "no filter".
_ITEM_TYPE_ALIASES = {"jobs": "job", "courses": "course", "all": None}


def _text_or_empty(value) -> str:
    """Return `value` as a string, mapping None and float NaN to ''."""
    if value is None:
        return ""
    if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
        return ""
    return str(value)


def _recommendation_key(res):
    """Build a de-duplication key for one QA result, or None if nothing identifies it.

    The key is namespaced by context type so a job and a course that share the
    same 'Unnamed: 0' value are not treated as duplicates of each other.
    """
    kind = res['context_type']
    if kind not in ('job', 'course'):
        return None
    data = res['original_data']
    row_id = data.get('Unnamed: 0')
    if row_id is None:
        if kind == 'job':
            title = _text_or_empty(data.get('Title'))
            company = _text_or_empty(data.get('Company'))
            row_id = f"{title}_{company}" if (title or company) else None
        else:
            row_id = _text_or_empty(data.get('course_title')) or None
    if row_id is None:
        return None
    return (kind, row_id)


def _format_recommendations_for_llm(recommendations):
    """Render the selected recommendations as the text block embedded in the LLM prompt."""
    lines = []
    for i, rec in enumerate(recommendations):
        data = rec['original_data']
        lines.append(f"Recommendation {i+1} (Type: {rec['context_type'].upper()}):")
        if rec['context_type'] == 'job':
            lines.append(f"  Job Title: {data.get('Title')}")
            lines.append(f"  Company: {data.get('Company')}")
            lines.append(f"  Key Info Extracted: {rec['answer']}")
            lines.append(f"  Responsibilities Preview: {_text_or_empty(data.get('Responsibilities'))[:100]}...")
        elif rec['context_type'] == 'course':
            lines.append(f"  Course Title: {data.get('course_title')}")
            lines.append(f"  Organization: {data.get('course_organization')}")
            lines.append(f"  Key Info Extracted: {rec['answer']}")
            lines.append(f"  Difficulty: {data.get('course_difficulty')}")
        lines.append("---")
    return "".join(line + "\n" for line in lines)


def get_qa_recommendations_with_rag(query: str, k: int = 5, item_type: str = None, num_retrieved_contexts: int = 10, min_score: float = 0.05):
    """Retrieve contexts with FAISS, run extractive QA on each, and optionally summarise.

    Args:
        query: The user's question.
        k: Maximum number of de-duplicated recommendations to return.
        item_type: Optional filter. Accepts "job"/"jobs" or "course"/"courses"
            (case-insensitive); None, "" or "all" disables filtering.
        num_retrieved_contexts: Number of nearest neighbours requested from FAISS.
            The type filter is applied after retrieval, so a filtered call may
            yield fewer candidates than this.
        min_score: QA results with a score at or below this value are discarded.

    Returns:
        `(recommendations, llm_summary_text)`. `recommendations` is a list of
        dicts with 'score', 'answer', 'context_type', 'original_data' and
        'full_context_text', sorted by descending score. `llm_summary_text` is
        "" when no local LLM client is active or nothing was recommended.
    """
    print(f"\nSearching for: '{query}'")

    wanted_type = None
    if item_type:
        normalized = str(item_type).strip().lower()
        wanted_type = _ITEM_TYPE_ALIASES.get(normalized, normalized)

    query_embedding = embedding_model.encode([query], convert_to_numpy=True).astype('float32')
    _, neighbor_ids = faiss_index.search(query_embedding, num_retrieved_contexts)

    retrieved_contexts_for_qa = []
    for idx in neighbor_ids[0]:
        # Skip ids outside the context list (FAISS reports -1 when it has fewer neighbours than requested).
        if not 0 <= idx < len(all_contexts_for_faiss):
            continue
        item_metadata = all_context_metadata[idx]
        if wanted_type and item_metadata['type'] != wanted_type:
            continue

        context_text = all_contexts_for_faiss[idx]
        try:
            qa_result = qa_pipeline(question=query, context=context_text)
            if qa_result['score'] > min_score and qa_result['answer'].strip() != "":
                retrieved_contexts_for_qa.append({
                    "score": qa_result['score'],
                    "answer": qa_result['answer'],
                    "context_type": item_metadata["type"],
                    "original_data": item_metadata["original_data"],
                    "full_context_text": context_text
                })
        except Exception as e:
            # Best effort: one failing context must not abort the whole query,
            # but the failure is reported instead of being silently dropped.
            print(f"Warning: QA model failed on retrieved context {idx}: {e}")
            continue

    retrieved_contexts_for_qa.sort(key=lambda x: x['score'], reverse=True)

    final_qa_recommendations = []
    seen_keys = set()
    for res in retrieved_contexts_for_qa:
        key = _recommendation_key(res)
        if key is None or key in seen_keys:
            continue
        seen_keys.add(key)
        final_qa_recommendations.append(res)
        if len(final_qa_recommendations) >= k:
            break

    llm_summary_text = ""
    # Only ask the local LLM when a client is active AND there is something to summarise.
    if llm_client and final_qa_recommendations:
        llm_prompt_context = _format_recommendations_for_llm(final_qa_recommendations)

        # Adjacent literals are concatenated as-is, so every sentence carries its own trailing space.
        system_message = (
            "You are an AI assistant that provides career and learning recommendations. "
            "You have retrieved specific job and course details based on a user's query. "
            "Your task is to synthesize these details into a concise, personalized summary. "
            "Explain *why* these specific items are relevant to the user's query and suggest a potential next step or learning path. "
            "Do NOT just re-list the items. Focus on explanation and actionable advice."
        )
        user_message = (
            f"The user's query is: '{query}'\n\n"
            f"Here are {len(final_qa_recommendations)} specific recommendations with extracted key information:\n\n"
            f"{llm_prompt_context}\n"
            f"Please provide a personalized summary and a suggested path."
        )

        try:
            response = llm_client.chat.completions.create(
                model=OLLAMA_MODEL,
                messages=[
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message}
                ],
                temperature=0.7,
                max_tokens=600
            )
            llm_summary_text = response.choices[0].message.content
            print("\n--- Local LLM's Personalized Recommendation & Summary ---")
            print(llm_summary_text)
        except Exception as e:
            print(f"Error calling local Ollama LLM for summary: {e}")
            llm_summary_text = "Could not generate a local LLM summary."

    return final_qa_recommendations, llm_summary_text

# --- Main Interaction Loop ---
def _preview(value, limit=150):
    """Return up to `limit` characters of `value`; None and float NaN give ''."""
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)[:limit]


# The prompt offers plural choices, while the context metadata is tagged with
# the singular "job" / "course"; 'all' means no filter.
_PROMPT_TO_CONTEXT_TYPE = {'jobs': 'job', 'courses': 'course', 'all': None}

if __name__ == "__main__":
    print("\nStarting Job and Course Recommendation System (Fine-tuned QA + RAG + Optional LLM Summary + Gemini Fallback).")
    print("Type 'quit' at any prompt to exit.")

    while True:
        user_query = input("\nEnter your query (e.g., 'Google Cloud Program Manager responsibilities', 'courses on data science', 'Django developer jobs'): ").strip()
        if user_query.lower() == 'quit':
            print("Exiting recommendation system. Goodbye!")
            break

        rec_type_input = input("Which type of recommendation? (jobs/courses/all): ").strip().lower()
        if rec_type_input == 'quit':
            # The banner promises 'quit' works at any prompt.
            print("Exiting recommendation system. Goodbye!")
            break
        if rec_type_input not in _PROMPT_TO_CONTEXT_TYPE:
            print("Invalid type. Please choose 'jobs', 'courses', or 'all'.")
            continue

        # Singular tag for the metadata filter; plural label for the Gemini prompt wording.
        item_type_filter = _PROMPT_TO_CONTEXT_TYPE[rec_type_input]
        gemini_item_type = rec_type_input if rec_type_input != 'all' else None

        # --- STEP 1: Get recommendations from local QA + RAG model ---
        recommendations, llm_summary = get_qa_recommendations_with_rag(
            user_query,
            k=5,
            item_type=item_type_filter,
            num_retrieved_contexts=10
        )

        # --- Display local recommendations ---
        if not recommendations:
            print(f"No {rec_type_input} recommendations found with sufficient QA confidence from local model.")
        else:
            print("\n--- Detailed QA-Based Recommendations (Local Model) ---")
            for i, rec in enumerate(recommendations):
                data = rec['original_data']
                print(f"\n{i+1}. Type: {rec['context_type'].capitalize()}, QA Score: {rec['score']:.4f}")
                if rec['context_type'] == 'job':
                    print(f"  Job Title: {data.get('Title')}")
                    print(f"  Company: {data.get('Company')}")
                    print(f"  Category: {data.get('Category')}")
                    print(f"  Location: {data.get('Location')}")
                    print(f"  Extracted Answer: {rec['answer']}")
                    # _preview guards against NaN/None cells, which cannot be sliced.
                    print(f"  Responsibilities (Preview): {_preview(data.get('Responsibilities'))}...")
                    print(f"  Min Qualifications (Preview): {_preview(data.get('Minimum Qualifications'))}...")
                    print(f"  Pref Qualifications (Preview): {_preview(data.get('Preferred Qualifications'))}...")
                elif rec['context_type'] == 'course':
                    print(f"  Course Title: {data.get('course_title')}")
                    print(f"  Organization: {data.get('course_organization')}")
                    print(f"  Certificate Type: {data.get('course_Certificate_type')}")
                    print(f"  Rating: {data.get('course_rating')}")
                    print(f"  Difficulty: {data.get('course_difficulty')}")
                    print(f"  Students Enrolled: {data.get('course_students_enrolled')}")
                    print(f"  Extracted Answer: {rec['answer']}")
                print("-" * 20)

        # --- STEP 2: Handle User Satisfaction and Gemini Fallback ---
        user_satisfaction = input("\nAre you satisfied with these recommendations? (yes/no/quit/gemini): ").strip().lower()

        if user_satisfaction == 'quit':
            log_feedback(user_query, recommendations, llm_summary, "Quit_Session_After_Local_Attempt", source="Local_QA_RAG")
            print("Exiting recommendation system. Goodbye!")
            break
        elif user_satisfaction == 'yes':
            log_feedback(user_query, recommendations, llm_summary, "Satisfied", source="Local_QA_RAG")
            print("Great! Glad I could help.")
        elif user_satisfaction in ('no', 'gemini') or not recommendations:
            # The user is not satisfied OR no local recommendations were found.
            if gemini_model:
                print("\nInitiating fallback to Gemini for recommendations...")
                gemini_output = get_gemini_recommendations(user_query, item_type=gemini_item_type)
                print("\n--- Recommendations from Gemini (Fallback) ---")
                print(gemini_output)

                final_feedback_after_gemini = input("\nWas Gemini's response helpful? (yes/no/quit): ").strip().lower()
                log_feedback(user_query, [], gemini_output, final_feedback_after_gemini, source="Gemini_Fallback")
                if final_feedback_after_gemini == 'quit':
                    print("Exiting recommendation system. Goodbye!")
                    break
            else:
                print("\nGemini fallback not available. Please configure your GOOGLE_API_KEY.")
                log_feedback(user_query, recommendations, llm_summary, "Not_Satisfied_No_Gemini_Fallback", source="Local_QA_RAG")
        else:
            # Any other free-text answer is logged verbatim as feedback.
            log_feedback(user_query, recommendations, llm_summary, user_satisfaction, source="Local_QA_RAG")
            print("Thanks for your feedback!")

        print("\n" + "="*80 + "\n")  # separator for the next interaction

This feature will be disabled. Recommendations will be direct QA results (and Gemini fallback).
Gemini API configured successfully for fallback.
New datasets (job_skills.csv, coursea_data.csv) loaded successfully for inference contexts.
SentenceTransformer (embedding) model loaded successfully for RAG.
Building contexts for FAISS index...
Encoding 2141 contexts for FAISS...
Using device for embedding: cuda


Batches:   0%|          | 0/67 [00:00<?, ?it/s]

Device set to use cuda:0


FAISS index built.
Fine-tuned QA model loaded successfully.

Starting Job and Course Recommendation System (Fine-tuned QA + RAG + Optional LLM Summary + Gemini Fallback).
Type 'quit' at any prompt to exit.

Enter your query (e.g., 'Google Cloud Program Manager responsibilities', 'courses on data science', 'Django developer jobs'): Recommend me Django Developer Jobs
Which type of recommendation? (jobs/courses/all): all

Searching for: 'Recommend me Django Developer Jobs'

--- Detailed QA-Based Recommendations (Local Model) ---

1. Type: Job, QA Score: 0.9999
  Job Title: Quantitative Analyst, Ads Quality
  Company: Google
  Category: Product & Customer Support
  Location: Zürich, Switzerland
  Extracted Answer: Quantitative Analyst, Ads Quality
  Responsibilities (Preview): Apply advanced statistical methods and work with large, complex data sets.
Solve difficult, non-routine challenges, and clearly communicate highly tec...
  Min Qualifications (Preview): PhD in Statistics or Econometr

In [2]:
# Use the %pip magic so the package is installed into the running kernel's environment.
%pip install -q faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1


In [16]:
import os
from google.colab import userdata
# Expose the secret stored under 'GOOGLE_API_KEY' in google.colab.userdata as an
# environment variable of this process.
# NOTE(review): any cell that reads os.environ["GOOGLE_API_KEY"] must run after this
# one; this cell appears last in the notebook, so a top-to-bottom run reaches it late.
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')