In [2]:
import pandas as pd
from openai import OpenAI
import os
import google.generativeai as genai
import json
from dotenv import load_dotenv

In [3]:
# Load the keys from .env file
load_dotenv()

# Access the keys
openai_key = os.getenv("OPENAI_KEY")
gemini_key = os.getenv("GEMINI_KEY")

In [4]:
openai_client = OpenAI(api_key=openai_key)
genai.configure(api_key=gemini_key)

In [5]:
QUESTION_GENERATION_SYSTEM_PROMPT = """
## **Role & Objective**
You are an expert AI researcher at a top Silicon Valley tech company. You need to interview **principal-level candidates** for roles in **AI, Data Science, or Machine Learning**. You will be given a specific topic in this domain.

---

## **Task**
- **Generate a list of 10-15 interview questions** that thoroughly test the candidate’s **depth, mathematical understanding, and nuanced thinking** on the given topic.

---

## **Content Requirements**
1. **Question Complexity:**  
   - Include a mix of **basic, intermediate, and advanced** questions.  

2. **Guidelines for Evaluation:**  
   - For each question, provide a short **"Guideline"** describing what a good answer should cover or how it should be evaluated.  

3. **Edge Cases:**  
   - Ensure that **corner cases** and potential **pitfalls** related to the topic are covered.  

4. **Practical Application:**  
   - Include at least one question that tests the candidate’s ability to handle **real-world challenges**, such as:  
     - **Messy data** handling  
     - **Scalability** issues  
     - **Deployment considerations**  

---

## **Final Output**
Make sure the final list of questions is:
- **Comprehensive** (covering all key aspects of the topic)
- **Well-structured** (clearly formatted for easy understanding)
- **Aligned with principal-level reasoning and expectations** """

ANSWER_RESPONSE_FORMAT = """#Senior-Level Interviewee Answer **

You are a **senior-level candidate** interviewing for a **Data Science / Machine Learning / Artificial Intelligence** role at a top tech company. You will be  asked a question on a specific topic—provided to you in the format below:

Example Question
```json
{
  "topic": "Learning Rate Scheduling",
  "question": "Can you explain the concept of learning rate scheduling and why it is important in training neural networks?",
  "response_guideline": "A good answer should define learning rate scheduling as a technique to adjust the learning rate during training to improve convergence, prevent oscillations, and escape local minima. The candidate should discuss its impact on training dynamics and overall optimization quality."
}
```

Your task is to respond with:

1. **Best Answer (Technical Detail)**  
   - Provide a **comprehensive explanation** of the topic.  
   - Incorporate **mathematical notations, formulas, or derivations** where appropriate.  
   - Make sure for equations you use latex in Markdown Formatting. Use the following syntax: $<equation>$ for inline symbols and equations and $$<equation>$$ for block equations.
   - Ensure you cover both **basic** and **advanced** aspects, demonstrating **senior-level** knowledge.  
   - Address **why** the concept is important, common **techniques** or **variations**, and any **real-world considerations** (e.g., implementation details or corner cases).

2. **How to Narrate**  
   - Offer a concise guide on **delivering** this answer verbally in an interview.  
   - Include **communication tips** to convey expertise clearly.  
   - Suggest how to walk the interviewer through **complex or mathematical sections** without overwhelming them.

---

## **Instructions for Formatting Your Answer**
- Use **Markdown** to format your response.  
- Make sure for equations in Markdown Formatting. Use the following syntax: $<equation>$ for inline equations/symbols and $$<equation>$$ for block equations.
- Within the **Best Answer** section, feel free to use headings, bullet points, or equations as needed.  
- For the **How to Narrate** section, provide a clear, step-by-step or bulleted format that a candidate could follow verbally.
- For inline equations, use single dollar signs, e.g.1, $y = mx + b$.  eg.2, $log(h_\theta(x))$

---

## **Tone & Style**
- Keep a **professional and confident** tone—appropriate for a senior-level candidate.  
- Present **mathematical content** in a way that is accessible but still shows **deep expertise**.  
- Demonstrate an ability to connect **theoretical** and **practical** insights.

---

## **Example Structure**
```markdown
## Question: {question}
**Best Answer**  
> *Detailed technical explanation with equations, references to research, real-world examples, etc.*

**How to Narrate**  
> *Step-by-step guidance on how to articulate this to an interviewer, including pacing, emphasis, and interaction tips.*
```
---

### **Output Requirement**
Please provide your response in **Markdown**. Begin with the question, then **"Best Answer"** heading, followed by the **detailed technical** content. Then have a **"How to Narrate"** heading with speaking guidelines.


---"""

In [6]:
QUESTION_GENERATION_RESPONSE_FORMAT = {
    "type": "json_schema",
    "json_schema": {
      "name": "questions_response",
      "strict": True,
      "schema": {
        "type": "object",
        "properties": {
          "questions": {
            "type": "array",
            "description": "An array of question objects.",
            "items": {
              "type": "object",
              "properties": {
                "question": {
                  "type": "string",
                  "description": "The question text."
                },
                "response_guideline": {
                  "type": "string",
                  "description": "Guidelines for responding to the question."
                }
              },
              "required": [
                "question",
                "response_guideline"
              ],
              "additionalProperties": False
            }
          }
        },
        "required": [
          "questions"
        ],
        "additionalProperties": False
      }
    }
  }

In [7]:
## Gemini Response Model

gemini_question_response_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}

gemini_question_response_model = genai.GenerativeModel(
  model_name="gemini-2.0-flash",
  generation_config=gemini_question_response_config,
  system_instruction=ANSWER_RESPONSE_FORMAT,
)

In [38]:
## Function to generate questions
## this function takes a ML topic as input and generates a list of questions and guideline on how to solve it as output


def generate_questions(topic):
    response = openai_client.chat.completions.create(
      model="gpt-4o-mini", ## o3-mini
      messages=[
        {
          "role": "system",
          "content": [
            {
              "type": "text",
              "text": QUESTION_GENERATION_SYSTEM_PROMPT
            }
          ]
        },
        {
          "role": "user",
          "content": [
            {
              "type": "text",
              "text": f"Topic: {topic}"
            }
          ]
        }
      ],
      response_format=QUESTION_GENERATION_RESPONSE_FORMAT,
      #temperature=0.8,
      max_completion_tokens=10000,
      top_p=1,
      frequency_penalty=0,
      presence_penalty=0
    )

    return json.loads(response.choices[0].message.content)

In [39]:
output_dir = '/workspaces/codespaces-jupyter/output/'
question_output_dir = output_dir + 'questions/'
classification_dir = output_dir + 'classification/'
regression_dir = output_dir + 'regression/'
clustering_dir = output_dir + 'clustering/'
nlp_dir = output_dir + 'nlp/'
time_series_dir = output_dir + 'time_series/'
optimisation_dir = output_dir + 'optimisation/'
neural_networks_dir = output_dir + 'neural_networks/'

classification_topics = [
    'Logistic Regression',
    'L1 and L2 Regularization',
    'Decision Trees',
    'Random Forest',
    'Gradient Boosting',
    'XGBoost',
    'Support Vector Machines',
    'Naive Bayes',
    'K-Nearest Neighbours',
    'Ensemble Learning',
    'Evaluation Metrics for Classification',
    'Imbalanced Data Handling',
    'Hyperparameter Tuning for Classification',
    'Feature Selection for Classification',
    'Model Evaluation and Selection for Classification',
    'Bayesian Knowledge Tracing',
    'Recommender Systems',
    'Matrix Factorization',
    'Collaborative Filtering',
    'IRT (Item Response Theory)'
]


In [40]:
for topic in classification_topics[8:]:
    print(f"Working for topic: {topic}")
    questions = generate_questions(topic)
    print(f"Generated  {len(questions['questions'])} questions for {topic}")
    topic_str  = topic.replace(' ', '_')
    json.dump( questions, open( f"{question_output_dir}{topic_str}.json", 'w' ) )


Working for topic: K-Nearest Neighbours
Generated  15 questions for K-Nearest Neighbours
Working for topic: Ensemble Learning
Generated  14 questions for Ensemble Learning
Working for topic: Evaluation Metrics for Classification
Generated  14 questions for Evaluation Metrics for Classification
Working for topic: Imbalanced Data Handling
Generated  14 questions for Imbalanced Data Handling
Working for topic: Hyperparameter Tuning for Classification
Generated  14 questions for Hyperparameter Tuning for Classification
Working for topic: Feature Selection for Classification
Generated  15 questions for Feature Selection for Classification
Working for topic: Model Evaluation and Selection for Classification
Generated  14 questions for Model Evaluation and Selection for Classification
Working for topic: Bayesian Knowledge Tracing
Generated  13 questions for Bayesian Knowledge Tracing
Working for topic: Recommender Systems
Generated  14 questions for Recommender Systems
Working for topic: Matr

In [23]:
#qs = json.load(open("/workspaces/codespaces-jupyter/output/questions/L1_and_L2_Regularization.json",'r'))

In [26]:
classification_topics[8]

'K-Nearest Neighbours'