In [13]:
import json
import os
import re

import google.generativeai as genai
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

In [14]:
# Read the .env file so its entries become available as environment variables.
load_dotenv()

# Look up both API keys; a key is None when its variable is not set.
openai_key = os.environ.get("OPENAI_KEY")
gemini_key = os.environ.get("GEMINI_KEY")

In [15]:
# OpenAI client used by generate_questions for chat-completion calls,
# authenticated with the key read from the environment.
openai_client = OpenAI(api_key=openai_key)
# Register the Gemini key with the google.generativeai module
# (no Gemini call appears in the cells shown here).
genai.configure(api_key=gemini_key)

In [16]:
# System prompt sent by generate_questions as the "system" message.
# It asks the model to act as an interviewer for principal-level AI / Data
# Science / ML candidates and to produce 10-15 questions on the given topic,
# each accompanied by a short evaluation guideline.
# The text is Markdown; the trailing double spaces are part of the string.
QUESTION_GENERATION_SYSTEM_PROMPT = """
## **Role & Objective**
You are an expert AI researcher at a top Silicon Valley tech company. You need to interview **principal-level candidates** for roles in **AI, Data Science, or Machine Learning**. You will be given a specific topic in this domain.

---

## **Task**
- **Generate a list of 10-15 interview questions** that thoroughly test the candidate’s **depth, mathematical understanding, and nuanced thinking** on the given topic.

---

## **Content Requirements**
1. **Question Complexity:**  
   - Include a mix of **basic, intermediate, and advanced** questions.  

2. **Guidelines for Evaluation:**  
   - For each question, provide a short **"Guideline"** describing what a good answer should cover or how it should be evaluated.  

3. **Edge Cases:**  
   - Ensure that **corner cases** and potential **pitfalls** related to the topic are covered.  

4. **Practical Application:**  
   - Include at least one question that tests the candidate’s ability to handle **real-world challenges**, such as:  
     - **Messy data** handling  
     - **Scalability** issues  
     - **Deployment considerations**  

---

## **Final Output**
Make sure the final list of questions is:
- **Comprehensive** (covering all key aspects of the topic)
- **Well-structured** (clearly formatted for easy understanding)
- **Aligned with principal-level reasoning and expectations** """


In [17]:
# Structured-output specification passed as `response_format` by
# generate_questions. It requests a single JSON object holding a "questions"
# array whose items each carry a question and its response guideline.
QUESTION_GENERATION_RESPONSE_FORMAT = {
    "type": "json_schema",
    "json_schema": {
        "name": "questions_response",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "questions": {
                    "type": "array",
                    "description": "An array of question objects.",
                    # Shape of one entry in the "questions" array.
                    "items": {
                        "type": "object",
                        "properties": {
                            "question": {
                                "type": "string",
                                "description": "The question text.",
                            },
                            "response_guideline": {
                                "type": "string",
                                "description": "Guidelines for responding to the question.",
                            },
                        },
                        "required": ["question", "response_guideline"],
                        "additionalProperties": False,
                    },
                },
            },
            "required": ["questions"],
            "additionalProperties": False,
        },
    },
}

In [18]:
## Function to generate interview questions.
## It takes an ML topic as input and returns a list of questions, each with a guideline describing what a good answer should cover.


def generate_questions(topic, model="o3-mini", max_completion_tokens=10000):
    """Generate interview questions (with answer guidelines) for one topic.

    Sends QUESTION_GENERATION_SYSTEM_PROMPT as the system message and
    ``Topic: <topic>`` as the user message through ``openai_client``, asking
    for output shaped by QUESTION_GENERATION_RESPONSE_FORMAT.

    Args:
        topic: Topic name interpolated into the user message.
        model: Name of the chat model to call. Defaults to "o3-mini".
        max_completion_tokens: Upper bound on generated tokens passed to the
            API. Defaults to 10000.

    Returns:
        The message content parsed with ``json.loads``. Given the requested
        response format this is expected to be a dict of the form
        ``{"questions": [{"question": ..., "response_guideline": ...}, ...]}``.

    Raises:
        ValueError: If the completion stopped at the token limit, the model
            refused, the content is empty, or the content is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": QUESTION_GENERATION_SYSTEM_PROMPT,
                    }
                ],
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"Topic: {topic}",
                    }
                ],
            },
        ],
        response_format=QUESTION_GENERATION_RESPONSE_FORMAT,
        # temperature is left at the API default (the 0.8 override was disabled).
        max_completion_tokens=max_completion_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    choice = response.choices[0]
    message = choice.message

    # A completion cut off at the token limit carries truncated (or empty)
    # JSON; report that directly instead of an opaque JSON parsing error.
    if choice.finish_reason == "length":
        raise ValueError(
            f"Response for topic {topic!r} was truncated at "
            f"max_completion_tokens={max_completion_tokens}; raise the limit and retry."
        )

    # With structured outputs a refusal is reported separately from content.
    refusal = getattr(message, "refusal", None)
    if refusal:
        raise ValueError(f"Model refused to answer for topic {topic!r}: {refusal}")

    if not message.content:
        raise ValueError(f"Model returned no content for topic {topic!r}.")

    return json.loads(message.content)

In [20]:
# Root folder for generated artefacts. Every path below keeps its trailing
# slash so that a file name can be appended directly.
output_dir = '/workspaces/codespaces-jupyter/output/'

# One sub-folder for the generated question files ...
question_output_dir = f"{output_dir}questions/"

# ... and one per topic family.
classification_dir = f"{output_dir}classification/"
regression_dir = f"{output_dir}regression/"
clustering_dir = f"{output_dir}clustering/"
nlp_dir = f"{output_dir}nlp/"
time_series_dir = f"{output_dir}time_series/"
optimisation_dir = f"{output_dir}optimisation/"
neural_networks_dir = f"{output_dir}neural_networks/"
transformer_networks_dir = f"{output_dir}transformer_networks/"

# Topic names for the classification family (the list also holds
# recommender-system and student-modelling topics).
classification_topics = [
    "Logistic Regression",
    "L1 and L2 Regularization",
    "Decision Trees",
    "Random Forest",
    "Gradient Boosting",
    "XGBoost",
    "Support Vector Machines",
    "Naive Bayes",
    "K-Nearest Neighbours",
    "Ensemble Learning",
    "Evaluation Metrics for Classification",
    "Imbalanced Data Handling",
    "Hyperparameter Tuning for Classification",
    "Feature Selection for Classification",
    "Model Evaluation and Selection for Classification",
    "Bayesian Knowledge Tracing",
    "Recommender Systems",
    "Matrix Factorization",
    "Collaborative Filtering",
    "IRT (Item Response Theory)",
]

# Topic names for the Transformer family.
transformer_topics = [
    "Historical context and evolution of the Transformer architecture",
    "Key differences between RNN, CNN-based models and Transformers",
    "Encoder-Decoder structure in Transformers",
    "Attention mechanism (Self-Attention, Multi-Head Attention)",
    "Positional encodings and why they are needed",
    "Training dynamics (masking, batch sizes, learning rates)",
    "Scaling laws and model sizes",
    "Popular Transformer variants (BERT, GPT, T5, XLNet, etc.)",
    "Pretraining objectives (Masked LM, Next Sentence Prediction, etc.)",
    "Transfer learning and fine-tuning strategies",
    "Handling long sequences (Longformer, Big Bird, etc.)",
    "Efficient Transformers (memory and computational optimizations)",
    "Practical considerations (tokenization, hardware acceleration, libraries)",
    "Prompt engineering and in-context learning",
]

# Topic names for the clustering family.
clustering_topics = [
    "K-Means Clustering",
    "Hierarchical Clustering",
    "DBSCAN",
    "Gaussian Mixture Models (GMM)",
    "HDBSCAN",
    "Cluster Evaluation Metrics (Silhouette Score, etc.)",
    "Agglomerative Clustering",
    "Mean-Shift Clustering",
]


In [21]:
# Generate questions for every clustering topic and write one JSON file per
# topic into question_output_dir.
base_topic = 'clustering'

# open(..., 'w') does not create missing folders, so make sure the target
# directory exists before the first (paid) API call is made.
os.makedirs(question_output_dir, exist_ok=True)

for topic in clustering_topics:
    print(f"Working for topic: {topic}")
    questions = generate_questions(topic)
    print(f"Generated  {len(questions['questions'])} questions for {topic}")

    # Replace every non-alphanumeric character so the topic can be used as
    # part of a file name.
    topic_str = re.sub(r'[^a-zA-Z0-9]', '_', topic)
    output_json_filename = f"{question_output_dir}{base_topic}_{topic_str}.json"
    print(f"Dumping to {output_json_filename}")

    # The context manager flushes and closes the file even if json.dump raises.
    with open(output_json_filename, 'w') as output_file:
        json.dump(questions, output_file, indent=4)


Working for topic: K-Means Clustering
Generated  13 questions for K-Means Clustering
Dumping to /workspaces/codespaces-jupyter/output/questions/clustering_K_Means_Clustering.json
Working for topic: Hierarchical Clustering
Generated  13 questions for Hierarchical Clustering
Dumping to /workspaces/codespaces-jupyter/output/questions/clustering_Hierarchical_Clustering.json
Working for topic: DBSCAN
Generated  14 questions for DBSCAN
Dumping to /workspaces/codespaces-jupyter/output/questions/clustering_DBSCAN.json
Working for topic: Gaussian Mixture Models (GMM)
Generated  12 questions for Gaussian Mixture Models (GMM)
Dumping to /workspaces/codespaces-jupyter/output/questions/clustering_Gaussian_Mixture_Models__GMM_.json
Working for topic: HDBSCAN
Generated  12 questions for HDBSCAN
Dumping to /workspaces/codespaces-jupyter/output/questions/clustering_HDBSCAN.json
Working for topic: Cluster Evaluation Metrics (Silhouette Score, etc.)
Generated  12 questions for Cluster Evaluation Metrics 

In [23]:
#qs = json.load(open("/workspaces/codespaces-jupyter/output/questions/L1_and_L2_Regularization.json",'r'))

In [17]:
#classification_topics[8]