In [1]:
import json
import os
import re

import google.generativeai as genai
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

from topic_categories import ml_ai_ds_topics

In [2]:
# Read the local .env file so its entries are available as environment variables.
load_dotenv()

# Fetch both provider keys in one pass; a key is None when its variable is not set.
openai_key, gemini_key = (
    os.environ.get(variable_name) for variable_name in ("OPENAI_KEY", "GEMINI_KEY")
)

In [3]:
# Build the OpenAI client that generate_questions() calls, using the key read from the environment.
openai_client = OpenAI(api_key=openai_key)
# Register the Gemini key with the google.generativeai module.
genai.configure(api_key=gemini_key)

In [4]:
# System prompt with the question count fixed at "10-15".
# generate_questions() in this notebook formats QUESTION_GENERATION_SYSTEM_PROMPT_TEMPLATE
# rather than this constant, so keep the two texts in sync if either one is edited.
QUESTION_GENERATION_SYSTEM_PROMPT = """
## **Role & Objective**
You are an expert AI researcher at a top Silicon Valley tech company. You need to interview **principal-level candidates** for roles in **AI, Data Science, or Machine Learning**. You will be given a specific topic in this domain.

---

## **Task**
- **Generate a list of 10-15 interview questions** that thoroughly test the candidate’s **depth, mathematical understanding, and nuanced thinking** on the given topic.

---

## **Content Requirements**
1. **Question Complexity:**  
   - Include a mix of **basic, intermediate, and advanced** questions.  

2. **Guidelines for Evaluation:**  
   - For each question, provide a short **"Guideline"** describing what a good answer should cover or how it should be evaluated.  

3. **Edge Cases:**  
   - Ensure that **corner cases** and potential **pitfalls** related to the topic are covered.  

4. **Practical Application:**  
   - Include at least one question that tests the candidate’s ability to handle **real-world challenges**, such as:  
     - **Messy data** handling  
     - **Scalability** issues  
     - **Deployment considerations**  

---

## **Final Output**
Make sure the final list of questions is:
- **Comprehensive** (covering all key aspects of the topic)
- **Well-structured** (clearly formatted for easy understanding)
- **Aligned with principal-level reasoning and expectations** """


In [5]:
# Parameterised system prompt: generate_questions() fills {num_question_string}
# via str.format() with values such as "10-15" or "4-8".
# Because str.format() is used, any literal braces added to this text later
# must be doubled ({{ and }}).
QUESTION_GENERATION_SYSTEM_PROMPT_TEMPLATE = """
## **Role & Objective**
You are an expert AI researcher at a top Silicon Valley tech company. You need to interview **principal-level candidates** for roles in **AI, Data Science, or Machine Learning**. You will be given a specific topic in this domain.

---

## **Task**
- **Generate a list of {num_question_string} interview questions** that thoroughly test the candidate’s **depth, mathematical understanding, and nuanced thinking** on the given topic.

---

## **Content Requirements**
1. **Question Complexity:**  
   - Include a mix of **basic, intermediate, and advanced** questions.  

2. **Guidelines for Evaluation:**  
   - For each question, provide a short **"Guideline"** describing what a good answer should cover or how it should be evaluated.  

3. **Edge Cases:**  
   - Ensure that **corner cases** and potential **pitfalls** related to the topic are covered.  

4. **Practical Application:**  
   - Include at least one question that tests the candidate’s ability to handle **real-world challenges**, such as:  
     - **Messy data** handling  
     - **Scalability** issues  
     - **Deployment considerations**  

---

## **Final Output**
Make sure the final list of questions is:
- **Comprehensive** (covering all key aspects of the topic)
- **Well-structured** (clearly formatted for easy understanding)
- **Aligned with principal-level reasoning and expectations** """


In [6]:
#print(QUESTION_GENERATION_SYSTEM_PROMPT_TEMPLATE.format(num_question_string="4-8"))

In [7]:
# Structured-output schema handed to the chat completion call as `response_format`.
# It describes an object holding a "questions" array whose items are
# {"question", "response_guideline"} string pairs, with no additional keys allowed.
QUESTION_GENERATION_RESPONSE_FORMAT = {
    "type": "json_schema",
    "json_schema": {
        "name": "questions_response",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "questions": {
                    "type": "array",
                    "description": "An array of question objects.",
                    "items": {
                        "type": "object",
                        "properties": {
                            "question": {
                                "type": "string",
                                "description": "The question text.",
                            },
                            "response_guideline": {
                                "type": "string",
                                "description": "Guidelines for responding to the question.",
                            },
                        },
                        "required": ["question", "response_guideline"],
                        "additionalProperties": False,
                    },
                },
            },
            "required": ["questions"],
            "additionalProperties": False,
        },
    },
}

In [8]:
## Function to generate questions.
## It takes an ML topic as input and returns a list of questions, each with a guideline describing what a good answer should cover.


def generate_questions(topic, num_questions='10-15'):
    """Ask the OpenAI model for interview questions (with answer guidelines) on a topic.

    Args:
        topic: Topic name; sent to the model as the user message ``"Topic: <topic>"``.
        num_questions: Text substituted for ``{num_question_string}`` in the system
            prompt template, e.g. ``'10-15'``, ``'4-8'`` or ``'1 or 2'``.

    Returns:
        The model reply parsed from JSON. The schema requested via
        ``QUESTION_GENERATION_RESPONSE_FORMAT`` is
        ``{"questions": [{"question": str, "response_guideline": str}, ...]}``.

    Raises:
        ValueError: If the API returned no choices, the model refused, the reply
            was cut off by the completion-token limit, the reply was empty, or
            the reply text is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    system_prompt = QUESTION_GENERATION_SYSTEM_PROMPT_TEMPLATE.format(
        num_question_string=num_questions
    )
    response = openai_client.chat.completions.create(
        model="o3-mini",
        messages=[
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": [{"type": "text", "text": f"Topic: {topic}"}],
            },
        ],
        response_format=QUESTION_GENERATION_RESPONSE_FORMAT,
        # For reasoning models this budget is shared between hidden reasoning tokens
        # and the visible reply, so a reply can come back truncated or empty.
        max_completion_tokens=10000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    if not response.choices:
        raise ValueError(f"No completion choices returned for topic {topic!r}")

    choice = response.choices[0]
    message = choice.message

    # Surface the failure modes explicitly instead of letting json.loads fail
    # with an opaque TypeError / JSONDecodeError.
    refusal = getattr(message, "refusal", None)
    if refusal:
        raise ValueError(f"Model refused to generate questions for topic {topic!r}: {refusal}")
    if getattr(choice, "finish_reason", None) == "length":
        raise ValueError(
            f"Reply for topic {topic!r} was truncated by max_completion_tokens; "
            "the JSON is incomplete"
        )
    if not message.content:
        raise ValueError(f"Model returned an empty reply for topic {topic!r}")

    return json.loads(message.content)

In [9]:
#generate_questions("AutoEncoder", num_questions='1 or 2')

In [15]:
# Root folder for everything this notebook writes. Set the AURORA_OUTPUT_DIR
# environment variable to point it elsewhere (e.g. '/workspaces/codespaces-jupyter/output/')
# instead of editing this cell; when unset or empty, the original studio path is used.
output_dir = (
    os.environ.get('AURORA_OUTPUT_DIR')
    or '/teamspace/studios/this_studio/gitrepo/aurora/Aurora/output/'
)
# Sub-folder paths below are built by plain string concatenation, so the root must end with '/'.
if not output_dir.endswith('/'):
    output_dir += '/'

question_output_dir = output_dir + 'questions/'
classification_dir = output_dir + 'classification/'
regression_dir = output_dir + 'regression/'
clustering_dir = output_dir + 'clustering/'
nlp_dir = output_dir + 'nlp/'
time_series_dir = output_dir + 'time_series/'
optimisation_dir = output_dir + 'optimisation/'
neural_networks_dir = output_dir + 'neural_networks/'
transformer_networks_dir = output_dir + 'transformer_networks/'
ml_ops_dir = output_dir + 'ml_ops/'
ml_ops = ml_ops_dir  # original name (without the _dir suffix), kept so existing references still work
auto_var_gan_enc_dir = output_dir + 'auto_var_gan_enc/'

# --- Topic lists: each entry is passed to generate_questions() as the topic text ---

# Classification (also holds recommender-system and knowledge-tracing topics).
classification_topics = [
    "Logistic Regression",
    "L1 and L2 Regularization",
    "Decision Trees",
    "Random Forest",
    "Gradient Boosting",
    "XGBoost",
    "Support Vector Machines",
    "Naive Bayes",
    "K-Nearest Neighbours",
    "Ensemble Learning",
    "Evaluation Metrics for Classification",
    "Imbalanced Data Handling",
    "Hyperparameter Tuning for Classification",
    "Feature Selection for Classification",
    "Model Evaluation and Selection for Classification",
    "Bayesian Knowledge Tracing",
    "Recommender Systems",
    "Matrix Factorization",
    "Collaborative Filtering",
    "IRT (Item Response Theory)",
]

# Transformer architecture and its ecosystem.
transformer_topics = [
    "Historical context and evolution of the Transformer architecture",
    "Key differences between RNN, CNN-based models and Transformers",
    "Encoder-Decoder structure in Transformers",
    "Attention mechanism (Self-Attention, Multi-Head Attention)",
    "Positional encodings and why they are needed",
    "Training dynamics (masking, batch sizes, learning rates)",
    "Scaling laws and model sizes",
    "Popular Transformer variants (BERT, GPT, T5, XLNet, etc.)",
    "Pretraining objectives (Masked LM, Next Sentence Prediction, etc.)",
    "Transfer learning and fine-tuning strategies",
    "Handling long sequences (Longformer, Big Bird, etc.)",
    "Efficient Transformers (memory and computational optimizations)",
    "Practical considerations (tokenization, hardware acceleration, libraries)",
    "Prompt engineering and in-context learning",
]

# Clustering.
clustering_topics = [
    "K-Means Clustering",
    "Hierarchical Clustering",
    "DBSCAN",
    "Gaussian Mixture Models (GMM)",
    "HDBSCAN",
    "Cluster Evaluation Metrics (Silhouette Score, etc.)",
    "Agglomerative Clustering",
    "Mean-Shift Clustering",
]

# Regression.
regression_topics = [
    "Linear Regression",
    "Polynomial Regression",
    "Ridge Regression",
    "Lasso Regression",
    "Elastic Net Regression",
    "Support Vector Regression",
    "Decision Tree Regression",
    "Random Forest Regression",
    "Gradient Boosting Regression",
    "XGBoost Regression",
    "Evaluation Metrics for Regression",
    "Hyperparameter Tuning for Regression",
    "Feature Selection for Regression",
    "Model Evaluation and Selection for Regression",
]

# Optimisation algorithms.
# Note: 'Hyperparameter Tuning for Optimisation' is not a separate entry (it was left commented out).
optimisation_topics = [
    "Gradient Descent",
    "Stochastic Gradient Descent",
    "Mini-Batch Gradient Descent",
    "Momentum",
    "Nesterov Accelerated Gradient",
    "Adagrad",
    "RMSprop",
    "Adam, AdaMax, AdamW",
    "Learning Rate Scheduling and Hyperparameter Tuning for Optimisation",
]



In [16]:
# import re 
# for topic in classification_topics[5:9]:
#     base_topic = 'classification'
#     print(f"Working for topic: {topic}")
#     questions = generate_questions(topic)
#     print(f"Generated  {len(questions['questions'])} questions for {topic}")
#     topic_str = re.sub(r'[^a-zA-Z0-9]', '_', topic)
#     output_json_filename =  f"{question_output_dir}{base_topic}_{topic_str}.json"
#     print(f"Dumping to {output_json_filename}")
#     json.dump( questions, open( output_json_filename, 'w' ) , indent=4)


In [17]:
# import re 
# for topic in clustering_topics:
#     base_topic = 'clustering'
#     print(f"Working for topic: {topic}")
#     questions = generate_questions(topic)
#     print(f"Generated  {len(questions['questions'])} questions for {topic}")
#     topic_str = re.sub(r'[^a-zA-Z0-9]', '_', topic)
#     output_json_filename =  f"{question_output_dir}{base_topic}_{topic_str}.json"
#     print(f"Dumping to {output_json_filename}")
#     json.dump( questions, open( output_json_filename, 'w' ) , indent=4)


In [18]:
# Look up the MLOps / model-deployment topics in the imported ml_ai_ds_topics mapping.
ml_ops_topics = ml_ai_ds_topics['mlops_and_model_deployment']
# Bare name as the last expression: shows the list as the cell output.
ml_ops_topics

['ML Pipelines (Airflow, Kubeflow)',
 'Containerization (Docker) and Orchestration (Kubernetes)',
 'Cloud Platforms (AWS, GCP, Azure) for ML',
 'Continuous Integration/Continuous Deployment (CI/CD) in ML',
 'Model Serving (Flask, FastAPI, TensorFlow Serving, TorchServe)',
 'Monitoring and Logging for Deployed Models',
 'A/B Testing, Canary Deployments',
 'Feature Stores & Data Serving',
 'Model Versioning and Governance',
 'Tools & Frameworks in the MLOps Ecosystem',
 'Model Performance Metrics in Production',
 'Explainability & Interpretability in Production',
 'Model Monitoring & Drift Detection']

In [19]:
# Autoencoder / VAE / GAN topics from the imported ml_ai_ds_topics mapping; the next cell iterates over them.
auto_var_gan_topic = ml_ai_ds_topics['autoencoder_var_autoencoder_gan_topics']

In [20]:
# Generate 4-8 questions for every autoencoder/VAE/GAN topic and store one JSON file per topic.
base_topic = 'auto_var_gan_enc'  # file-name prefix shared by every topic in this cell

# Make sure the target folder exists before spending API calls on generation.
os.makedirs(question_output_dir, exist_ok=True)

for topic in auto_var_gan_topic:
    print(f"Working for topic: {topic}")
    questions = generate_questions(topic, num_questions='4-8')
    print(f"Generated  {len(questions['questions'])} questions for {topic}")
    # Replace every non-alphanumeric character so the topic is safe to use in a file name.
    topic_str = re.sub(r'[^a-zA-Z0-9]', '_', topic)
    output_json_filename = f"{question_output_dir}{base_topic}_{topic_str}.json"
    print(f"Dumping to {output_json_filename}")
    # The context manager flushes and closes the file even if json.dump raises.
    with open(output_json_filename, 'w') as output_file:
        json.dump(questions, output_file, indent=4)


Working for topic: Autoencoders


Generated  8 questions for Autoencoders
Dumping to /teamspace/studios/this_studio/gitrepo/aurora/Aurora/output/questions/auto_var_gan_enc_Autoencoders.json
Working for topic: Variational Autoencoders (VAEs)
Generated  6 questions for Variational Autoencoders (VAEs)
Dumping to /teamspace/studios/this_studio/gitrepo/aurora/Aurora/output/questions/auto_var_gan_enc_Variational_Autoencoders__VAEs_.json
Working for topic: Generative Adversarial Networks (GANs)
Generated  6 questions for Generative Adversarial Networks (GANs)
Dumping to /teamspace/studios/this_studio/gitrepo/aurora/Aurora/output/questions/auto_var_gan_enc_Generative_Adversarial_Networks__GANs_.json
Working for topic: Conditional GANs
Generated  6 questions for Conditional GANs
Dumping to /teamspace/studios/this_studio/gitrepo/aurora/Aurora/output/questions/auto_var_gan_enc_Conditional_GANs.json
Working for topic: CycleGAN, StyleGAN, etc.
Generated  7 questions for CycleGAN, StyleGAN, etc.
Dumping to /teamspace/studios/this_st

In [31]:
# Generate 4-8 questions for every optimisation topic and store one JSON file per topic.
base_topic = 'optimisation'  # file-name prefix shared by every topic in this cell

# Make sure the target folder exists before spending API calls on generation.
os.makedirs(question_output_dir, exist_ok=True)

for topic in optimisation_topics:
    print(f"Working for topic: {topic}")
    questions = generate_questions(topic, num_questions='4-8')
    print(f"Generated  {len(questions['questions'])} questions for {topic}")
    # Replace every non-alphanumeric character so the topic is safe to use in a file name.
    topic_str = re.sub(r'[^a-zA-Z0-9]', '_', topic)
    output_json_filename = f"{question_output_dir}{base_topic}_{topic_str}.json"
    print(f"Dumping to {output_json_filename}")
    # The context manager flushes and closes the file even if json.dump raises.
    with open(output_json_filename, 'w') as output_file:
        json.dump(questions, output_file, indent=4)


Working for topic: Gradient Descent
Generated  6 questions for Gradient Descent
Dumping to /workspaces/codespaces-jupyter/output/questions/optimisation_Gradient_Descent.json
Working for topic: Stochastic Gradient Descent
Generated  6 questions for Stochastic Gradient Descent
Dumping to /workspaces/codespaces-jupyter/output/questions/optimisation_Stochastic_Gradient_Descent.json
Working for topic: Mini-Batch Gradient Descent
Generated  5 questions for Mini-Batch Gradient Descent
Dumping to /workspaces/codespaces-jupyter/output/questions/optimisation_Mini_Batch_Gradient_Descent.json
Working for topic: Momentum
Generated  5 questions for Momentum
Dumping to /workspaces/codespaces-jupyter/output/questions/optimisation_Momentum.json
Working for topic: Nesterov Accelerated Gradient
Generated  6 questions for Nesterov Accelerated Gradient
Dumping to /workspaces/codespaces-jupyter/output/questions/optimisation_Nesterov_Accelerated_Gradient.json
Working for topic: Adagrad
Generated  6 questions