In [1]:
import pandas as pd
from openai import OpenAI
import os
import google.generativeai as genai
import json
from dotenv import load_dotenv
from topic_categories import ml_ai_ds_topics

In [2]:
# Read the .env file so its entries become visible as environment variables
load_dotenv()

# Provider credentials; each is None when its variable is not set
gemini_key = os.environ.get("GEMINI_KEY")
openai_key = os.environ.get("OPENAI_KEY")

In [4]:
# Hand the Gemini key to the genai module and build the OpenAI client with its own key
genai.configure(api_key=gemini_key)
openai_client = OpenAI(api_key=openai_key)

In [5]:
# System prompt passed to the Gemini answer model as its system instruction.
# It describes the JSON question payload the model receives and the Markdown
# structure ("Best Answer" followed by "How to Narrate") it must reply with.
#
# NOTE: this is a regular (non-raw) string, so LaTeX backslashes must be
# doubled. A bare "\theta" would be parsed as TAB + "heta" and the model
# would be shown a corrupted example equation.
ANSWER_RESPONSE_FORMAT = """# **Senior-Level Interviewee Answer**

You are a **senior-level candidate** interviewing for a **Data Science / Machine Learning / Artificial Intelligence** role at a top tech company. You will be  asked a question on a specific topic—provided to you in the format below:

Example Question
```json
{
  "topic": "Learning Rate Scheduling",
  "question": "Can you explain the concept of learning rate scheduling and why it is important in training neural networks?",
  "response_guideline": "A good answer should define learning rate scheduling as a technique to adjust the learning rate during training to improve convergence, prevent oscillations, and escape local minima. The candidate should discuss its impact on training dynamics and overall optimization quality."
}
```

Your task is to respond with:

1. **Best Answer (Technical Detail)**  
   - Provide a **comprehensive explanation** of the topic.  
   - Incorporate **mathematical notations, formulas, or derivations** where appropriate.  
   - Make sure for equations you use latex in Markdown Formatting. Use the following syntax: $<equation>$ for inline symbols and equations and $$<equation>$$ for block equations.
   - Ensure you cover both **basic** and **advanced** aspects, demonstrating **senior-level** knowledge.  
   - Address **why** the concept is important, common **techniques** or **variations**, and any **real-world considerations** (e.g., implementation details or corner cases).

2. **How to Narrate**  
   - Offer a concise guide on **delivering** this answer verbally in an interview.  
   - Include **communication tips** to convey expertise clearly.  
   - Suggest how to walk the interviewer through **complex or mathematical sections** without overwhelming them.

---

## **Instructions for Formatting Your Answer**
- Use **Markdown** to format your response.  
- Make sure for equations in Markdown Formatting. Use the following syntax: $<equation>$ for inline equations/symbols and $$<equation>$$ for block equations.
- Within the **Best Answer** section, feel free to use headings, bullet points, or equations as needed.  
- For the **How to Narrate** section, provide a clear, step-by-step or bulleted format that a candidate could follow verbally.
- For inline equations, use single dollar signs, e.g.1, $y = mx + b$.  eg.2, $log(h_\\theta(x))$

---

## **Tone & Style**
- Keep a **professional and confident** tone—appropriate for a senior-level candidate.  
- Present **mathematical content** in a way that is accessible but still shows **deep expertise**.  
- Demonstrate an ability to connect **theoretical** and **practical** insights.

---

## **Example Response Structure**
## Question: text of the question
**Best Answer**  
> *Detailed technical explanation with equations, references to research, real-world examples, etc.*

---
**How to Narrate**  
> *Step-by-step guidance on how to articulate this to an interviewer, including pacing, emphasis, and interaction tips.*

---

### **Output Requirement**
Please provide your response in **Markdown**. Begin with the question, then **"Best Answer"** heading, followed by the **detailed technical** content. Then have a **"How to Narrate"** heading with speaking guidelines.
"""

In [6]:
## Gemini Response Model

# Generation settings attached to the answer model below
gemini_question_response_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=40,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# Answer model: uses the settings above and ANSWER_RESPONSE_FORMAT as its system instruction
gemini_question_response_model = genai.GenerativeModel(
    system_instruction=ANSWER_RESPONSE_FORMAT,
    generation_config=gemini_question_response_config,
    model_name="gemini-2.0-flash",
)

def generate_gemini_response(question_obj: dict, topic: str = 'Machine Learning'):
    """Send one interview question to the Gemini answer model.

    The question is serialised to JSON together with its topic, echoed to
    stdout, and sent as the only message of a freshly started chat session.

    Args:
        question_obj: JSON-serialisable mapping describing the question.
            It is left unmodified; the topic is added to a copy.
        topic: Value stored under the ``"topic"`` key of the payload. It
            replaces any ``"topic"`` already present in ``question_obj``.

    Returns:
        Whatever ``send_message`` returns for the new chat session.
    """
    # Build a new dict rather than assigning into the argument, so the
    # caller's question object does not silently gain a "topic" key.
    payload = {**question_obj, 'topic': topic}
    question_format = json.dumps(payload)
    print(question_format)
    chat_session = gemini_question_response_model.start_chat()
    return chat_session.send_message(question_format)

In [7]:
# Root of all notebook output, and the Quarto content tree beneath it
output_dir_base = '/workspaces/codespaces-jupyter/output/'
output_dir = f'{output_dir_base}quarto_content/'

# Question files sit directly under the output root
question_output_dir = f'{output_dir_base}questions/'

# One folder per topic group inside the Quarto content tree
classification_dir = f'{output_dir}classification/'
regression_dir = f'{output_dir}regression/'
clustering_dir = f'{output_dir}clustering/'
nlp_dir = f'{output_dir}nlp/'
time_series_dir = f'{output_dir}time_series/'
optimisation_dir = f'{output_dir}optimisation/'
neural_networks_dir = f'{output_dir}neural_networks/'
transformer_networks_dir = f'{output_dir}transformer_networks/'
ml_ops = f'{output_dir}ml_ops/'

In [8]:

# classification_topics = [
#     'Logistic Regression',
#     'L1 and L2 Regularization',
#     'Decision Trees',
#     'Random Forest',
#     'Gradient Boosting',
#     'XGBoost',
#     'Support Vector Machines',
#     'Naive Bayes',
#     'K-Nearest Neighbours',
#     'Ensemble Learning',
#     'Evaluation Metrics for Classification',
#     'Imbalanced Data Handling',
#     'Hyperparameter Tuning for Classification',
#     'Feature Selection for Classification',
#     'Model Evaluation and Selection for Classification',
#     'Bayesian Knowledge Tracing',
#     'Recommender Systems',
#     'Matrix Factorization',
#     'Collaborative Filtering',
#     'IRT (Item Response Theory)'
# ]


In [9]:
# for topic in classification_topics[5:9]:
#     print(f"Working on topic: {topic}")
#     topic_str  = topic.replace(' ', '_')
#     generated_questions = json.load(open(f"{question_output_dir}{topic_str}.json",'r'))
#     print(f"Topic {topic} has {len(generated_questions['questions'])} questions: ")
#     for index, cur_question in enumerate(generated_questions['questions']):
#         print(f" Working on question {cur_question['question']}")
#         #response = f" Got index {index} Question : {cur_question['question']}"
#         response = generate_gemini_response(cur_question, topic=topic)
#         print("Done generating response for question")
#         print("---------------------")
#         os.makedirs(classification_dir + f'{topic_str}/', exist_ok=True)
#         output_file_name = classification_dir + f'{topic_str}/{topic_str}_{index}.qmd'
#         print(output_file_name)
#         with open(output_file_name, 'w') as f:
#             f.write(response.text)



### Working on transformers now

In [9]:
# import re 
# transformer_topics = [
#     "Historical context and evolution of the Transformer architecture",
#     "Key differences between RNN, CNN-based models and Transformers",
#     "Encoder-Decoder structure in Transformers",
#     "Attention mechanism (Self-Attention, Multi-Head Attention)",
#     "Positional encodings and why they are needed",
#     "Training dynamics (masking, batch sizes, learning rates)",
#     "Scaling laws and model sizes",
#     "Popular Transformer variants (BERT, GPT, T5, XLNet, etc.)",
#     "Pretraining objectives (Masked LM, Next Sentence Prediction, etc.)",
#     "Transfer learning and fine-tuning strategies",
#     "Handling long sequences (Longformer, Big Bird, etc.)",
#     "Efficient Transformers (memory and computational optimizations)",
#     "Practical considerations (tokenization, hardware acceleration, libraries)",
#     "Prompt engineering and in-context learning"
# ]


In [10]:
# import re 
# for topic in transformer_topics:
#     base_topic = 'transformers'
#     topic_str = re.sub(r'[^a-zA-Z0-9]', '_', topic)
#     print(f"Working for topic: {base_topic}_{topic}")
#     output_json_filename =  f"{question_output_dir}{base_topic}_{topic_str}.json"
#     generated_questions = json.load(open(output_json_filename,'r'))
#     print(f"Topic {topic} has {len(generated_questions['questions'])} questions: ")
#     for index, cur_question in enumerate(generated_questions['questions']):
#         print(f" Working on question {cur_question['question']}")
#         #response = f" Got index {index} Question : {cur_question['question']}"
#         response = generate_gemini_response(cur_question, topic=topic)
#         print("Done generating response for question")
#         print("---------------------")
#         os.makedirs(transformer_networks_dir + f'{topic_str}/', exist_ok=True)
#         output_file_name = transformer_networks_dir + f'{topic_str}/{topic_str}_{index}.qmd'
#         print(output_file_name)
#         with open(output_file_name, 'w') as f:
#             f.write(response.text)

    

    


In [20]:
# Choose the clustering topic group and the folder its answers are written to
base_topic = 'clustering'
output_directory = clustering_dir
topic_categories = ml_ai_ds_topics[base_topic + "_topics"]
#topic_categories

In [22]:
import re

# For each selected topic: load its question file, ask Gemini to answer every
# question, and write each answer to <output_directory>/<topic>/<topic>_<index>.qmd.
# NOTE(review): the [1:] slice skips the first entry of topic_categories —
# presumably it was already generated in an earlier run; confirm before re-running.
for topic in topic_categories[1:]:
    # File-system friendly topic name: every non-alphanumeric character becomes '_'
    topic_str = re.sub(r'[^a-zA-Z0-9]', '_', topic)
    print(f"Working for topic: {base_topic}_{topic}")

    # Read the question file through a context manager so the handle is closed
    output_json_filename =  f"{question_output_dir}{base_topic}_{topic_str}.json"
    with open(output_json_filename, 'r', encoding='utf-8') as questions_file:
        generated_questions = json.load(questions_file)
    print(f"Topic {topic} has {len(generated_questions['questions'])} questions: ")

    # The per-topic folder does not depend on the question, so create it once
    topic_output_dir = os.path.join(output_directory, topic_str)
    os.makedirs(topic_output_dir, exist_ok=True)

    for index, cur_question in enumerate(generated_questions['questions']):
        print(f" Working on question {cur_question['question']}")
        response = generate_gemini_response(cur_question, topic=topic)
        print("Done generating response for question")
        print("---------------------")
        output_file_name = os.path.join(topic_output_dir, f'{topic_str}_{index}.qmd')
        print(output_file_name)
        # Read the text before opening the file: if accessing it fails, no
        # empty or truncated .qmd is left behind.
        answer_text = response.text
        with open(output_file_name, 'w', encoding='utf-8') as f:
            f.write(answer_text)



Working for topic: clustering_Hierarchical Clustering
Topic Hierarchical Clustering has 13 questions: 
 Working on question 1. What is hierarchical clustering, and what are its two main types?
{"question": "1. What is hierarchical clustering, and what are its two main types?", "response_guideline": "A strong answer should define hierarchical clustering, distinguish between agglomerative (bottom-up) and divisive (top-down) approaches, and briefly mention scenarios or advantages and disadvantages for each.", "topic": "Hierarchical Clustering"}
Done generating response for question
---------------------
/workspaces/codespaces-jupyter/output/quarto_content/clustering/Hierarchical_Clustering/Hierarchical_Clustering_0.qmd
 Working on question 2. Explain the difference between agglomerative and divisive hierarchical clustering. When might one be preferred over the other?
{"question": "2. Explain the difference between agglomerative and divisive hierarchical clustering. When might one be prefe

In [12]:
# Choose the optimisation topic group and the folder its answers are written to
base_topic = 'optimisation'
output_directory = optimisation_dir
topic_categories = ml_ai_ds_topics[base_topic + "_topics"]
# Display the selected topics as the cell result
topic_categories

['Gradient Descent',
 'Stochastic Gradient Descent',
 'Mini-Batch Gradient Descent',
 'Momentum',
 'Nesterov Accelerated Gradient',
 'Adagrad',
 'RMSprop',
 'Adam, AdaMax, AdamW',
 'Learning Rate Scheduling and Hyperparameter Tuning for Optimisation']

In [13]:
import re

# For each selected topic: load its question file, ask Gemini to answer every
# question, and write each answer to <output_directory>/<topic>/<topic>_<index>.qmd.
for topic in topic_categories:
    # File-system friendly topic name: every non-alphanumeric character becomes '_'
    topic_str = re.sub(r'[^a-zA-Z0-9]', '_', topic)
    print(f"Working for topic: {base_topic}_{topic}")

    # Read the question file through a context manager so the handle is closed
    output_json_filename =  f"{question_output_dir}{base_topic}_{topic_str}.json"
    with open(output_json_filename, 'r', encoding='utf-8') as questions_file:
        generated_questions = json.load(questions_file)
    print(f"Topic {topic} has {len(generated_questions['questions'])} questions: ")

    # The per-topic folder does not depend on the question, so create it once
    topic_output_dir = os.path.join(output_directory, topic_str)
    os.makedirs(topic_output_dir, exist_ok=True)

    for index, cur_question in enumerate(generated_questions['questions']):
        print(f" Working on question {cur_question['question']}")
        response = generate_gemini_response(cur_question, topic=topic)
        print("Done generating response for question")
        print("---------------------")
        output_file_name = os.path.join(topic_output_dir, f'{topic_str}_{index}.qmd')
        print(output_file_name)
        # Read the text before opening the file: if accessing it fails, no
        # empty or truncated .qmd is left behind.
        answer_text = response.text
        with open(output_file_name, 'w', encoding='utf-8') as f:
            f.write(answer_text)



Working for topic: optimisation_Gradient Descent
Topic Gradient Descent has 6 questions: 
 Working on question 1. Can you explain the basic intuition behind gradient descent and how it is used to minimize a cost function in machine learning models?
{"question": "1. Can you explain the basic intuition behind gradient descent and how it is used to minimize a cost function in machine learning models?", "response_guideline": "A good answer should describe the notion of using the gradient to iteratively update parameters in the opposite direction of the slope, mention the significance of the learning rate, and ideally reference common pitfalls like overshooting minima and getting stuck in local optima.", "topic": "Gradient Descent"}
Done generating response for question
---------------------
/workspaces/codespaces-jupyter/output/quarto_content/optimisation/Gradient_Descent/Gradient_Descent_0.qmd
 Working on question 2. How does the choice of learning rate affect the convergence of gradient 

In [10]:
# Choose the MLOps topic group and the folder its answers are written to.
# This group's key in ml_ai_ds_topics is spelled out rather than built from base_topic.
base_topic = 'ml_ops'
output_directory = ml_ops
topic_categories = ml_ai_ds_topics["mlops_and_model_deployment"]
# Display the selected topics as the cell result
topic_categories

['ML Pipelines (Airflow, Kubeflow)',
 'Containerization (Docker) and Orchestration (Kubernetes)',
 'Cloud Platforms (AWS, GCP, Azure) for ML',
 'Continuous Integration/Continuous Deployment (CI/CD) in ML',
 'Model Serving (Flask, FastAPI, TensorFlow Serving, TorchServe)',
 'Monitoring and Logging for Deployed Models',
 'A/B Testing, Canary Deployments',
 'Feature Stores & Data Serving',
 'Model Versioning and Governance',
 'Tools & Frameworks in the MLOps Ecosystem',
 'Model Performance Metrics in Production',
 'Explainability & Interpretability in Production',
 'Model Monitoring & Drift Detection']

In [11]:
import re

# For each selected topic: load its question file, ask Gemini to answer every
# question, and write each answer to <output_directory>/<topic>/<topic>_<index>.qmd.
for topic in topic_categories:
    # File-system friendly topic name: every non-alphanumeric character becomes '_'
    topic_str = re.sub(r'[^a-zA-Z0-9]', '_', topic)
    print(f"Working for topic: {base_topic}_{topic}")

    # Read the question file through a context manager so the handle is closed
    output_json_filename =  f"{question_output_dir}{base_topic}_{topic_str}.json"
    with open(output_json_filename, 'r', encoding='utf-8') as questions_file:
        generated_questions = json.load(questions_file)
    print(f"Topic {topic} has {len(generated_questions['questions'])} questions: ")

    # The per-topic folder does not depend on the question, so create it once
    topic_output_dir = os.path.join(output_directory, topic_str)
    os.makedirs(topic_output_dir, exist_ok=True)

    for index, cur_question in enumerate(generated_questions['questions']):
        print(f" Working on question {cur_question['question']}")
        response = generate_gemini_response(cur_question, topic=topic)
        print("Done generating response for question")
        print("---------------------")
        output_file_name = os.path.join(topic_output_dir, f'{topic_str}_{index}.qmd')
        print(output_file_name)
        # Read the text before opening the file: if accessing it fails, no
        # empty or truncated .qmd is left behind.
        answer_text = response.text
        with open(output_file_name, 'w', encoding='utf-8') as f:
            f.write(answer_text)



Working for topic: ml_ops_ML Pipelines (Airflow, Kubeflow)
Topic ML Pipelines (Airflow, Kubeflow) has 7 questions: 
 Working on question 1. What is the role of ML pipelines in production machine learning systems, and why are tools like Airflow and Kubeflow critical for managing these pipelines?
{"question": "1. What is the role of ML pipelines in production machine learning systems, and why are tools like Airflow and Kubeflow critical for managing these pipelines?", "response_guideline": "A strong answer should cover how ML pipelines help in structuring, automating, tracking, and scaling ML workflows. The candidate should mention components such as data ingestion, preprocessing, training, evaluation, and deployment. They should also discuss how Airflow and Kubeflow aid in orchestration and reproducibility, and possibly point out challenges in production environments such as dependency management and version control.", "topic": "ML Pipelines (Airflow, Kubeflow)"}
Done generating respons