In [2]:
import pandas as pd
from openai import OpenAI
import os
import google.generativeai as genai
import json
from dotenv import load_dotenv

In [3]:
# Populate the process environment from the local .env file
load_dotenv()

# Look up both API keys in one pass; os.getenv yields None for an unset variable
openai_key, gemini_key = (
    os.getenv(env_name) for env_name in ("OPENAI_KEY", "GEMINI_KEY")
)

In [4]:
# Hand the Gemini key to the google.generativeai SDK (module-level configuration)
genai.configure(api_key=gemini_key)

# Build the OpenAI client from the key read from the environment
openai_client = OpenAI(api_key=openai_key)

In [5]:
# System prompt sent to the answer-generation model.
# NOTE: this is a regular (non-raw) string, so LaTeX backslashes must be doubled:
# a bare "\theta" would be parsed as a TAB character followed by "heta".
ANSWER_RESPONSE_FORMAT = """#Senior-Level Interviewee Answer **

You are a **senior-level candidate** interviewing for a **Data Science / Machine Learning / Artificial Intelligence** role at a top tech company. You will be  asked a question on a specific topic—provided to you in the format below:

Example Question
```json
{
  "topic": "Learning Rate Scheduling",
  "question": "Can you explain the concept of learning rate scheduling and why it is important in training neural networks?",
  "response_guideline": "A good answer should define learning rate scheduling as a technique to adjust the learning rate during training to improve convergence, prevent oscillations, and escape local minima. The candidate should discuss its impact on training dynamics and overall optimization quality."
}
```

Your task is to respond with:

1. **Best Answer (Technical Detail)**  
   - Provide a **comprehensive explanation** of the topic.  
   - Incorporate **mathematical notations, formulas, or derivations** where appropriate.  
   - Make sure for equations you use latex in Markdown Formatting. Use the following syntax: $<equation>$ for inline symbols and equations and $$<equation>$$ for block equations.
   - Ensure you cover both **basic** and **advanced** aspects, demonstrating **senior-level** knowledge.  
   - Address **why** the concept is important, common **techniques** or **variations**, and any **real-world considerations** (e.g., implementation details or corner cases).

2. **How to Narrate**  
   - Offer a concise guide on **delivering** this answer verbally in an interview.  
   - Include **communication tips** to convey expertise clearly.  
   - Suggest how to walk the interviewer through **complex or mathematical sections** without overwhelming them.

---

## **Instructions for Formatting Your Answer**
- Use **Markdown** to format your response.  
- Make sure for equations in Markdown Formatting. Use the following syntax: $<equation>$ for inline equations/symbols and $$<equation>$$ for block equations.
- Within the **Best Answer** section, feel free to use headings, bullet points, or equations as needed.  
- For the **How to Narrate** section, provide a clear, step-by-step or bulleted format that a candidate could follow verbally.
- For inline equations, use single dollar signs, e.g.1, $y = mx + b$.  eg.2, $log(h_\\theta(x))$

---

## **Tone & Style**
- Keep a **professional and confident** tone—appropriate for a senior-level candidate.  
- Present **mathematical content** in a way that is accessible but still shows **deep expertise**.  
- Demonstrate an ability to connect **theoretical** and **practical** insights.

---

## **Example Structure**
```markdown
## Question: {question}
**Best Answer**  
> *Detailed technical explanation with equations, references to research, real-world examples, etc.*

**How to Narrate**  
> *Step-by-step guidance on how to articulate this to an interviewer, including pacing, emphasis, and interaction tips.*
```
---

### **Output Requirement**
Please provide your response in **Markdown**. Begin with the question, then **"Best Answer"** heading, followed by the **detailed technical** content. Then have a **"How to Narrate"** heading with speaking guidelines.


---"""

In [6]:
## Gemini Response Model

# Generation settings passed to the model as its generation_config
gemini_question_response_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=40,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)

# Model handle used for answer generation; the interview-answer prompt is
# installed as the system instruction.
gemini_question_response_model = genai.GenerativeModel(
    model_name="gemini-2.0-flash",
    system_instruction=ANSWER_RESPONSE_FORMAT,
    generation_config=gemini_question_response_config,
)

def generate_gemini_response(question_obj, topic='Machine Learning'):
    """Send one interview question to the Gemini answer model.

    The question is serialised to JSON together with its topic, echoed to
    stdout, and sent as a single message in a fresh chat session.

    Args:
        question_obj: Dict describing the question. It is copied, not
            modified, so the caller's data is left untouched.
        topic: Topic label stored under the "topic" key of the payload.

    Returns:
        Whatever ``chat_session.send_message`` returns for the payload.
    """
    # Build a new dict instead of writing into the caller's object; the key
    # order (existing keys first, "topic" last unless already present) is the
    # same as an in-place assignment would produce.
    payload = {**question_obj, 'topic': topic}
    question_format = json.dumps(payload)
    print(question_format)
    chat_session = gemini_question_response_model.start_chat()
    return chat_session.send_message(question_format)

In [7]:
# Root folder for everything the notebook writes, and the Quarto content
# folder nested inside it.
output_dir_base = '/workspaces/codespaces-jupyter/output/'
output_dir = f'{output_dir_base}quarto_content/'

# Per-topic question JSON files are read from here
question_output_dir = f'{output_dir_base}questions/'

# One Quarto sub-folder per subject area
classification_dir = f'{output_dir}classification/'
regression_dir = f'{output_dir}regression/'
clustering_dir = f'{output_dir}clustering/'
nlp_dir = f'{output_dir}nlp/'
time_series_dir = f'{output_dir}time_series/'
optimisation_dir = f'{output_dir}optimisation/'
neural_networks_dir = f'{output_dir}neural_networks/'

In [12]:

# Topics whose answers are written under the classification folder.
# Order matters: the generation loop slices this list from the front.
classification_topics = [
    # Models and regularisation
    "Logistic Regression",
    "L1 and L2 Regularization",
    "Decision Trees",
    "Random Forest",
    "Gradient Boosting",
    "XGBoost",
    "Support Vector Machines",
    "Naive Bayes",
    "K-Nearest Neighbours",
    "Ensemble Learning",
    # Evaluation, data handling and tuning
    "Evaluation Metrics for Classification",
    "Imbalanced Data Handling",
    "Hyperparameter Tuning for Classification",
    "Feature Selection for Classification",
    "Model Evaluation and Selection for Classification",
    # Additional topics kept in the same list
    "Bayesian Knowledge Tracing",
    "Recommender Systems",
    "Matrix Factorization",
    "Collaborative Filtering",
    "IRT (Item Response Theory)",
]


In [13]:
# For each selected topic: load its question file, ask the model for an answer
# to every question, and write each answer to its own .qmd file.
for topic in classification_topics[:1]:  # slice limits the run to the first topic
    print(f"Working on topic: {topic}")
    topic_str = topic.replace(' ', '_')

    # Context manager closes the questions file as soon as it has been parsed
    # (a bare json.load(open(...)) leaves the handle open).
    with open(f"{question_output_dir}{topic_str}.json", 'r', encoding='utf-8') as questions_file:
        generated_questions = json.load(questions_file)
    print(f"Topic {topic} has {len(generated_questions['questions'])} questions: ")

    # The target folder depends only on the topic, so create it once per topic
    # rather than once per question.
    topic_output_dir = classification_dir + f'{topic_str}/'
    os.makedirs(topic_output_dir, exist_ok=True)

    for index, cur_question in enumerate(generated_questions['questions']):
        print(f" Working on question {cur_question['question']}")
        response = generate_gemini_response(cur_question, topic=topic)
        print("Done generating response for question")
        print("---------------------")
        output_file_name = f'{topic_output_dir}{topic_str}_{index}.qmd'
        print(output_file_name)
        # Explicit UTF-8: generated answers can contain non-ASCII characters,
        # and the platform default encoding is not guaranteed to handle them.
        with open(output_file_name, 'w', encoding='utf-8') as f:
            f.write(response.text)



Working on topic: Logistic Regression
Topic Logistic Regression has 13 questions: 
 Working on question 1. Can you provide a high-level overview of logistic regression and explain why the logistic (sigmoid) function is used in place of a linear function in binary classification?
{"question": "1. Can you provide a high-level overview of logistic regression and explain why the logistic (sigmoid) function is used in place of a linear function in binary classification?", "response_guideline": "A strong answer should cover the use of logistic regression for classification tasks, describe the sigmoid function\u2019s property of mapping any real-valued input to a (0, 1) range which works as a probability, and explain that linear functions cannot directly model probabilities while the logistic function can.", "topic": "Logistic Regression"}
Done generating response for question
---------------------
/workspaces/codespaces-jupyter/output/quarto_content/classification/Logistic_Regression/Logisti

In [10]:
# Display the path most recently assigned to output_file_name by the generation loop.
# NOTE(review): the saved output for this cell comes from an earlier execution (In [10],
# run before the loop cell) and shows '_3.qmd', which does not match the current
# '{topic_str}_{index}.qmd' naming — re-run after the loop to refresh it.
output_file_name

'/workspaces/codespaces-jupyter/output/quarto_content/classification/Logistic_Regression/_3.qmd'