In [3]:
import os
import re

def extract_questions():
    """
    Traverse all subdirectories of "/workspaces/codespaces-jupyter/output/quarto_content",
    look for files with a ".qmd" extension, read only their first 15 lines, and search for a
    line containing the word "question" (case-insensitive). When found, extract the text after
    the word "question" (ignoring any punctuation or delimiter right after it), trim any leading
    special characters/numbers/spaces from the captured text, and store the result.
    If no such line is found in a file, the question is recorded as an empty string.

    Returns:
        list of dict: Each dictionary contains:
            - 'file_path': The full path to the file.
            - 'question': The cleaned question string (or an empty string if not found).
    """
    base_path = "/workspaces/codespaces-jupyter/output/quarto_content"
    results = []

    for root, dirs, files in os.walk(base_path):
        for file in files:
            if file.lower().endswith(".qmd"):
                file_path = os.path.join(root, file)
                question_text = ""  # default if no question is found

                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        # Only process the first 15 lines
                        for _ in range(15):
                            line = f.readline()
                            if not line:
                                break  # Reached end of file

                            # Check if the line contains 'question' (case-insensitive)
                            if 'question' in line.lower():
                                # Extract characters after the word 'question'
                                # This regex looks for the word 'question' (ignoring case),
                                # followed by optional whitespace and delimiters like ':' or '-',
                                # then captures the rest of the line.
                                match = re.search(r'question\s*[:\-]*\s*(.*)', line, re.IGNORECASE)
                                if match:
                                    question_text = match.group(1).strip()
                                    # Remove any leading characters that are not letters (e.g., special characters, numbers, or spaces)
                                    question_text = re.sub(r'^[^A-Za-z]+', '', question_text)
                                    # Optionally, remove any other special characters from the rest of the text
                                    question_text = re.sub(r'[^A-Za-z0-9\s]', '', question_text)
                                # Stop processing further lines once the question is found
                                break
                except Exception as e:
                    print(f"Error reading file {file_path}: {e}")

                results.append({
                    'file_path': file_path,
                    'question': question_text
                })
                
    return results

# # Example usage:
# if __name__ == "__main__":
#     extracted_questions = extract_questions()
#     for item in extracted_questions:
#         print(item)


In [4]:
extracted_questions = extract_questions()
print(f"Number of files {len(extracted_questions)}")
empty_question_files = [entry for entry in extracted_questions if not entry['question']]
print(f"Files with no questions {len(empty_question_files)}")


Number of files 445
Files with no questions 1


In [5]:
[i for i in extracted_questions if 'logistic' in i['file_path'].lower()][:2]

[{'file_path': '/workspaces/codespaces-jupyter/output/quarto_content/classification/Logistic_Regression/Logistic_Regression_11.qmd',
  'question': 'Logistic regression models produce probabilities for binary outcomes How would you calibrate these probabilities if you suspect that they are poorly calibrated and why is calibration important'},
 {'file_path': '/workspaces/codespaces-jupyter/output/quarto_content/classification/Logistic_Regression/Logistic_Regression_6.qmd',
  'question': 'Logistic regression is based on certain assumptions What are these assumptions and how can violations of these assumptions affect model performance'}]

In [6]:
# import os

# def generate_index_qmd(extracted_questions, output_file="/workspaces/codespaces-jupyter/index.qmd"):
#     """
#     Generate a Quarto index.qmd file using the extracted_questions list.

#     The file groups questions by category and subcategory.
#     It converts an absolute file path like:
#         /workspaces/codespaces-jupyter/output/quarto_content/classification/Logistic_Regression/Logistic_Regression_11.qmd
#     into a relative path like:
#         output/quarto_content/classification/Logistic_Regression/Logistic_Regression_11.qmd

#     The index.qmd file will be structured as follows:

#     ---
#     title: "Data Science interview questions to practice"
#     format:
#       html:
#         toc: true
#     ---

#     ## classification

#     ### Decision_Trees

#     - [Question text](output/quarto_content/classification/Decision_Trees/Decision_Trees_0.qmd)
#     - [Question text](output/quarto_content/classification/Decision_Trees/Decision_Trees_1.qmd)
#     - [Question text](output/quarto_content/classification/Decision_Trees/Decision_Trees_10.qmd)

#     ### Logistic_Regression

#     - [Question text](output/quarto_content/classification/Logistic_Regression/Logistic_Regression_11.qmd)
#     - [Question text](output/quarto_content/classification/Logistic_Regression/Logistic_Regression_6.qmd)
#     - [Question text](output/quarto_content/classification/Logistic_Regression/Logistic_Regression_4.qmd)
#     """
#     # Group the entries by category and subcategory.
#     groups = {}
#     for entry in extracted_questions:
#         file_path = entry['file_path']
#         question = entry['question']
#         # Convert to relative path: remove the /workspaces/codespaces-jupyter/ prefix if present.
#         base_prefix = "/workspaces/codespaces-jupyter/"
#         if file_path.startswith(base_prefix):
#             relative_path = file_path[len(base_prefix):]
#         else:
#             relative_path = file_path

#         # Split the path into parts using the OS separator.
#         parts = file_path.split(os.sep)
#         try:
#             # Find where 'quarto_content' is located in the path
#             idx = parts.index("quarto_content")
#             # The next two parts should be the category and subcategory.
#             category = parts[idx+1] if idx+1 < len(parts) else "Unknown"
#             subcategory = parts[idx+2] if idx+2 < len(parts) else "General"
#         except ValueError:
#             # In case the expected folder name is not found
#             category, subcategory = "Unknown", "General"

#         groups.setdefault(category, {}).setdefault(subcategory, []).append({
#             "relative_path": relative_path,
#             "question": question
#         })

#     # Build the content for index.qmd
#     lines = []
#     # YAML front matter
#     lines.append("---")
#     lines.append('title: "Data Science interview questions to practice"')
#     lines.append("format:")
#     lines.append("  html:")
#     lines.append("    toc: true")
#     lines.append("---")
#     lines.append("")  # Blank line

#     # Create the Markdown sections for each category and subcategory.
#     for category in sorted(groups.keys()):
#         lines.append(f"## {category}")
#         lines.append("")  # Blank line
#         for subcategory in sorted(groups[category].keys()):
#             lines.append(f"### {subcategory}")
#             lines.append("")  # Blank line
#             for item in groups[category][subcategory]:
#                 relative_path = item["relative_path"]
#                 question_text = item["question"]
#                 # Use a placeholder if question_text is empty.
#                 display_text = question_text if question_text else "No question text available"
#                 # Add the bullet list item.
#                 lines.append(f"- [{display_text}]({relative_path})")
#             lines.append("")  # Blank line after each subcategory
#         lines.append("")  # Blank line after each category

#     # Write the generated content to the output file.
#     with open(output_file, "w", encoding="utf-8") as f:
#         f.write("\n".join(lines))

#     print(f"Index file generated: {output_file}")

# # # Example usage:
# # if __name__ == "__main__":
# #     # Suppose extracted_questions is your list from the previous function.
# #     extracted_questions = [
# #         {'file_path': '/workspaces/codespaces-jupyter/output/quarto_content/classification/Logistic_Regression/Logistic_Regression_11.qmd',
# #          'question': 'Logistic regression models produce probabilities for binary outcomes How would you calibrate these probabilities if you suspect that they are poorly calibrated and why is calibration important'},
# #         {'file_path': '/workspaces/codespaces-jupyter/output/quarto_content/classification/Logistic_Regression/Logistic_Regression_6.qmd',
# #          'question': 'Logistic regression is based on certain assumptions What are these assumptions and how can violations of these assumptions affect model performance'},
# #         {'file_path': '/workspaces/codespaces-jupyter/output/quarto_content/classification/Logistic_Regression/Logistic_Regression_4.qmd',
# #          'question': 'How would you incorporate regularization both L1 and L2 into the logistic regression model What effect does regularization have on the model parameters and overall model performance'},
# #         # ... add additional entries as needed
# #     ]

# #     generate_index_qmd(extracted_questions)


In [7]:
import os
import re

def extract_index(file_path):
    """
    Extracts the numerical index from a file name.
    For example, given a file name like "Logistic_Regression_11.qmd",
    it will return 11 as an integer. If no index is found, it returns 0.
    """
    base_name = os.path.basename(file_path)
    match = re.search(r'_(\d+)\.qmd$', base_name, re.IGNORECASE)
    return int(match.group(1)) if match else 0

def generate_index_qmd(extracted_questions, output_file="index.qmd"):
    """
    Generate a Quarto index.qmd file using the extracted_questions list.

    The file groups questions by category and subcategory. It converts an absolute file path like:
        /workspaces/codespaces-jupyter/output/quarto_content/classification/Logistic_Regression/Logistic_Regression_11.qmd
    into a relative path like:
        output/quarto_content/classification/Logistic_Regression/Logistic_Regression_11.qmd

    Files under each subcategory are sorted in ascending order based on the numerical index
    extracted from the file name (e.g., _11.qmd).

    The index.qmd file will be structured as follows:

    ---
    title: "Data Science interview questions to practice"
    format:
      html:
        toc: true
    ---

    ## classification

    ### Decision_Trees

    - [Question text](output/quarto_content/classification/Decision_Trees/Decision_Trees_0.qmd)
    - [Question text](output/quarto_content/classification/Decision_Trees/Decision_Trees_1.qmd)
    - [Question text](output/quarto_content/classification/Decision_Trees/Decision_Trees_10.qmd)

    ### Logistic_Regression

    - [Question text](output/quarto_content/classification/Logistic_Regression/Logistic_Regression_11.qmd)
    - [Question text](output/quarto_content/classification/Logistic_Regression/Logistic_Regression_6.qmd)
    - [Question text](output/quarto_content/classification/Logistic_Regression/Logistic_Regression_4.qmd)
    """
    # Group the entries by category and subcategory.
    groups = {}
    for entry in extracted_questions:
        file_path = entry['file_path']
        question = entry['question']
        # Convert to relative path: remove the '/workspaces/codespaces-jupyter/' prefix if present.
        base_prefix = "/workspaces/codespaces-jupyter/"
        if file_path.startswith(base_prefix):
            relative_path = file_path[len(base_prefix):]
        else:
            relative_path = file_path

        # Split the path into parts using the OS separator.
        parts = file_path.split(os.sep)
        try:
            # Find where 'quarto_content' is located in the path.
            idx = parts.index("quarto_content")
            # The next two parts should be the category and subcategory.
            category = parts[idx+1] if idx+1 < len(parts) else "Unknown"
            subcategory = parts[idx+2] if idx+2 < len(parts) else "General"
        except ValueError:
            # In case the expected folder name is not found.
            category, subcategory = "Unknown", "General"

        groups.setdefault(category, {}).setdefault(subcategory, []).append({
            "relative_path": relative_path,
            "question": question
        })

    # Build the content for index.qmd
    lines = []
    # YAML front matter
    lines.append("---")
    lines.append('title: "Data Science interview questions to practice"')
    lines.append("format:")
    lines.append("  html:")
    lines.append("    toc: true")
    lines.append("---")
    lines.append("")  # Blank line

    # Create the Markdown sections for each category and subcategory.
    for category in sorted(groups.keys()):
        lines.append(f"## {category}")
        lines.append("")  # Blank line
        for subcategory in sorted(groups[category].keys()):
            lines.append(f"### {subcategory}")
            lines.append("")  # Blank line

            # Sort files based on the numerical index in their filename
            sorted_items = sorted(groups[category][subcategory],
                                  key=lambda item: extract_index(item["relative_path"]))
            for item in sorted_items:
                relative_path = item["relative_path"]
                question_text = item["question"]
                # Use a placeholder if question_text is empty.
                display_text = question_text if question_text else "No question text available"
                # Add the bullet list item.
                lines.append(f"- [{display_text}]({relative_path})")
            lines.append("")  # Blank line after each subcategory
        lines.append("")  # Blank line after each category

    # Write the generated content to the output file.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    print(f"Index file generated: {output_file}")

# # Example usage:
# if __name__ == "__main__":
#     # Example list of extracted questions.
#     extracted_questions = [
#         {'file_path': '/workspaces/codespaces-jupyter/output/quarto_content/classification/Logistic_Regression/Logistic_Regression_11.qmd',
#          'question': 'Logistic regression models produce probabilities for binary outcomes How would you calibrate these probabilities if you suspect that they are poorly calibrated and why is calibration important'},
#         {'file_path': '/workspaces/codespaces-jupyter/output/quarto_content/classification/Logistic_Regression/Logistic_Regression_6.qmd',
#          'question': 'Logistic regression is based on certain assumptions What are these assumptions and how can violations of these assumptions affect model performance'},
#         {'file_path': '/workspaces/codespaces-jupyter/output/quarto_content/classification/Logistic_Regression/Logistic_Regression_4.qmd',
#          'question': 'How would you incorporate regularization both L1 and L2 into the logistic regression model What effect does regularization have on the model parameters and overall model performance'},
#         # Add additional entries as needed...
#     ]

#     generate_index_qmd(extracted_questions)


In [8]:
generate_index_qmd(extracted_questions, output_file="/workspaces/codespaces-jupyter/index.qmd")

Index file generated: /workspaces/codespaces-jupyter/index.qmd
