In [1]:
pip install transformers torch


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import json
import time

# ✅ Checkpoint identifier shared by the tokenizer and the model loaded below
model_name = "deepseek-ai/deepseek-coder-6.7b-instruct"

# Tokenizer first, then the causal-LM weights in float16 with device_map="auto".
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# ✅ Function to Generate Questions with Time Limit & Better Control
def generate_questions_deepseek(
    chapter_title,
    chapter_content,
    max_new_tokens=250,
    timeout_seconds=30,
    max_content_chars=700,
):
    """Generate review questions for one chapter with the module-level DeepSeek model.

    A prompt is built from the chapter title and the first ``max_content_chars``
    characters of the chapter text, tokenized (truncated to 512 tokens) and passed
    to ``model.generate``.

    Args:
        chapter_title: Title inserted into the prompt and used in log messages.
        chapter_content: Chapter text as a single string; only its leading
            ``max_content_chars`` characters are used.
        max_new_tokens: Upper bound on generated tokens (default 250).
        timeout_seconds: Time budget in seconds for one generation call
            (default 30). It is forwarded to ``generate`` as ``max_time`` so the
            call is actually bounded instead of only being measured afterwards.
        max_content_chars: Number of leading characters of ``chapter_content``
            placed in the prompt (default 700).

    Returns:
        list[str]: The decoded output split on newlines. If the time budget was
        exceeded the result is ``["Timeout: No questions generated."]``; if
        generation raised, ``["Error occurred during generation."]``.

    NOTE(review): the output saved by this notebook shows the prompt lines ahead
    of the generated text, so the returned list appears to include the prompt.
    A later cell relies on the "Questions:" marker being present, so the prompt
    is deliberately not stripped here.
    """
    # Limit chapter content to keep the prompt (and therefore the run time) small.
    truncated_content = chapter_content[:max_content_chars]

    prompt = f"""
    Based on the following textbook chapter, generate 5 review questions.

    Chapter Title: {chapter_title}
    Chapter Content:
    {truncated_content}

    Questions:
    """

    # Place the inputs on the model's own device rather than a hard-coded "cuda":
    # the model is loaded with device_map="auto", so its device is chosen at load time.
    # NOTE(review): assumes model.device is where the input embeddings live — confirm
    # if the checkpoint ends up sharded across several devices.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)

    try:
        # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
        start_time = time.monotonic()
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            max_time=float(timeout_seconds),  # real time budget enforced during generation
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

        # If the budget was exhausted the text is likely cut off mid-answer, so keep
        # the original contract and report a timeout instead of partial output.
        if time.monotonic() - start_time > timeout_seconds:
            print(f"⚠️ Timeout: Skipping {chapter_title} due to long execution time.")
            return ["Timeout: No questions generated."]

    except Exception as e:
        # Best-effort batch processing: log and return a sentinel so the caller's
        # loop can continue with the next chapter.
        print(f"⚠️ Error while generating questions for {chapter_title}: {str(e)}")
        return ["Error occurred during generation."]

    # ✅ Decode Response
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generated_text.strip().split("\n")

# ✅ Read the structured textbook JSON from disk
with open("structured_textbook.json", "r", encoding="utf-8") as file:
    textbook_data = json.load(file)

# ✅ Collect questions per chapter; chapter_count caps a single run at five chapters
review_questions = {}
chapter_count = 0

for chapter, content in textbook_data.items():
    if chapter_count < 5:
        # Each chapter's content is joined with spaces into one string for the prompt.
        questions = generate_questions_deepseek(chapter, " ".join(content))
        review_questions[chapter] = questions
        chapter_count += 1
    else:
        break

# ✅ Persist the chapter -> question-lines mapping as indented JSON
questions_json_path = "deepseek_review_questions.json"
with open(questions_json_path, "w", encoding="utf-8") as json_file:
    json.dump(review_questions, json_file, indent=4)

print(f"✅ Review questions generated and saved to {questions_json_path}")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Review questions generated and saved to deepseek_review_questions.json


In [3]:
# ✅ Function to Generate Summaries with Better Speed Control
def generate_summary_deepseek(
    chapter_title,
    chapter_content,
    max_new_tokens=150,
    timeout_seconds=20,
    max_content_chars=900,
):
    """Generate a 3-5 sentence summary prompt completion for one chapter.

    A prompt is built from the chapter title and the first ``max_content_chars``
    characters of the chapter text, tokenized (truncated to 512 tokens) and passed
    to the module-level ``model.generate``.

    Args:
        chapter_title: Title inserted into the prompt and used in log messages.
        chapter_content: Chapter text as a single string; only its leading
            ``max_content_chars`` characters are used.
        max_new_tokens: Upper bound on generated tokens (default 150).
        timeout_seconds: Time budget in seconds for one generation call
            (default 20). It is forwarded to ``generate`` as ``max_time`` so the
            call is actually bounded instead of only being measured afterwards.
        max_content_chars: Number of leading characters of ``chapter_content``
            placed in the prompt (default 900).

    Returns:
        str: The decoded output with surrounding whitespace removed. If the time
        budget was exceeded the result is ``"Timeout: No summary generated."``;
        if generation raised, ``"Error occurred during summary generation."``.

    NOTE(review): the full output sequence is decoded, so the returned text
    presumably starts with the prompt itself — confirm before using it as a
    clean summary.
    """
    # Limit chapter content to keep the prompt (and therefore the run time) small.
    truncated_content = chapter_content[:max_content_chars]

    prompt = f"""
    Summarize the following textbook chapter in 3-5 sentences:

    Chapter Title: {chapter_title}
    Chapter Content:
    {truncated_content}

    Summary:
    """

    # Place the inputs on the model's own device rather than a hard-coded "cuda":
    # the model is loaded with device_map="auto", so its device is chosen at load time.
    # NOTE(review): assumes model.device is where the input embeddings live — confirm
    # if the checkpoint ends up sharded across several devices.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)

    try:
        # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
        start_time = time.monotonic()
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            max_time=float(timeout_seconds),  # real time budget enforced during generation
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

        # If the budget was exhausted the text is likely cut off, so keep the
        # original contract and report a timeout instead of partial output.
        if time.monotonic() - start_time > timeout_seconds:
            print(f"⚠️ Timeout: Skipping {chapter_title} due to long execution time.")
            return "Timeout: No summary generated."

    except Exception as e:
        # Best-effort batch processing: log and return a sentinel so the caller's
        # loop can continue with the next chapter.
        print(f"⚠️ Error while generating summary for {chapter_title}: {str(e)}")
        return "Error occurred during summary generation."

    # ✅ Decode Response
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generated_text.strip()

# ✅ Collect summaries per chapter; chapter_count caps a single run at five chapters
chapter_summaries = {}
chapter_count = 0

for chapter, content in textbook_data.items():
    if chapter_count < 5:
        # Each chapter's content is joined with spaces into one string for the prompt.
        summary = generate_summary_deepseek(chapter, " ".join(content))
        chapter_summaries[chapter] = summary
        chapter_count += 1
    else:
        break

# ✅ Persist the chapter -> summary mapping as indented JSON
summaries_json_path = "deepseek_chapter_summaries.json"
with open(summaries_json_path, "w", encoding="utf-8") as json_file:
    json.dump(chapter_summaries, json_file, indent=4)

print(f"✅ Chapter summaries generated and saved to {summaries_json_path}")


✅ Chapter summaries generated and saved to deepseek_chapter_summaries.json


In [6]:
import json
import pandas as pd

# ✅ Read the saved chapter -> question-lines mapping back from disk
with open("deepseek_review_questions.json", "r", encoding="utf-8") as file:
    review_questions = json.load(file)

# ✅ One row per chapter; the chapter's lines are merged into a single newline-separated cell
rows = [
    dict(Chapter=title, Questions="\n".join(lines))
    for title, lines in review_questions.items()
]
df = pd.DataFrame(rows)

# ✅ Plain-text preview of the first five rows
print(df.head())

# ✅ Rich notebook rendering of the whole frame
from IPython.display import display
display(df)


                                             Chapter  \
0  ©2023 Rice University. Textbook content produc...   
1  Attribution 4.0 International License (CC BY 4...   
2              1.1 Computing from Inception to Today   
3                 1.2 Computer Hardware and Networks   
4  1.3 The Internet, Cloud Computing, and the Int...   

                                           Questions  
0  Based on the following textbook chapter, gener...  
1  Based on the following textbook chapter, gener...  
2  Based on the following textbook chapter, gener...  
3  Based on the following textbook chapter, gener...  
4  Based on the following textbook chapter, gener...  


Unnamed: 0,Chapter,Questions
0,©2023 Rice University. Textbook content produc...,"Based on the following textbook chapter, gener..."
1,Attribution 4.0 International License (CC BY 4...,"Based on the following textbook chapter, gener..."
2,1.1 Computing from Inception to Today,"Based on the following textbook chapter, gener..."
3,1.2 Computer Hardware and Networks,"Based on the following textbook chapter, gener..."
4,"1.3 The Internet, Cloud Computing, and the Int...","Based on the following textbook chapter, gener..."


In [10]:
import json

# Load generated questions and summaries
questions_file = "deepseek_review_questions.json"
summaries_file = "deepseek_chapter_summaries.json"
output_file = "chatbot_training_data.jsonl"


def split_questions_and_answers(lines):
    """Split one chapter's saved output lines into question and answer lists.

    The saved lists contain the prompt, blank lines and section headers in
    addition to the generated text, so pairing by list position is meaningless.
    Instead, lines are assigned to a section by the most recent header seen:
    a line starting with "Questions:" opens the question section and a line
    starting with "Answers:" opens the answer section (after stripping
    whitespace). Blank lines and anything before the first header are ignored.

    Args:
        lines: List of text lines for one chapter. Any non-list value is
            treated as having no content.

    Returns:
        tuple[list[str], list[str]]: ``(questions, answers)`` with each line
        stripped, in their original order.
    """
    sections = {"questions": [], "answers": []}
    if not isinstance(lines, list):
        return sections["questions"], sections["answers"]

    current_section = None
    for raw_line in lines:
        line = raw_line.strip()
        if line.startswith("Questions:"):
            current_section = "questions"
        elif line.startswith("Answers:"):
            current_section = "answers"
        elif line and current_section is not None:
            sections[current_section].append(line)

    return sections["questions"], sections["answers"]


try:
    with open(questions_file, "r", encoding="utf-8") as file:
        review_questions = json.load(file)

    with open(summaries_file, "r", encoding="utf-8") as file:
        chapter_summaries = json.load(file)

    chatbot_dataset = []

    # Merge questions, answers, and summaries
    for chapter, question_lines in review_questions.items():
        summary = chapter_summaries.get(chapter, "No summary available.")

        # Pair the i-th question with the i-th answer of the same chapter.
        questions, answers = split_questions_and_answers(question_lines)
        for i, question in enumerate(questions):
            # Generation may stop before every answer is written; keep the
            # question and mark the answer as missing in that case.
            answer = answers[i] if i < len(answers) else "No answer available."

            chatbot_dataset.append({
                "chapter": chapter,
                "summary": summary,
                "question": question.strip(),
                "answer": answer.strip()
            })

    # ✅ Save as JSONL for better fine-tuning (one JSON object per line)
    with open(output_file, "w", encoding="utf-8") as jsonl_file:
        for entry in chatbot_dataset:
            jsonl_file.write(json.dumps(entry) + "\n")

    print(f"✅ Chatbot dataset created and saved as {output_file}")

except FileNotFoundError as e:
    print(f"❌ File not found: {e}")
except json.JSONDecodeError:
    print(f"❌ Error decoding JSON.")
except Exception as e:
    print(f"❌ Unexpected error: {str(e)}")


✅ Chatbot dataset created and saved as chatbot_training_data.jsonl


In [18]:
import json

input_file = "deepseek_review_questions.json"

# Preview: print at most the first 20 lines of the raw file, numbered from 1.
# zip() with range(20) ends the loop once 20 lines have been shown.
with open(input_file, "r", encoding="utf-8") as file:
    for i, line in zip(range(20), file):
        print(f"🔹 Line {i+1}: {line.strip()}")


🔹 Line 1: {
🔹 Line 2: "\u00a92023 Rice University. Textbook content produced by OpenStax is licensed under a Creative Commons": [
🔹 Line 3: "Based on the following textbook chapter, generate 5 review questions.",
🔹 Line 4: "",
🔹 Line 5: "    Chapter Title: \u00a92023 Rice University. Textbook content produced by OpenStax is licensed under a Creative Commons",
🔹 Line 6: "    Chapter Content:",
🔹 Line 7: "    ",
🔹 Line 8: "",
🔹 Line 9: "    Questions:",
🔹 Line 10: "    1. What is the purpose of the chapter?",
🔹 Line 11: "    2. What is the main idea of the chapter?",
🔹 Line 12: "    3. What are the key concepts that the chapter discusses?",
🔹 Line 13: "    4. What are the key takeaways from the chapter?",
🔹 Line 14: "    5. What are the applications of the concepts discussed in the chapter?",
🔹 Line 15: "",
🔹 Line 16: "    Answers:",
🔹 Line 17: "    1. The purpose of the chapter is to provide a comprehensive overview of the topic.",
🔹 Line 18: "    2. The main idea of the chapter is to u

In [20]:
import json
import csv

# Input JSON (chapter -> list of lines) and output CSV locations
input_file = "deepseek_review_questions.json"
output_file = "cleaned_review_questions.csv"

# Read the whole JSON object into memory
with open(input_file, "r", encoding="utf-8") as file:
    data = json.load(file)

# Accumulates one dict per aligned question/answer pair
cleaned_data = []

for chapter, content in data.items():
    # Entries that are not lists are skipped.
    if not isinstance(content, list):
        continue

    questions = []
    answers = []
    current_section = None

    for line in content:
        line = line.strip()

        # Section headers switch the active section and are not stored themselves.
        if line.startswith("Questions:"):
            current_section = "questions"
            continue
        if line.startswith("Answers:"):
            current_section = "answers"
            continue

        # Blank lines carry no content.
        if not line:
            continue

        if current_section == "questions":
            questions.append(line)
        elif current_section == "answers":
            answers.append(line)

    # zip() stops at the shorter list, so only positions that have both a
    # question and an answer produce a row.
    cleaned_data.extend(
        {"chapter": chapter, "question": question, "answer": answer}
        for question, answer in zip(questions, answers)
    )

# Write the rows to CSV, or report that nothing was extracted
if not cleaned_data:
    print("⚠️ No valid data found after processing.")
else:
    with open(output_file, "w", encoding="utf-8", newline="") as csv_file:
        fieldnames = ["chapter", "question", "answer"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(cleaned_data)

    print(f"✅ Successfully saved cleaned data to {output_file}")



✅ Successfully saved cleaned data to cleaned_review_questions.csv


In [22]:
# Reload the cleaned question/answer CSV and show its first 20 rows as the cell output.
data = pd.read_csv("cleaned_review_questions.csv")
data.head(n=20)

Unnamed: 0,chapter,question,answer
0,©2023 Rice University. Textbook content produc...,1. What is the purpose of the chapter?,1. The purpose of the chapter is to provide a ...
1,©2023 Rice University. Textbook content produc...,2. What is the main idea of the chapter?,2. The main idea of the chapter is to understa...
2,©2023 Rice University. Textbook content produc...,3. What are the key concepts that the chapter ...,3. The key concepts that the chapter discusses...
3,©2023 Rice University. Textbook content produc...,4. What are the key takeaways from the chapter?,4. The key takeaways from the chapter include ...
4,©2023 Rice University. Textbook content produc...,5. What are the applications of the concepts d...,5. The applications of the concepts discussed ...
5,Attribution 4.0 International License (CC BY 4...,1. What is the purpose of the CC BY 4.0 license?,1. The CC BY 4.0 license allows for the free r...
6,Attribution 4.0 International License (CC BY 4...,2. What are the conditions of the CC BY 4.0 li...,2. The conditions of the CC BY 4.0 license tha...
7,Attribution 4.0 International License (CC BY 4...,"3. What is the purpose of the attribution ""Acc...","3. The purpose of the attribution ""Access for ..."
8,1.1 Computing from Inception to Today,1. What is the first computer that was used pr...,1. IBM
9,1.1 Computing from Inception to Today,2. Name one key company that was a significant...,"2. Hewlett Packard, Xerox, Apple, and Microsoft"


In [24]:
import pandas as pd

# Load the cleaned CSV file
csv_path = "cleaned_review_questions.csv"  # Update this path if needed
df = pd.read_csv(csv_path)

# Flag rows whose answer is missing or contains only whitespace.
# The column is cast to str before using the .str accessor: when every answer is
# missing (or every answer is numeric) read_csv infers a non-string dtype, and
# calling .str on such a column raises AttributeError instead of flagging the rows.
answer_column = df['answer']
is_blank_answer = answer_column.isna() | answer_column.astype(str).str.strip().eq('')
incomplete_answers_df = df[is_blank_answer]

# Save flagged rows to a separate CSV file
flagged_csv_path = "flagged_review_questions.csv"
incomplete_answers_df.to_csv(flagged_csv_path, index=False)

# Display flagged rows
print("⚠️ Incomplete Answers Found:")
print(incomplete_answers_df)

print(f"✅ Flagged rows saved to {flagged_csv_path}")


⚠️ Incomplete Answers Found:
Empty DataFrame
Columns: [chapter, question, answer]
Index: []
✅ Flagged rows saved to flagged_review_questions.csv
