In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/transcripts/transcription.txt


In [2]:
!pip install transformers torch accelerate




In [3]:
pip install torch transformers datasets accelerate peft bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.3
Note: you may need to restart the kernel to use updated packages.


In [4]:
import requests
from bs4 import BeautifulSoup

def extract_text_from_url(url, timeout=30):
    """Download a web page and return the text of its content elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection can block the notebook indefinitely.

    Returns:
        The text of every ``<p>``, ``<h1>``, ``<h2>``, ``<h3>`` and ``<li>``
        element found by the parser, joined with newlines.

    Raises:
        requests.exceptions.HTTPError: If the server replies with a 4xx/5xx
            status code.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of scraping the error page
    # into the training corpus.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract text from paragraphs, headings and list items
    content_tags = soup.find_all(["p", "h1", "h2", "h3", "li"])
    return "\n".join(tag.get_text() for tag in content_tags)

import json

# Pages that make up the raw fine-tuning corpus
urls = [
    "https://www.geeksforgeeks.org/basics-of-computer-programming-for-beginners/?ref=gcse_outind",
    "https://www.geeksforgeeks.org/introduction-to-computer-graphics/?ref=gcse_outind",
    "https://www.geeksforgeeks.org/computer-science-programming-for-kids/?ref=gcse_outind",
    "https://www.geeksforgeeks.org/introduction-to-programming-languages/?ref=gcse_outind",
    "https://www.geeksforgeeks.org/go-programming-language-introduction/?ref=gcse_outind",
    "https://www.geeksforgeeks.org/introduction-to-data-structures/?ref=gcse_outind",
    "https://www.geeksforgeeks.org/introduction-to-algorithms/?ref=gcse_outind",
    "https://www.geeksforgeeks.org/computer-fundamentals-tutorial/?ref=gcse_outind",
    "https://www.geeksforgeeks.org/c-language-introduction/?ref=gcse_outind",
    "https://www.geeksforgeeks.org/basics-of-computer-and-its-operations/?ref=gcse_outind"
]

# One {"text": ...} record per page, in the same order as `urls`
data = [dict(text=extract_text_from_url(page_url)) for page_url in urls]

# Persist the raw scrape as JSON Lines: one JSON object per line
with open("scraped_data.jsonl", "w") as f:
    f.writelines(json.dumps(record) + "\n" for record in data)


In [5]:
import re

def clean_text(text):
    """Normalize scraped text to single-spaced, ASCII-only content.

    Args:
        text: Raw text to clean.

    Returns:
        ``text`` with every whitespace run collapsed to one space, all
        non-ASCII characters removed, and leading/trailing spaces stripped.
    """
    # Collapse whitespace first: for str patterns \s also matches Unicode
    # whitespace (e.g. non-breaking space), so those become plain spaces
    # rather than being deleted by the non-ASCII filter below.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^\x00-\x7F]+", "", text)  # Remove non-ASCII characters
    # Deleting a non-ASCII run that sat between two spaces leaves them
    # adjacent ("a — b" -> "a  b"), so collapse once more.
    text = re.sub(r" {2,}", " ", text)
    return text.strip()

# Run every scraped record through clean_text
cleaned_data = [dict(text=clean_text(record["text"])) for record in data]

# Write the cleaned corpus as JSON Lines: one JSON object per line
with open("cleaned_data.jsonl", "w") as f:
    f.writelines(json.dumps(record) + "\n" for record in cleaned_data)


In [None]:
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType

# Read the Hugging Face access token from the environment instead of storing it
# in the notebook. SECURITY: a token was previously hard-coded in this cell;
# treat it as leaked and revoke it in the Hugging Face account settings.
# If HF_TOKEN is unset this is None and any locally stored login is used.
hf_token = os.environ.get("HF_TOKEN")

# 🟢 Load Model & Tokenizer with `hf_token`
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Load the weights in half precision
    token=hf_token,             # Use Hugging Face authentication token
    device_map="auto"           # Let the library place the model on available devices
)

from peft import get_peft_model, LoraConfig, TaskType

# 🟢 LoRA adapter settings
# Names of the submodules that receive LoRA adapters
lora_target_modules = ["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=lora_target_modules,
    r=8,               # adapter rank
    lora_alpha=32,     # scaling factor
    lora_dropout=0.1,  # dropout applied inside the adapters
    bias="none",       # leave bias terms untouched
)

# Wrap the base model with the adapters and report the trainable-parameter count
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# 🟢 Load Dataset (Make sure `cleaned_data.jsonl` exists)
dataset = load_dataset("json", data_files="cleaned_data.jsonl")

# Tokenize the Dataset
def tokenize_function(example):
    """Tokenize the ``text`` field of a batch of records.

    Every sequence is truncated or padded to exactly 1024 tokens, so text
    beyond that length is discarded.

    Args:
        example: Batch mapping with a ``"text"`` entry (called with
            ``batched=True`` by ``dataset.map``).

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=1024)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Split into training and evaluation sets. The seed is fixed so the same
# records land in each split on every run.
split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# 🟢 Hyperparameters for the LoRA fine-tuning run
training_args = TrainingArguments(
    # Where checkpoints go and how the run is labelled
    output_dir="./llama-lora-finetuned",
    run_name="llama_lora_experiment",
    push_to_hub=False,
    # Evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    # Per-device batch sizes; lower these on out-of-memory errors
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-4,
    weight_decay=0.01,
    # fp16 mixed-precision training; log every 500 steps
    fp16=True,
    logging_steps=500,
)

# 🟢 Trainer with LoRA
from transformers import DataCollatorForLanguageModeling

# The tokenized dataset exposes only input_ids / attention_mask, so without
# labels the model returns no loss and Trainer cannot train. With mlm=False the
# collator builds `labels` from input_ids for causal-LM training and masks
# padding positions with -100 so they are ignored by the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Start LoRA Fine-Tuning
trainer.train()

# Save LoRA Fine-Tuned Model (adapter weights) together with the tokenizer
model.save_pretrained("./llama-lora-finetuned")
tokenizer.save_pretrained("./llama-lora-finetuned")


tokenizer_config.json:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.07k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

trainable params: 26,214,400 || all params: 9,801,406,480 || trainable%: 0.2675


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

In [None]:
def generate_study_material(subject, topic, purpose, lecture, max_new_tokens=4096):
    """Generate structured study material from a lecture transcript.

    Builds an instruction prompt around the transcript, samples a continuation
    from the notebook-level ``model`` and returns only the newly generated text.

    Args:
        subject: Subject name inserted into the prompt.
        topic: Topic name inserted into the prompt.
        purpose: Description of the goal, inserted into the prompt.
        lecture: Full lecture transcript inserted into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated text with the prompt removed and surrounding whitespace
        stripped.
    """
    prompt = f"""
    You are an expert educator who simplifies complex topics for students. Your goal is to **generate a detailed, coherent, and structured study material based on the given lecture transcript.**

    **Ensure that your explanation is strictly aligned with the lecture content while also expanding upon key ideas, providing clarity, and using examples to illustrate concepts.**

    ---
    
    ### **Subject:** {subject}  
    ### **Topic:** {topic}  
    ### **Purpose:** {purpose}  

    ---
    
    ### **Guidelines for Output:**
    - **Faithfully follow the lecture transcript** while ensuring clarity and depth.
    - **Explain key concepts in an engaging, structured, and beginner-friendly manner.**
    - **Use real-world examples, analogies, and step-by-step breakdowns** for better understanding.
    - **If relevant, include small code snippets (Python, R, SQL, etc.), diagrams, or formulas.**
    - **Summarize key takeaways at the end** for easy revision.
    
    ---
    
    ### **Lecture Transcript:**  
    {lecture}
    
    ---
    
    ### **Expected Output Format:**  
    1. **Introduction** – Explain the topic in simple terms and its significance.  
    2. **Concept Breakdown** – Follow the lecture's flow, expanding ideas where needed.  
    3. **Real-Life Examples & Analogies** – Make abstract ideas more relatable.  
    4. **Formulas & Problem-Solving Approaches (if applicable)** – Define key rules, methods, or logic.  
    5. **Code Snippets (if relevant)** – Provide Python, R, or SQL examples where helpful.  
    6. **Diagrams or ASCII Illustrations (if applicable)** – Use simple visuals to clarify concepts.  
    7. **Key Takeaways** – A concise summary of the most important points.  
    8. **Practice Questions or Thought-Provoking Exercises** – Help reinforce learning.  
    
    """

    # Training leaves the model in train mode, where dropout (including the
    # LoRA dropout) is still active; switch to eval mode before sampling.
    model.eval()

    # Follow the model's own device instead of assuming "cuda" is the right one.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.1,
        eos_token_id=tokenizer.eos_token_id
    )

    # The returned sequence starts with the prompt tokens. Drop them by token
    # count: matching the decoded text against the prompt string breaks
    # whenever decoding does not reproduce the prompt character-for-character.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    return generated_text.strip()

# Example Usage: describe the lecture that is being converted
subject = "Computer Science"
topic = "Introduction to Computation and Programming"
purpose = "Convert lecture notes into structured, easy-to-understand study material while staying true to what was taught."

# Load the whole lecture transcript from the Kaggle input dataset
with open("/kaggle/input/transcripts/transcription.txt", "r", encoding="utf-8") as transcript_file:
    lecture = transcript_file.read()

# Produce the study material and show it
study_material = generate_study_material(subject, topic, purpose, lecture)
print(study_material)
