In [15]:
import os

def split_file(file_path, output_dir, chunk_size):
    """
    Splits a file into smaller chunks.

    The source file is read sequentially in binary mode and each piece is
    written to ``output_dir`` as ``chunk_1``, ``chunk_2``, ... in read order.
    Failures are reported on stdout instead of being raised.

    Note: files already present in ``output_dir`` are not removed, so chunks
    left over from an earlier run with more pieces stay in the directory.

    Args:
        file_path (str): Path to the file to be split.
        output_dir (str): Directory where the chunks will be saved. It is
            created if it does not exist.
        chunk_size (int): Size of each chunk in bytes. Must be greater than
            zero.

    Returns:
        None
    """
    try:
        # read(0) returns b"" immediately (zero chunks, yet a "success"
        # message) and a negative/None size reads the whole file into memory
        # as a single chunk, so reject those before touching the filesystem.
        if chunk_size is None or chunk_size <= 0:
            raise ValueError(
                f"chunk_size must be a positive number of bytes, got {chunk_size!r}"
            )

        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs(output_dir, exist_ok=True)

        chunks_written = 0
        with open(file_path, 'rb') as file:
            while True:
                # Read the next piece; an empty result means end of file
                chunk = file.read(chunk_size)
                if not chunk:
                    break

                chunks_written += 1
                chunk_file_path = os.path.join(output_dir, f"chunk_{chunks_written}")
                with open(chunk_file_path, 'wb') as chunk_file:
                    chunk_file.write(chunk)

                print(f"Created: {chunk_file_path}")

        print(f"File split successfully into {chunks_written} chunks!")
    except Exception as e:
        print(f"An error occurred during file splitting: {e}")

if __name__ == "__main__":
    # Source file and destination directory for the chunks. Both are absolute
    # paths on one specific machine and must be adjusted before running elsewhere.
    file_to_split = "/Users/roshin/Documents/Slides/templates/mlx/model.safetensors"
    output_directory = "/Users/roshin/Documents/Slides/templates/mlx/output"
    
    # Chunk size in bytes: 90 * 1024 * 1024 bytes = 90 MiB per chunk
    chunk_size_in_bytes = 90 * 1024 * 1024  # 90 MiB
    
    # Split the file; split_file prints one "Created: ..." line per chunk
    split_file(file_to_split, output_directory, chunk_size_in_bytes)


Created: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_1
Created: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_2
Created: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_3
Created: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_4
Created: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_5
Created: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_6
Created: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_7
Created: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_8
Created: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_9
Created: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_10
Created: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_11
Created: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_12
Created: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_13
Created: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_14
Created: /Users/roshin/Docume

In [16]:
import os

def merge_chunks(output_file_path, chunks_dir):
    """
    Merges chunks back into the original file.

    Only regular files named ``chunk_<number>`` are merged, in ascending
    numeric order (so ``chunk_10`` follows ``chunk_9``). Any other entry in
    ``chunks_dir`` is ignored rather than aborting the merge. Failures are
    reported on stdout instead of being raised.

    Args:
        output_file_path (str): Path where the merged file will be saved.
            An existing file at this path is overwritten.
        chunks_dir (str): Directory containing the chunk files to merge.

    Returns:
        None
    """
    prefix = "chunk_"
    copy_block_size = 1024 * 1024  # stream in 1 MiB blocks to bound memory use

    try:
        # Collect (number, name) pairs. Names whose suffix is not purely
        # numeric (e.g. "chunk_notes.txt") would make int() raise, so they
        # are skipped up front.
        numbered_chunks = []
        for name in os.listdir(chunks_dir):
            if not name.startswith(prefix):
                continue
            suffix = name[len(prefix):]
            if not suffix.isdecimal():
                continue
            if not os.path.isfile(os.path.join(chunks_dir, name)):
                continue
            numbered_chunks.append((int(suffix), name))
        numbered_chunks.sort()

        # Append every chunk to the output file in numeric order
        with open(output_file_path, 'wb') as output_file:
            for _, chunk_file in numbered_chunks:
                chunk_file_path = os.path.join(chunks_dir, chunk_file)
                with open(chunk_file_path, 'rb') as cf:
                    # Copy block by block instead of loading the whole chunk
                    for block in iter(lambda: cf.read(copy_block_size), b""):
                        output_file.write(block)
                print(f"Merged: {chunk_file_path}")

        print(f"File merged successfully into: {output_file_path}")
    except Exception as e:
        print(f"An error occurred during file merging: {e}")

if __name__ == "__main__":
    # Directory holding the chunk files and the path of the file to rebuild.
    # Both are absolute paths on one specific machine; adjust before running elsewhere.
    chunks_directory = "/Users/roshin/Documents/Slides/templates/mlx/output"
    merged_file_path = "/Users/roshin/Documents/Slides/templates/mlx/model_merged new.safetensors"
    
    # Rebuild the file; merge_chunks prints one "Merged: ..." line per chunk
    merge_chunks(merged_file_path, chunks_directory)


Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_1
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_2
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_3
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_4
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_5
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_6
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_7
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_8
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_9
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_10
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_11
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_12
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_13
Merged: /Users/roshin/Documents/Slides/templates/mlx/output/chunk_14
Merged: /Users/roshin/Documents/Slides/temp

In [None]:
from mlx import MLXModel, MLXTokenizer

def load_mlx_llm_model(model_path):
    """
    Load a model and its tokenizer from a local directory.

    Both objects are created with ``from_pretrained(model_path)``. Any
    exception raised while loading is printed and turned into a
    ``(None, None)`` result instead of propagating.

    NOTE(review): ``MLXModel`` and ``MLXTokenizer`` are imported from ``mlx``
    at the top of this cell; confirm that package actually exposes them.

    Args:
        model_path (str): Path to the directory containing the MLX model files.

    Returns:
        tuple: ``(model, tokenizer)`` on success, ``(None, None)`` on failure.
    """
    try:
        # Model first, then tokenizer, both from the same directory
        loaded = (
            MLXModel.from_pretrained(model_path),
            MLXTokenizer.from_pretrained(model_path),
        )
        print("MLX LLM model and tokenizer loaded successfully!")
    except Exception as err:
        print(f"Error loading MLX LLM model: {err}")
        loaded = (None, None)
    return loaded

def generate_response(prompt, model, tokenizer, max_length=100):
    """
    Produce a text completion for ``prompt`` with the given model/tokenizer.

    The prompt is encoded with ``tokenizer.encode``, passed to
    ``model.generate`` together with a fixed set of sampling options, and the
    first returned sequence is decoded back to text. Any exception is printed
    and turned into a ``None`` result.

    NOTE(review): ``return_tensors="pt"`` and the ``generate`` keyword names
    look like the Hugging Face transformers API; confirm the objects returned
    by ``load_mlx_llm_model`` accept them.

    Args:
        prompt (str): The input prompt text.
        model: Object exposing ``generate(inputs, **options)``.
        tokenizer: Object exposing ``encode`` and ``decode``.
        max_length (int): Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        str or None: The decoded response, or ``None`` if anything failed.
    """
    # Options forwarded verbatim to model.generate
    sampling_options = {
        "max_length": max_length,
        "num_return_sequences": 1,
        "no_repeat_ngram_size": 2,
        "temperature": 0.7,
        "top_k": 50,
        "top_p": 0.95,
        "do_sample": True,
    }

    try:
        token_ids = tokenizer.encode(prompt, return_tensors="pt")
        generated = model.generate(token_ids, **sampling_options)
        # Only the first generated sequence is decoded and returned
        return tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as err:
        print(f"Error generating response: {err}")
        return None

if __name__ == "__main__":
    # Relative path to the local MLX model directory (placeholder value)
    model_directory = "./mlx_llm_model"  # Replace with the actual path to your model
    
    # Load the model and tokenizer; both are None if loading failed
    mlx_model, mlx_tokenizer = load_mlx_llm_model(model_directory)
    
    # Continue only when loading succeeded (relies on the objects being truthy)
    if mlx_model and mlx_tokenizer:
        # Example prompt
        example_prompt = "What are the applications of large language models?"
        
        # Generate a response with the default max_length
        response = generate_response(example_prompt, mlx_model, mlx_tokenizer)
        
        # generate_response returns None on failure, so print only real output
        if response:
            print("Generated Response:")
            print(response)


In [None]:
from mlx_lm import load, generate
# Model identifier handed to mlx_lm's load(); change it here only.
model_path = "mlx-community/Mistral-7B-Instruct-v0.2-4bit"
# NOTE(review): prompt_builder is not defined in the cells visible here;
# presumably it comes from another cell. Verify it exists on a fresh kernel.
prompt = prompt_builder("Great content, thank you!")
max_tokens = 140

# Use model_path instead of repeating the literal, so that editing the
# variable above actually changes which model is loaded.
model, tokenizer = load(model_path)
response = generate(model, tokenizer, prompt=prompt, max_tokens=max_tokens, verbose=True)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from urllib.parse import urljoin

# Step 1: Web Crawler to Scrape Links and Sub-links
def crawl_website(base_url, max_depth=2):
    """
    Crawl a website breadth-first and collect the paragraph text of each page.

    Pages are visited level by level, so every page is processed at the
    smallest depth at which it is reachable and its links are still followed
    whenever that depth allows it.

    Args:
        base_url (str): URL where the crawl starts. Only links that begin with
            this URL and have the same network location (host[:port]) are
            followed.
        max_depth (int): Maximum number of link hops from ``base_url``; the
            start page is depth 0. A negative value crawls nothing.

    Returns:
        list[dict]: One ``{"url": ..., "content": ...}`` entry for every page
        that answered with HTTP 200 and had non-empty ``<p>`` text, in visit
        order.
    """
    # Local import keeps this function self-contained with respect to the
    # notebook's import cell, which only brings in urljoin.
    from urllib.parse import urldefrag, urlparse

    visited_urls = set()
    content_data = []
    base_netloc = urlparse(base_url).netloc

    def is_in_scope(link):
        # A plain substring test would also accept off-site URLs that merely
        # mention base_url (e.g. in a query string), so anchor the match at
        # the start and additionally require the same host.
        return link.startswith(base_url) and urlparse(link).netloc == base_netloc

    def scrape_page(url):
        """Fetch one page, record its text, and return its in-scope links."""
        links = []
        print(f"Scraping: {url}")
        try:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                return links

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract text content from all <p> elements
            page_text = " ".join(p.get_text() for p in soup.find_all('p'))
            if page_text.strip():
                content_data.append({"url": url, "content": page_text.strip()})

            for anchor in soup.find_all('a', href=True):
                # Drop "#fragment" so the same page is not fetched twice
                absolute_link = urldefrag(urljoin(url, anchor['href'])).url
                if is_in_scope(absolute_link):
                    links.append(absolute_link)
        except Exception as e:
            print(f"Failed to scrape {url}: {e}")
        return links

    # Level-by-level traversal: `frontier` holds the URLs of the current depth
    frontier = [base_url]
    for _ in range(max_depth + 1):
        next_frontier = []
        for url in frontier:
            if url in visited_urls:
                continue
            visited_urls.add(url)
            next_frontier.extend(scrape_page(url))
        if not next_frontier:
            break
        frontier = next_frontier

    return content_data

# Step 2: Save Data in LLM Training-Friendly Format
def save_data_to_llm_format(data, output_file="website_data.jsonl"):
    """
    Write records to a JSON Lines file, one JSON document per line.

    Each element of ``data`` is serialized with ``json.dump`` and followed by
    a newline. Records produced by the crawler look like:
    {
        "url": "page_url",
        "content": "text_content"
    }

    Args:
        data: Iterable of JSON-serializable records.
        output_file (str): Destination path; the file is overwritten and
            written as UTF-8.

    Returns:
        None
    """
    import json

    with open(output_file, 'w', encoding='utf-8') as sink:
        for record in data:
            # One record per line, each terminated by a newline
            json.dump(record, sink)
            sink.write('\n')

# Example usage
if __name__ == "__main__":
    # Start URL (placeholder) and how many link hops to follow from it
    base_url = "https://example.com"  # Replace with the website you want to scrape
    max_depth = 2  # Define the depth of crawling

    # Crawl the website; the result is a list of {"url", "content"} dicts
    crawled_data = crawl_website(base_url, max_depth=max_depth)
    print(f"Scraped {len(crawled_data)} pages.")

    # Save data in JSONL format. The file name is repeated in the message
    # below, so keep the two in sync when changing it.
    save_data_to_llm_format(crawled_data, output_file="training_data.jsonl")
    print("Data saved in LLM training-friendly format: training_data.jsonl")
