In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Build the Qwen3-30B-A3B architecture from its published config.
# The tokenizer and config are fetched by name (using the cache directory below);
# the model itself is instantiated from the config only, so no pretrained
# checkpoint weights are loaded in this cell.
from transformers import AutoConfig

model_name = "Qwen/Qwen3-30B-A3B"
cache_dir = "/fsx-project/rishabhtiwari/hf_cache"

print(f"Loading model: {model_name}")
print(f"Cache directory: {cache_dir}")

# Tokenizer for the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    trust_remote_code=True
)

# Model configuration (architecture hyper-parameters) for the same checkpoint.
config = AutoConfig.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    trust_remote_code=True
)

# from_config() creates the model described by `config` with freshly
# initialised parameters; use from_pretrained() instead if the trained
# weights are needed.
model = AutoModelForCausalLM.from_config(
    config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Device placement is currently disabled, so the model stays on whatever
# device from_config() created it on.
# model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# Report what was actually built: an initialised (not pretrained) model.
print("Model initialized from config (random weights, no pretrained checkpoint loaded).")
print(f"Model type: {type(model)}")
print(f"Model device: {next(model.parameters()).device}")


In [None]:
from huggingface_hub import hf_hub_download
import json
# Download a single file from the dataset repository and parse it as JSON.
# NOTE(review): "data" looks like a directory name rather than a JSON file in
# this repo -- confirm the intended filename before relying on this cell.
config_path = hf_hub_download(
    repo_id="open-thoughts/OpenThoughts3-1.2M",
    filename="data",
    repo_type="dataset",
    cache_dir="/fsx-project/rishabhtiwari/hf_cache",
)

# Stored under its own name so the model `config` object created earlier in
# the notebook is not overwritten by this dataset metadata.
# JSON is read as UTF-8 explicitly rather than with the platform default encoding.
with open(config_path, 'r', encoding='utf-8') as f:
    dataset_config = json.load(f)

print(dataset_config)

In [None]:
from datasets import load_dataset
# Load OpenThoughts3-1.2M, using the given directory as the download cache.
dataset = load_dataset(
    "open-thoughts/OpenThoughts3-1.2M",
    cache_dir="/fsx-project/rishabhtiwari/hf_cache",
)

In [None]:
import datasets
# Load OpenThoughts3-1.2M through the `datasets` module namespace, then print
# the resulting object's summary.
dataset = datasets.load_dataset(
    "open-thoughts/OpenThoughts3-1.2M",
    cache_dir="/fsx-project/rishabhtiwari/hf_cache",
)
print(dataset)

In [None]:
# Show the "train" split as this cell's output.
dataset["train"]

In [None]:
# Filter the train split by its `domain` column.
train_split = dataset['train']

# First, list which domains are available. Dataset.unique() returns the
# distinct values of a column without first converting the entire column
# into a Python list.
domains = set(train_split.unique('domain'))
print("Available domains:")
for domain in sorted(domains):
    print(f"  - {domain}")

print(f"\nTotal number of domains: {len(domains)}")

# Keep only the rows whose domain appears in this list.
domains_to_keep = ['math']
filtered_dataset = train_split.filter(lambda x: x['domain'] in domains_to_keep)

# Alternative (disabled): filter on the `source` column instead of `domain`.
# Keep a single source (replace 'your_source' with an actual source name):
# filtered_dataset = dataset['train'].filter(lambda x: x['source'] == 'your_source')

# Alternative (disabled): drop specific sources.
# sources_to_exclude = ['source_to_exclude']
# filtered_dataset = dataset['train'].filter(lambda x: x['source'] not in sources_to_exclude)


In [12]:
# Replace the "train" entry of `dataset` with the filtered rows.
# This rebinds the entry on the existing object, so any later use of
# `dataset["train"]` sees only the filtered data.
dataset["train"] = filtered_dataset

In [None]:
# Show the current `dataset` object as this cell's output.
dataset

In [None]:
# Write the current `dataset` object to a fixed directory on disk and confirm.
output_dir = "/fsx-project/rishabhtiwari/datasets/openthoughts_math_filtered"
dataset.save_to_disk(output_dir)
print("Dataset saved successfully!")


In [None]:
# Print a short preview of the first few filtered examples: metadata fields
# followed by each conversation turn, with long turn text truncated.
NUM_PREVIEW = 3          # how many examples to show
MAX_VALUE_CHARS = 200    # characters of each turn's text to show before "..."

# Cap the count at the dataset length so a filter that matched fewer than
# NUM_PREVIEW rows does not raise IndexError.
for i in range(min(NUM_PREVIEW, len(filtered_dataset))):
    print(f"=== Datapoint {i+1} ===")
    example = filtered_dataset[i]
    print(f"Difficulty: {example['difficulty']}")
    print(f"Source: {example['source']}")
    print(f"Domain: {example['domain']}")
    print("Conversations:")
    for j, conv in enumerate(example['conversations'], start=1):
        text = conv['value']
        # Append an ellipsis only when the text was actually cut off.
        suffix = '...' if len(text) > MAX_VALUE_CHARS else ''
        print(f"  {j}. From: {conv['from']}")
        print(f"     Value: {text[:MAX_VALUE_CHARS]}{suffix}")
    print()

