In [1]:
import concurrent.futures
import os
import pickle
import warnings
from collections import Counter

import numpy as np
import pandas as pd
from astroquery.sdss import SDSS
from tqdm import tqdm

In [2]:
# Cell 2: Query SDSS for a random sample of N_OBJECTS spectroscopic objects
# Number of rows requested from SpecObj; lower it for a quick smoke test.
N_OBJECTS = 5000

# The WHERE clause keeps rows whose class is GALAXY, STAR or QSO, with
# zWarning = 0 and snMedian > 5. ORDER BY NEWID() is the T-SQL idiom for a
# random ordering, so every run of this cell returns a different sample.
query = f"""
SELECT TOP {N_OBJECTS}
    s.specobjid, s.plate, s.mjd, s.fiberid, s.class as obj_class,
    s.z as redshift, s.snMedian
FROM SpecObj AS s
WHERE s.class IN ('GALAXY', 'STAR', 'QSO')
    AND s.zWarning = 0
    AND s.snMedian > 5
ORDER BY NEWID()
"""
print("Querying SDSS...")
result = SDSS.query_sql(query)
if result is None:
    # query_sql may hand back None instead of an empty table when nothing
    # matches; fail here with a clear message rather than on .to_pandas().
    raise RuntimeError("SDSS query returned no rows")
df = result.to_pandas()
print(f"✓ Retrieved {len(df)} objects")
print(f"Class distribution:\n{df['obj_class'].value_counts()}")


Querying SDSS...
✓ Retrieved 5000 objects
Class distribution:
obj_class
GALAXY    2678
STAR      1606
QSO        716
Name: count, dtype: int64




In [3]:
# Successfully downloaded spectra, one dict per object (filled by the download loop).
spectra_list = []

def fetch_spectrum(row):
    """Download a single SDSS spectrum and return it as a plain dict.

    Parameters
    ----------
    row : mapping
        Must provide 'plate', 'mjd', 'fiberid', 'obj_class' and 'redshift'
        (a row of the metadata DataFrame, or any dict with those keys).

    Returns
    -------
    dict
        On success: 'wavelength', 'flux', 'class', 'redshift', 'plate',
        'mjd' and 'fiberid'. 'wavelength' is 10 raised to the 'loglam'
        column of the second HDU; 'flux' is that HDU's 'flux' column.
        On failure: 'error' (the exception message) plus 'plate', 'mjd'
        and 'fiberid' so the failing object can be identified.

    The function never raises: every exception is converted into the
    error dict, which keeps it safe to run inside a thread pool.
    """
    hdulists = None
    try:
        hdulists = SDSS.get_spectra(plate=row['plate'], mjd=row['mjd'], fiberID=row['fiberid'])
        if not hdulists:
            # Empty/None result: report it explicitly instead of failing on [0].
            raise LookupError("no spectrum returned by SDSS")

        table = hdulists[0][1].data
        # Copy the columns so the result no longer references the FITS-backed
        # data and the underlying file can be released when we close below.
        flux = np.array(table['flux'])
        loglam = np.array(table['loglam'])
        wavelength = 10**loglam

        outcome = {
            'wavelength': wavelength,
            'flux': flux,
            'class': row['obj_class'],
            'redshift': row['redshift'],
            'plate': row['plate'],
            'mjd': row['mjd'],
            'fiberid': row['fiberid']
        }
    except Exception as e:
        outcome = {'error': str(e), 'plate': row['plate'], 'mjd': row['mjd'], 'fiberid': row['fiberid']}
    finally:
        # Close every returned HDU list; with thousands of downloads, leaving
        # them open accumulates file handles.
        for hdul in hdulists or ():
            try:
                hdul.close()
            except Exception:
                # Cleanup is best-effort and must not break the never-raises contract.
                pass
    return outcome

# Run in parallel
# Start from empty lists so re-running this cell never appends duplicates.
spectra_list = []
failed_list = []  # error dicts returned by fetch_spectrum, kept for inspection/retry

with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    futures = [executor.submit(fetch_spectrum, row) for _, row in df.iterrows()]
    # as_completed yields in completion order, so spectra_list is NOT in the
    # same order as df; use plate/mjd/fiberid to match entries back to rows.
    for f in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Downloading spectra"):
        res = f.result()
        if 'error' in res:
            failed_list.append(res)
        else:
            spectra_list.append(res)

print(f"\n{'='*60}")
print(f"DOWNLOAD COMPLETE: {len(spectra_list)}/{len(df)} spectra successful")
print(f"{'='*60}")
if failed_list:
    # Surface failures instead of dropping them silently.
    print(f"{len(failed_list)} downloads failed (see failed_list); first error: {failed_list[0]['error']}")

Downloading spectra: 100%|██████████| 5000/5000 [46:37<00:00,  1.79it/s]  


DOWNLOAD COMPLETE: 4950/5000 spectra successful





In [None]:
# Cell 4: Save data
# Create the output folder first so a fresh checkout does not fail on open().
os.makedirs('data', exist_ok=True)
with open('data/sdss_spectra.pkl', 'wb') as f:
    pickle.dump(spectra_list, f)
# NOTE(review): df contains every queried object in query order, whereas
# spectra_list only holds the successful downloads in completion order.
# Match the two on (plate, mjd, fiberid), not on position.
df.to_csv('data/sdss_metadata.csv', index=False)
print("✓ Saved to sdss_spectra.pkl and sdss_metadata.csv")
print("\nFinal class counts:")
classes = [s['class'] for s in spectra_list]
# Single pass over the list; sorted so the report order is stable between runs.
for c, n in sorted(Counter(classes).items()):
    print(f"  {c}: {n}")

✓ Saved to sdss_spectra.pkl and sdss_metadata.csv

Final class counts:
  GALAXY: 2657
  QSO: 712
  STAR: 1581


In [1]:
import numpy as np
import os

In [5]:
# Read the full spectra array back from disk.
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files you produced yourself.
spectra = np.load('../data/sdss_spectra.npy', allow_pickle=True)
print("Total entries:", len(spectra))

# Cut the array at its midpoint (integer division, so for an odd length the
# second part gets the extra element).
mid = len(spectra) // 2
part1, part2 = spectra[:mid], spectra[mid:]

# Destination file for each half, keyed by the label used in the size report.
part_paths = {
    "part1": '../data/sdss_spectra_part1.npy',
    "part2": '../data/sdss_spectra_part2.npy',
}

# Write both halves first, then report their on-disk sizes in megabytes (1e6 bytes).
for label, chunk in (("part1", part1), ("part2", part2)):
    np.save(part_paths[label], chunk)

for label, path in part_paths.items():
    print(label + ":", os.path.getsize(path) / 1e6, "MB")

Total entries: 4950
part1: 81.735345 MB
part2: 81.771625 MB
